From 141e9d4b5492499c4735d764b599c21e83dac154 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 11:57:48 -0500
Subject: Move dmapool.c to mm/ directory

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 drivers/base/Makefile  |   2 +-
 drivers/base/dmapool.c | 481 -------------------------------------------------
 mm/Makefile            |   1 +
 mm/dmapool.c           | 481 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 483 insertions(+), 482 deletions(-)
 delete mode 100644 drivers/base/dmapool.c
 create mode 100644 mm/dmapool.c

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b39ea3f..ed0a722 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -5,7 +5,7 @@ obj-y			:= core.o sys.o bus.o dd.o \
 			   cpu.o firmware.o init.o map.o devres.o \
 			   attribute_container.o transport_class.o
 obj-y			+= power/
-obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o dmapool.o
+obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_ISA)	+= isa.o
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
deleted file mode 100644
index b5034dc..0000000
--- a/drivers/base/dmapool.c
+++ /dev/null
@@ -1,481 +0,0 @@
-
-#include <linux/device.h>
-#include <linux/mm.h>
-#include <asm/io.h>		/* Needed for i386 to build */
-#include <linux/dma-mapping.h>
-#include <linux/dmapool.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/poison.h>
-#include <linux/sched.h>
-
-/*
- * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
- * small blocks are easily used by drivers for bus mastering controllers.
- * This should probably be sharing the guts of the slab allocator.
- */
-
-struct dma_pool {	/* the pool */
-	struct list_head	page_list;
-	spinlock_t		lock;
-	size_t			blocks_per_page;
-	size_t			size;
-	struct device		*dev;
-	size_t			allocation;
-	char			name [32];
-	wait_queue_head_t	waitq;
-	struct list_head	pools;
-};
-
-struct dma_page {	/* cacheable header for 'allocation' bytes */
-	struct list_head	page_list;
-	void			*vaddr;
-	dma_addr_t		dma;
-	unsigned		in_use;
-	unsigned long		bitmap [0];
-};
-
-#define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
-
-static DEFINE_MUTEX (pools_lock);
-
-static ssize_t
-show_pools (struct device *dev, struct device_attribute *attr, char *buf)
-{
-	unsigned temp;
-	unsigned size;
-	char *next;
-	struct dma_page *page;
-	struct dma_pool *pool;
-
-	next = buf;
-	size = PAGE_SIZE;
-
-	temp = scnprintf(next, size, "poolinfo - 0.1\n");
-	size -= temp;
-	next += temp;
-
-	mutex_lock(&pools_lock);
-	list_for_each_entry(pool, &dev->dma_pools, pools) {
-		unsigned pages = 0;
-		unsigned blocks = 0;
-
-		list_for_each_entry(page, &pool->page_list, page_list) {
-			pages++;
-			blocks += page->in_use;
-		}
-
-		/* per-pool info, no real statistics yet */
-		temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
-				pool->name,
-				blocks, pages * pool->blocks_per_page,
-				pool->size, pages);
-		size -= temp;
-		next += temp;
-	}
-	mutex_unlock(&pools_lock);
-
-	return PAGE_SIZE - size;
-}
-static DEVICE_ATTR (pools, S_IRUGO, show_pools, NULL);
-
-/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
- * @name: name of pool, for diagnostics
- * @dev: device that will be doing the DMA
- * @size: size of the blocks in this pool.
- * @align: alignment requirement for blocks; must be a power of two
- * @allocation: returned blocks won't cross this boundary (or zero)
- * Context: !in_interrupt()
- *
- * Returns a dma allocation pool with the requested characteristics, or
- * null if one can't be created.  Given one of these pools, dma_pool_alloc()
- * may be used to allocate memory.  Such memory will all have "consistent"
- * DMA mappings, accessible by the device and its driver without using
- * cache flushing primitives.  The actual size of blocks allocated may be
- * larger than requested because of alignment.
- *
- * If allocation is nonzero, objects returned from dma_pool_alloc() won't
- * cross that size boundary.  This is useful for devices which have
- * addressing restrictions on individual DMA transfers, such as not crossing
- * boundaries of 4KBytes.
- */
-struct dma_pool *
-dma_pool_create (const char *name, struct device *dev,
-	size_t size, size_t align, size_t allocation)
-{
-	struct dma_pool		*retval;
-
-	if (align == 0)
-		align = 1;
-	if (size == 0)
-		return NULL;
-	else if (size < align)
-		size = align;
-	else if ((size % align) != 0) {
-		size += align + 1;
-		size &= ~(align - 1);
-	}
-
-	if (allocation == 0) {
-		if (PAGE_SIZE < size)
-			allocation = size;
-		else
-			allocation = PAGE_SIZE;
-		// FIXME: round up for less fragmentation
-	} else if (allocation < size)
-		return NULL;
-
-	if (!(retval = kmalloc_node (sizeof *retval, GFP_KERNEL, dev_to_node(dev))))
-		return retval;
-
-	strlcpy (retval->name, name, sizeof retval->name);
-
-	retval->dev = dev;
-
-	INIT_LIST_HEAD (&retval->page_list);
-	spin_lock_init (&retval->lock);
-	retval->size = size;
-	retval->allocation = allocation;
-	retval->blocks_per_page = allocation / size;
-	init_waitqueue_head (&retval->waitq);
-
-	if (dev) {
-		int ret;
-
-		mutex_lock(&pools_lock);
-		if (list_empty (&dev->dma_pools))
-			ret = device_create_file (dev, &dev_attr_pools);
-		else
-			ret = 0;
-		/* note:  not currently insisting "name" be unique */
-		if (!ret)
-			list_add (&retval->pools, &dev->dma_pools);
-		else {
-			kfree(retval);
-			retval = NULL;
-		}
-		mutex_unlock(&pools_lock);
-	} else
-		INIT_LIST_HEAD (&retval->pools);
-
-	return retval;
-}
-
-
-static struct dma_page *
-pool_alloc_page (struct dma_pool *pool, gfp_t mem_flags)
-{
-	struct dma_page	*page;
-	int		mapsize;
-
-	mapsize = pool->blocks_per_page;
-	mapsize = (mapsize + BITS_PER_LONG - 1) / BITS_PER_LONG;
-	mapsize *= sizeof (long);
-
-	page = kmalloc(mapsize + sizeof *page, mem_flags);
-	if (!page)
-		return NULL;
-	page->vaddr = dma_alloc_coherent (pool->dev,
-					    pool->allocation,
-					    &page->dma,
-					    mem_flags);
-	if (page->vaddr) {
-		memset (page->bitmap, 0xff, mapsize);	// bit set == free
-#ifdef	CONFIG_DEBUG_SLAB
-		memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
-#endif
-		list_add (&page->page_list, &pool->page_list);
-		page->in_use = 0;
-	} else {
-		kfree (page);
-		page = NULL;
-	}
-	return page;
-}
-
-
-static inline int
-is_page_busy (int blocks, unsigned long *bitmap)
-{
-	while (blocks > 0) {
-		if (*bitmap++ != ~0UL)
-			return 1;
-		blocks -= BITS_PER_LONG;
-	}
-	return 0;
-}
-
-static void
-pool_free_page (struct dma_pool *pool, struct dma_page *page)
-{
-	dma_addr_t	dma = page->dma;
-
-#ifdef	CONFIG_DEBUG_SLAB
-	memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
-#endif
-	dma_free_coherent (pool->dev, pool->allocation, page->vaddr, dma);
-	list_del (&page->page_list);
-	kfree (page);
-}
-
-
-/**
- * dma_pool_destroy - destroys a pool of dma memory blocks.
- * @pool: dma pool that will be destroyed
- * Context: !in_interrupt()
- *
- * Caller guarantees that no more memory from the pool is in use,
- * and that nothing will try to use the pool after this call.
- */
-void
-dma_pool_destroy (struct dma_pool *pool)
-{
-	mutex_lock(&pools_lock);
-	list_del (&pool->pools);
-	if (pool->dev && list_empty (&pool->dev->dma_pools))
-		device_remove_file (pool->dev, &dev_attr_pools);
-	mutex_unlock(&pools_lock);
-
-	while (!list_empty (&pool->page_list)) {
-		struct dma_page		*page;
-		page = list_entry (pool->page_list.next,
-				struct dma_page, page_list);
-		if (is_page_busy (pool->blocks_per_page, page->bitmap)) {
-			if (pool->dev)
-				dev_err(pool->dev, "dma_pool_destroy %s, %p busy\n",
-					pool->name, page->vaddr);
-			else
-				printk (KERN_ERR "dma_pool_destroy %s, %p busy\n",
-					pool->name, page->vaddr);
-			/* leak the still-in-use consistent memory */
-			list_del (&page->page_list);
-			kfree (page);
-		} else
-			pool_free_page (pool, page);
-	}
-
-	kfree (pool);
-}
-
-
-/**
- * dma_pool_alloc - get a block of consistent memory
- * @pool: dma pool that will produce the block
- * @mem_flags: GFP_* bitmask
- * @handle: pointer to dma address of block
- *
- * This returns the kernel virtual address of a currently unused block,
- * and reports its dma address through the handle.
- * If such a memory block can't be allocated, null is returned.
- */
-void *
-dma_pool_alloc (struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
-{
-	unsigned long		flags;
-	struct dma_page		*page;
-	int			map, block;
-	size_t			offset;
-	void			*retval;
-
-restart:
-	spin_lock_irqsave (&pool->lock, flags);
-	list_for_each_entry(page, &pool->page_list, page_list) {
-		int		i;
-		/* only cachable accesses here ... */
-		for (map = 0, i = 0;
-				i < pool->blocks_per_page;
-				i += BITS_PER_LONG, map++) {
-			if (page->bitmap [map] == 0)
-				continue;
-			block = ffz (~ page->bitmap [map]);
-			if ((i + block) < pool->blocks_per_page) {
-				clear_bit (block, &page->bitmap [map]);
-				offset = (BITS_PER_LONG * map) + block;
-				offset *= pool->size;
-				goto ready;
-			}
-		}
-	}
-	if (!(page = pool_alloc_page (pool, GFP_ATOMIC))) {
-		if (mem_flags & __GFP_WAIT) {
-			DECLARE_WAITQUEUE (wait, current);
-
-			__set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue (&pool->waitq, &wait);
-			spin_unlock_irqrestore (&pool->lock, flags);
-
-			schedule_timeout (POOL_TIMEOUT_JIFFIES);
-
-			remove_wait_queue (&pool->waitq, &wait);
-			goto restart;
-		}
-		retval = NULL;
-		goto done;
-	}
-
-	clear_bit (0, &page->bitmap [0]);
-	offset = 0;
-ready:
-	page->in_use++;
-	retval = offset + page->vaddr;
-	*handle = offset + page->dma;
-#ifdef	CONFIG_DEBUG_SLAB
-	memset (retval, POOL_POISON_ALLOCATED, pool->size);
-#endif
-done:
-	spin_unlock_irqrestore (&pool->lock, flags);
-	return retval;
-}
-
-
-static struct dma_page *
-pool_find_page (struct dma_pool *pool, dma_addr_t dma)
-{
-	unsigned long		flags;
-	struct dma_page		*page;
-
-	spin_lock_irqsave (&pool->lock, flags);
-	list_for_each_entry(page, &pool->page_list, page_list) {
-		if (dma < page->dma)
-			continue;
-		if (dma < (page->dma + pool->allocation))
-			goto done;
-	}
-	page = NULL;
-done:
-	spin_unlock_irqrestore (&pool->lock, flags);
-	return page;
-}
-
-
-/**
- * dma_pool_free - put block back into dma pool
- * @pool: the dma pool holding the block
- * @vaddr: virtual address of block
- * @dma: dma address of block
- *
- * Caller promises neither device nor driver will again touch this block
- * unless it is first re-allocated.
- */
-void
-dma_pool_free (struct dma_pool *pool, void *vaddr, dma_addr_t dma)
-{
-	struct dma_page		*page;
-	unsigned long		flags;
-	int			map, block;
-
-	if ((page = pool_find_page(pool, dma)) == NULL) {
-		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n",
-				pool->name, vaddr, (unsigned long) dma);
-		else
-			printk (KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
-				pool->name, vaddr, (unsigned long) dma);
-		return;
-	}
-
-	block = dma - page->dma;
-	block /= pool->size;
-	map = block / BITS_PER_LONG;
-	block %= BITS_PER_LONG;
-
-#ifdef	CONFIG_DEBUG_SLAB
-	if (((dma - page->dma) + (void *)page->vaddr) != vaddr) {
-		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
-				pool->name, vaddr, (unsigned long long) dma);
-		else
-			printk (KERN_ERR "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
-				pool->name, vaddr, (unsigned long long) dma);
-		return;
-	}
-	if (page->bitmap [map] & (1UL << block)) {
-		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
-				pool->name, (unsigned long long)dma);
-		else
-			printk (KERN_ERR "dma_pool_free %s, dma %Lx already free\n",
-				pool->name, (unsigned long long)dma);
-		return;
-	}
-	memset (vaddr, POOL_POISON_FREED, pool->size);
-#endif
-
-	spin_lock_irqsave (&pool->lock, flags);
-	page->in_use--;
-	set_bit (block, &page->bitmap [map]);
-	if (waitqueue_active (&pool->waitq))
-		wake_up (&pool->waitq);
-	/*
-	 * Resist a temptation to do
-	 *    if (!is_page_busy(bpp, page->bitmap)) pool_free_page(pool, page);
-	 * Better have a few empty pages hang around.
-	 */
-	spin_unlock_irqrestore (&pool->lock, flags);
-}
-
-/*
- * Managed DMA pool
- */
-static void dmam_pool_release(struct device *dev, void *res)
-{
-	struct dma_pool *pool = *(struct dma_pool **)res;
-
-	dma_pool_destroy(pool);
-}
-
-static int dmam_pool_match(struct device *dev, void *res, void *match_data)
-{
-	return *(struct dma_pool **)res == match_data;
-}
-
-/**
- * dmam_pool_create - Managed dma_pool_create()
- * @name: name of pool, for diagnostics
- * @dev: device that will be doing the DMA
- * @size: size of the blocks in this pool.
- * @align: alignment requirement for blocks; must be a power of two
- * @allocation: returned blocks won't cross this boundary (or zero)
- *
- * Managed dma_pool_create().  DMA pool created with this function is
- * automatically destroyed on driver detach.
- */
-struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
-				  size_t size, size_t align, size_t allocation)
-{
-	struct dma_pool **ptr, *pool;
-
-	ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
-	if (!ptr)
-		return NULL;
-
-	pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
-	if (pool)
-		devres_add(dev, ptr);
-	else
-		devres_free(ptr);
-
-	return pool;
-}
-
-/**
- * dmam_pool_destroy - Managed dma_pool_destroy()
- * @pool: dma pool that will be destroyed
- *
- * Managed dma_pool_destroy().
- */
-void dmam_pool_destroy(struct dma_pool *pool)
-{
-	struct device *dev = pool->dev;
-
-	dma_pool_destroy(pool);
-	WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
-}
-
-EXPORT_SYMBOL (dma_pool_create);
-EXPORT_SYMBOL (dma_pool_destroy);
-EXPORT_SYMBOL (dma_pool_alloc);
-EXPORT_SYMBOL (dma_pool_free);
-EXPORT_SYMBOL (dmam_pool_create);
-EXPORT_SYMBOL (dmam_pool_destroy);
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea..e222cc5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 0000000..b5034dc
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,481 @@
+
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <asm/io.h>		/* Needed for i386 to build */
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+
+/*
+ * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
+ * small blocks are easily used by drivers for bus mastering controllers.
+ * This should probably be sharing the guts of the slab allocator.
+ */
+
+struct dma_pool {	/* the pool */
+	struct list_head	page_list;
+	spinlock_t		lock;
+	size_t			blocks_per_page;
+	size_t			size;
+	struct device		*dev;
+	size_t			allocation;
+	char			name [32];
+	wait_queue_head_t	waitq;
+	struct list_head	pools;
+};
+
+struct dma_page {	/* cacheable header for 'allocation' bytes */
+	struct list_head	page_list;
+	void			*vaddr;
+	dma_addr_t		dma;
+	unsigned		in_use;
+	unsigned long		bitmap [0];
+};
+
+#define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
+
+static DEFINE_MUTEX (pools_lock);
+
+static ssize_t
+show_pools (struct device *dev, struct device_attribute *attr, char *buf)
+{
+	unsigned temp;
+	unsigned size;
+	char *next;
+	struct dma_page *page;
+	struct dma_pool *pool;
+
+	next = buf;
+	size = PAGE_SIZE;
+
+	temp = scnprintf(next, size, "poolinfo - 0.1\n");
+	size -= temp;
+	next += temp;
+
+	mutex_lock(&pools_lock);
+	list_for_each_entry(pool, &dev->dma_pools, pools) {
+		unsigned pages = 0;
+		unsigned blocks = 0;
+
+		list_for_each_entry(page, &pool->page_list, page_list) {
+			pages++;
+			blocks += page->in_use;
+		}
+
+		/* per-pool info, no real statistics yet */
+		temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+				pool->name,
+				blocks, pages * pool->blocks_per_page,
+				pool->size, pages);
+		size -= temp;
+		next += temp;
+	}
+	mutex_unlock(&pools_lock);
+
+	return PAGE_SIZE - size;
+}
+static DEVICE_ATTR (pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created.  Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory.  Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives.  The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If allocation is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary.  This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ */
+struct dma_pool *
+dma_pool_create (const char *name, struct device *dev,
+	size_t size, size_t align, size_t allocation)
+{
+	struct dma_pool		*retval;
+
+	if (align == 0)
+		align = 1;
+	if (size == 0)
+		return NULL;
+	else if (size < align)
+		size = align;
+	else if ((size % align) != 0) {
+		size += align + 1;
+		size &= ~(align - 1);
+	}
+
+	if (allocation == 0) {
+		if (PAGE_SIZE < size)
+			allocation = size;
+		else
+			allocation = PAGE_SIZE;
+		// FIXME: round up for less fragmentation
+	} else if (allocation < size)
+		return NULL;
+
+	if (!(retval = kmalloc_node (sizeof *retval, GFP_KERNEL, dev_to_node(dev))))
+		return retval;
+
+	strlcpy (retval->name, name, sizeof retval->name);
+
+	retval->dev = dev;
+
+	INIT_LIST_HEAD (&retval->page_list);
+	spin_lock_init (&retval->lock);
+	retval->size = size;
+	retval->allocation = allocation;
+	retval->blocks_per_page = allocation / size;
+	init_waitqueue_head (&retval->waitq);
+
+	if (dev) {
+		int ret;
+
+		mutex_lock(&pools_lock);
+		if (list_empty (&dev->dma_pools))
+			ret = device_create_file (dev, &dev_attr_pools);
+		else
+			ret = 0;
+		/* note:  not currently insisting "name" be unique */
+		if (!ret)
+			list_add (&retval->pools, &dev->dma_pools);
+		else {
+			kfree(retval);
+			retval = NULL;
+		}
+		mutex_unlock(&pools_lock);
+	} else
+		INIT_LIST_HEAD (&retval->pools);
+
+	return retval;
+}
+
+
+static struct dma_page *
+pool_alloc_page (struct dma_pool *pool, gfp_t mem_flags)
+{
+	struct dma_page	*page;
+	int		mapsize;
+
+	mapsize = pool->blocks_per_page;
+	mapsize = (mapsize + BITS_PER_LONG - 1) / BITS_PER_LONG;
+	mapsize *= sizeof (long);
+
+	page = kmalloc(mapsize + sizeof *page, mem_flags);
+	if (!page)
+		return NULL;
+	page->vaddr = dma_alloc_coherent (pool->dev,
+					    pool->allocation,
+					    &page->dma,
+					    mem_flags);
+	if (page->vaddr) {
+		memset (page->bitmap, 0xff, mapsize);	// bit set == free
+#ifdef	CONFIG_DEBUG_SLAB
+		memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+		list_add (&page->page_list, &pool->page_list);
+		page->in_use = 0;
+	} else {
+		kfree (page);
+		page = NULL;
+	}
+	return page;
+}
+
+
+static inline int
+is_page_busy (int blocks, unsigned long *bitmap)
+{
+	while (blocks > 0) {
+		if (*bitmap++ != ~0UL)
+			return 1;
+		blocks -= BITS_PER_LONG;
+	}
+	return 0;
+}
+
+static void
+pool_free_page (struct dma_pool *pool, struct dma_page *page)
+{
+	dma_addr_t	dma = page->dma;
+
+#ifdef	CONFIG_DEBUG_SLAB
+	memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	dma_free_coherent (pool->dev, pool->allocation, page->vaddr, dma);
+	list_del (&page->page_list);
+	kfree (page);
+}
+
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ */
+void
+dma_pool_destroy (struct dma_pool *pool)
+{
+	mutex_lock(&pools_lock);
+	list_del (&pool->pools);
+	if (pool->dev && list_empty (&pool->dev->dma_pools))
+		device_remove_file (pool->dev, &dev_attr_pools);
+	mutex_unlock(&pools_lock);
+
+	while (!list_empty (&pool->page_list)) {
+		struct dma_page		*page;
+		page = list_entry (pool->page_list.next,
+				struct dma_page, page_list);
+		if (is_page_busy (pool->blocks_per_page, page->bitmap)) {
+			if (pool->dev)
+				dev_err(pool->dev, "dma_pool_destroy %s, %p busy\n",
+					pool->name, page->vaddr);
+			else
+				printk (KERN_ERR "dma_pool_destroy %s, %p busy\n",
+					pool->name, page->vaddr);
+			/* leak the still-in-use consistent memory */
+			list_del (&page->page_list);
+			kfree (page);
+		} else
+			pool_free_page (pool, page);
+	}
+
+	kfree (pool);
+}
+
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, null is returned.
+ */
+void *
+dma_pool_alloc (struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
+{
+	unsigned long		flags;
+	struct dma_page		*page;
+	int			map, block;
+	size_t			offset;
+	void			*retval;
+
+restart:
+	spin_lock_irqsave (&pool->lock, flags);
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		int		i;
+		/* only cachable accesses here ... */
+		for (map = 0, i = 0;
+				i < pool->blocks_per_page;
+				i += BITS_PER_LONG, map++) {
+			if (page->bitmap [map] == 0)
+				continue;
+			block = ffz (~ page->bitmap [map]);
+			if ((i + block) < pool->blocks_per_page) {
+				clear_bit (block, &page->bitmap [map]);
+				offset = (BITS_PER_LONG * map) + block;
+				offset *= pool->size;
+				goto ready;
+			}
+		}
+	}
+	if (!(page = pool_alloc_page (pool, GFP_ATOMIC))) {
+		if (mem_flags & __GFP_WAIT) {
+			DECLARE_WAITQUEUE (wait, current);
+
+			__set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue (&pool->waitq, &wait);
+			spin_unlock_irqrestore (&pool->lock, flags);
+
+			schedule_timeout (POOL_TIMEOUT_JIFFIES);
+
+			remove_wait_queue (&pool->waitq, &wait);
+			goto restart;
+		}
+		retval = NULL;
+		goto done;
+	}
+
+	clear_bit (0, &page->bitmap [0]);
+	offset = 0;
+ready:
+	page->in_use++;
+	retval = offset + page->vaddr;
+	*handle = offset + page->dma;
+#ifdef	CONFIG_DEBUG_SLAB
+	memset (retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+done:
+	spin_unlock_irqrestore (&pool->lock, flags);
+	return retval;
+}
+
+
+static struct dma_page *
+pool_find_page (struct dma_pool *pool, dma_addr_t dma)
+{
+	unsigned long		flags;
+	struct dma_page		*page;
+
+	spin_lock_irqsave (&pool->lock, flags);
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (dma < page->dma)
+			continue;
+		if (dma < (page->dma + pool->allocation))
+			goto done;
+	}
+	page = NULL;
+done:
+	spin_unlock_irqrestore (&pool->lock, flags);
+	return page;
+}
+
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ */
+void
+dma_pool_free (struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+	struct dma_page		*page;
+	unsigned long		flags;
+	int			map, block;
+
+	if ((page = pool_find_page(pool, dma)) == NULL) {
+		if (pool->dev)
+			dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n",
+				pool->name, vaddr, (unsigned long) dma);
+		else
+			printk (KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+				pool->name, vaddr, (unsigned long) dma);
+		return;
+	}
+
+	block = dma - page->dma;
+	block /= pool->size;
+	map = block / BITS_PER_LONG;
+	block %= BITS_PER_LONG;
+
+#ifdef	CONFIG_DEBUG_SLAB
+	if (((dma - page->dma) + (void *)page->vaddr) != vaddr) {
+		if (pool->dev)
+			dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+				pool->name, vaddr, (unsigned long long) dma);
+		else
+			printk (KERN_ERR "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+				pool->name, vaddr, (unsigned long long) dma);
+		return;
+	}
+	if (page->bitmap [map] & (1UL << block)) {
+		if (pool->dev)
+			dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+				pool->name, (unsigned long long)dma);
+		else
+			printk (KERN_ERR "dma_pool_free %s, dma %Lx already free\n",
+				pool->name, (unsigned long long)dma);
+		return;
+	}
+	memset (vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+	spin_lock_irqsave (&pool->lock, flags);
+	page->in_use--;
+	set_bit (block, &page->bitmap [map]);
+	if (waitqueue_active (&pool->waitq))
+		wake_up (&pool->waitq);
+	/*
+	 * Resist a temptation to do
+	 *    if (!is_page_busy(bpp, page->bitmap)) pool_free_page(pool, page);
+	 * Better have a few empty pages hang around.
+	 */
+	spin_unlock_irqrestore (&pool->lock, flags);
+}
+
+/*
+ * Managed DMA pool
+ */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+	struct dma_pool *pool = *(struct dma_pool **)res;
+
+	dma_pool_destroy(pool);
+}
+
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+	return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create().  DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+				  size_t size, size_t align, size_t allocation)
+{
+	struct dma_pool **ptr, *pool;
+
+	ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return NULL;
+
+	pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
+	if (pool)
+		devres_add(dev, ptr);
+	else
+		devres_free(ptr);
+
+	return pool;
+}
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+	struct device *dev = pool->dev;
+
+	dma_pool_destroy(pool);
+	WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+
+EXPORT_SYMBOL (dma_pool_create);
+EXPORT_SYMBOL (dma_pool_destroy);
+EXPORT_SYMBOL (dma_pool_alloc);
+EXPORT_SYMBOL (dma_pool_free);
+EXPORT_SYMBOL (dmam_pool_create);
+EXPORT_SYMBOL (dmam_pool_destroy);
-- 
cgit v1.1


From e87aa773747fb5e4217d716ea22a573c03b6693a Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 12:04:31 -0500
Subject: dmapool: Fix style problems

Run Lindent and fix all issues reported by checkpatch.pl

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 mm/dmapool.c | 288 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 142 insertions(+), 146 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index b5034dc..92e886d 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -15,32 +15,32 @@
  * This should probably be sharing the guts of the slab allocator.
  */
 
-struct dma_pool {	/* the pool */
-	struct list_head	page_list;
-	spinlock_t		lock;
-	size_t			blocks_per_page;
-	size_t			size;
-	struct device		*dev;
-	size_t			allocation;
-	char			name [32];
-	wait_queue_head_t	waitq;
-	struct list_head	pools;
+struct dma_pool {		/* the pool */
+	struct list_head page_list;
+	spinlock_t lock;
+	size_t blocks_per_page;
+	size_t size;
+	struct device *dev;
+	size_t allocation;
+	char name[32];
+	wait_queue_head_t waitq;
+	struct list_head pools;
 };
 
-struct dma_page {	/* cacheable header for 'allocation' bytes */
-	struct list_head	page_list;
-	void			*vaddr;
-	dma_addr_t		dma;
-	unsigned		in_use;
-	unsigned long		bitmap [0];
+struct dma_page {		/* cacheable header for 'allocation' bytes */
+	struct list_head page_list;
+	void *vaddr;
+	dma_addr_t dma;
+	unsigned in_use;
+	unsigned long bitmap[0];
 };
 
 #define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
 
-static DEFINE_MUTEX (pools_lock);
+static DEFINE_MUTEX(pools_lock);
 
 static ssize_t
-show_pools (struct device *dev, struct device_attribute *attr, char *buf)
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	unsigned temp;
 	unsigned size;
@@ -67,9 +67,9 @@ show_pools (struct device *dev, struct device_attribute *attr, char *buf)
 
 		/* per-pool info, no real statistics yet */
 		temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
-				pool->name,
-				blocks, pages * pool->blocks_per_page,
-				pool->size, pages);
+				 pool->name,
+				 blocks, pages * pool->blocks_per_page,
+				 pool->size, pages);
 		size -= temp;
 		next += temp;
 	}
@@ -77,7 +77,8 @@ show_pools (struct device *dev, struct device_attribute *attr, char *buf)
 
 	return PAGE_SIZE - size;
 }
-static DEVICE_ATTR (pools, S_IRUGO, show_pools, NULL);
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
 
 /**
  * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
@@ -100,11 +101,10 @@ static DEVICE_ATTR (pools, S_IRUGO, show_pools, NULL);
  * addressing restrictions on individual DMA transfers, such as not crossing
  * boundaries of 4KBytes.
  */
-struct dma_pool *
-dma_pool_create (const char *name, struct device *dev,
-	size_t size, size_t align, size_t allocation)
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+				 size_t size, size_t align, size_t allocation)
 {
-	struct dma_pool		*retval;
+	struct dma_pool *retval;
 
 	if (align == 0)
 		align = 1;
@@ -122,81 +122,79 @@ dma_pool_create (const char *name, struct device *dev,
 			allocation = size;
 		else
 			allocation = PAGE_SIZE;
-		// FIXME: round up for less fragmentation
+		/* FIXME: round up for less fragmentation */
 	} else if (allocation < size)
 		return NULL;
 
-	if (!(retval = kmalloc_node (sizeof *retval, GFP_KERNEL, dev_to_node(dev))))
+	if (!
+	    (retval =
+	     kmalloc_node(sizeof *retval, GFP_KERNEL, dev_to_node(dev))))
 		return retval;
 
-	strlcpy (retval->name, name, sizeof retval->name);
+	strlcpy(retval->name, name, sizeof retval->name);
 
 	retval->dev = dev;
 
-	INIT_LIST_HEAD (&retval->page_list);
-	spin_lock_init (&retval->lock);
+	INIT_LIST_HEAD(&retval->page_list);
+	spin_lock_init(&retval->lock);
 	retval->size = size;
 	retval->allocation = allocation;
 	retval->blocks_per_page = allocation / size;
-	init_waitqueue_head (&retval->waitq);
+	init_waitqueue_head(&retval->waitq);
 
 	if (dev) {
 		int ret;
 
 		mutex_lock(&pools_lock);
-		if (list_empty (&dev->dma_pools))
-			ret = device_create_file (dev, &dev_attr_pools);
+		if (list_empty(&dev->dma_pools))
+			ret = device_create_file(dev, &dev_attr_pools);
 		else
 			ret = 0;
 		/* note:  not currently insisting "name" be unique */
 		if (!ret)
-			list_add (&retval->pools, &dev->dma_pools);
+			list_add(&retval->pools, &dev->dma_pools);
 		else {
 			kfree(retval);
 			retval = NULL;
 		}
 		mutex_unlock(&pools_lock);
 	} else
-		INIT_LIST_HEAD (&retval->pools);
+		INIT_LIST_HEAD(&retval->pools);
 
 	return retval;
 }
+EXPORT_SYMBOL(dma_pool_create);
 
-
-static struct dma_page *
-pool_alloc_page (struct dma_pool *pool, gfp_t mem_flags)
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 {
-	struct dma_page	*page;
-	int		mapsize;
+	struct dma_page *page;
+	int mapsize;
 
 	mapsize = pool->blocks_per_page;
 	mapsize = (mapsize + BITS_PER_LONG - 1) / BITS_PER_LONG;
-	mapsize *= sizeof (long);
+	mapsize *= sizeof(long);
 
 	page = kmalloc(mapsize + sizeof *page, mem_flags);
 	if (!page)
 		return NULL;
-	page->vaddr = dma_alloc_coherent (pool->dev,
-					    pool->allocation,
-					    &page->dma,
-					    mem_flags);
+	page->vaddr = dma_alloc_coherent(pool->dev,
+					 pool->allocation,
+					 &page->dma, mem_flags);
 	if (page->vaddr) {
-		memset (page->bitmap, 0xff, mapsize);	// bit set == free
+		memset(page->bitmap, 0xff, mapsize);	/* bit set == free */
 #ifdef	CONFIG_DEBUG_SLAB
-		memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
+		memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
 #endif
-		list_add (&page->page_list, &pool->page_list);
+		list_add(&page->page_list, &pool->page_list);
 		page->in_use = 0;
 	} else {
-		kfree (page);
+		kfree(page);
 		page = NULL;
 	}
 	return page;
 }
 
-
-static inline int
-is_page_busy (int blocks, unsigned long *bitmap)
+static inline int is_page_busy(int blocks, unsigned long *bitmap)
 {
 	while (blocks > 0) {
 		if (*bitmap++ != ~0UL)
@@ -206,20 +204,18 @@ is_page_busy (int blocks, unsigned long *bitmap)
 	return 0;
 }
 
-static void
-pool_free_page (struct dma_pool *pool, struct dma_page *page)
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
 {
-	dma_addr_t	dma = page->dma;
+	dma_addr_t dma = page->dma;
 
 #ifdef	CONFIG_DEBUG_SLAB
-	memset (page->vaddr, POOL_POISON_FREED, pool->allocation);
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
 #endif
-	dma_free_coherent (pool->dev, pool->allocation, page->vaddr, dma);
-	list_del (&page->page_list);
-	kfree (page);
+	dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+	list_del(&page->page_list);
+	kfree(page);
 }
 
-
 /**
  * dma_pool_destroy - destroys a pool of dma memory blocks.
  * @pool: dma pool that will be destroyed
@@ -228,36 +224,37 @@ pool_free_page (struct dma_pool *pool, struct dma_page *page)
  * Caller guarantees that no more memory from the pool is in use,
  * and that nothing will try to use the pool after this call.
  */
-void
-dma_pool_destroy (struct dma_pool *pool)
+void dma_pool_destroy(struct dma_pool *pool)
 {
 	mutex_lock(&pools_lock);
-	list_del (&pool->pools);
-	if (pool->dev && list_empty (&pool->dev->dma_pools))
-		device_remove_file (pool->dev, &dev_attr_pools);
+	list_del(&pool->pools);
+	if (pool->dev && list_empty(&pool->dev->dma_pools))
+		device_remove_file(pool->dev, &dev_attr_pools);
 	mutex_unlock(&pools_lock);
 
-	while (!list_empty (&pool->page_list)) {
-		struct dma_page		*page;
-		page = list_entry (pool->page_list.next,
-				struct dma_page, page_list);
-		if (is_page_busy (pool->blocks_per_page, page->bitmap)) {
+	while (!list_empty(&pool->page_list)) {
+		struct dma_page *page;
+		page = list_entry(pool->page_list.next,
+				  struct dma_page, page_list);
+		if (is_page_busy(pool->blocks_per_page, page->bitmap)) {
 			if (pool->dev)
-				dev_err(pool->dev, "dma_pool_destroy %s, %p busy\n",
+				dev_err(pool->dev,
+					"dma_pool_destroy %s, %p busy\n",
 					pool->name, page->vaddr);
 			else
-				printk (KERN_ERR "dma_pool_destroy %s, %p busy\n",
-					pool->name, page->vaddr);
+				printk(KERN_ERR
+				       "dma_pool_destroy %s, %p busy\n",
+				       pool->name, page->vaddr);
 			/* leak the still-in-use consistent memory */
-			list_del (&page->page_list);
-			kfree (page);
+			list_del(&page->page_list);
+			kfree(page);
 		} else
-			pool_free_page (pool, page);
+			pool_free_page(pool, page);
 	}
 
-	kfree (pool);
+	kfree(pool);
 }
-
+EXPORT_SYMBOL(dma_pool_destroy);
 
 /**
  * dma_pool_alloc - get a block of consistent memory
@@ -269,73 +266,72 @@ dma_pool_destroy (struct dma_pool *pool)
  * and reports its dma address through the handle.
  * If such a memory block can't be allocated, null is returned.
  */
-void *
-dma_pool_alloc (struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+		     dma_addr_t *handle)
 {
-	unsigned long		flags;
-	struct dma_page		*page;
-	int			map, block;
-	size_t			offset;
-	void			*retval;
-
-restart:
-	spin_lock_irqsave (&pool->lock, flags);
+	unsigned long flags;
+	struct dma_page *page;
+	int map, block;
+	size_t offset;
+	void *retval;
+
+ restart:
+	spin_lock_irqsave(&pool->lock, flags);
 	list_for_each_entry(page, &pool->page_list, page_list) {
-		int		i;
+		int i;
 		/* only cachable accesses here ... */
 		for (map = 0, i = 0;
-				i < pool->blocks_per_page;
-				i += BITS_PER_LONG, map++) {
-			if (page->bitmap [map] == 0)
+		     i < pool->blocks_per_page; i += BITS_PER_LONG, map++) {
+			if (page->bitmap[map] == 0)
 				continue;
-			block = ffz (~ page->bitmap [map]);
+			block = ffz(~page->bitmap[map]);
 			if ((i + block) < pool->blocks_per_page) {
-				clear_bit (block, &page->bitmap [map]);
+				clear_bit(block, &page->bitmap[map]);
 				offset = (BITS_PER_LONG * map) + block;
 				offset *= pool->size;
 				goto ready;
 			}
 		}
 	}
-	if (!(page = pool_alloc_page (pool, GFP_ATOMIC))) {
+	page = pool_alloc_page(pool, GFP_ATOMIC);
+	if (!page) {
 		if (mem_flags & __GFP_WAIT) {
-			DECLARE_WAITQUEUE (wait, current);
+			DECLARE_WAITQUEUE(wait, current);
 
 			__set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue (&pool->waitq, &wait);
-			spin_unlock_irqrestore (&pool->lock, flags);
+			add_wait_queue(&pool->waitq, &wait);
+			spin_unlock_irqrestore(&pool->lock, flags);
 
-			schedule_timeout (POOL_TIMEOUT_JIFFIES);
+			schedule_timeout(POOL_TIMEOUT_JIFFIES);
 
-			remove_wait_queue (&pool->waitq, &wait);
+			remove_wait_queue(&pool->waitq, &wait);
 			goto restart;
 		}
 		retval = NULL;
 		goto done;
 	}
 
-	clear_bit (0, &page->bitmap [0]);
+	clear_bit(0, &page->bitmap[0]);
 	offset = 0;
-ready:
+ ready:
 	page->in_use++;
 	retval = offset + page->vaddr;
 	*handle = offset + page->dma;
 #ifdef	CONFIG_DEBUG_SLAB
-	memset (retval, POOL_POISON_ALLOCATED, pool->size);
+	memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
-done:
-	spin_unlock_irqrestore (&pool->lock, flags);
+ done:
+	spin_unlock_irqrestore(&pool->lock, flags);
 	return retval;
 }
+EXPORT_SYMBOL(dma_pool_alloc);
 
-
-static struct dma_page *
-pool_find_page (struct dma_pool *pool, dma_addr_t dma)
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
 {
-	unsigned long		flags;
-	struct dma_page		*page;
+	unsigned long flags;
+	struct dma_page *page;
 
-	spin_lock_irqsave (&pool->lock, flags);
+	spin_lock_irqsave(&pool->lock, flags);
 	list_for_each_entry(page, &pool->page_list, page_list) {
 		if (dma < page->dma)
 			continue;
@@ -343,12 +339,11 @@ pool_find_page (struct dma_pool *pool, dma_addr_t dma)
 			goto done;
 	}
 	page = NULL;
-done:
-	spin_unlock_irqrestore (&pool->lock, flags);
+ done:
+	spin_unlock_irqrestore(&pool->lock, flags);
 	return page;
 }
 
-
 /**
  * dma_pool_free - put block back into dma pool
  * @pool: the dma pool holding the block
@@ -358,20 +353,21 @@ done:
  * Caller promises neither device nor driver will again touch this block
  * unless it is first re-allocated.
  */
-void
-dma_pool_free (struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 {
-	struct dma_page		*page;
-	unsigned long		flags;
-	int			map, block;
+	struct dma_page *page;
+	unsigned long flags;
+	int map, block;
 
-	if ((page = pool_find_page(pool, dma)) == NULL) {
+	page = pool_find_page(pool, dma);
+	if (!page) {
 		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n",
-				pool->name, vaddr, (unsigned long) dma);
+			dev_err(pool->dev,
+				"dma_pool_free %s, %p/%lx (bad dma)\n",
+				pool->name, vaddr, (unsigned long)dma);
 		else
-			printk (KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
-				pool->name, vaddr, (unsigned long) dma);
+			printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+			       pool->name, vaddr, (unsigned long)dma);
 		return;
 	}
 
@@ -383,37 +379,42 @@ dma_pool_free (struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 #ifdef	CONFIG_DEBUG_SLAB
 	if (((dma - page->dma) + (void *)page->vaddr) != vaddr) {
 		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
-				pool->name, vaddr, (unsigned long long) dma);
+			dev_err(pool->dev,
+				"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+				pool->name, vaddr, (unsigned long long)dma);
 		else
-			printk (KERN_ERR "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
-				pool->name, vaddr, (unsigned long long) dma);
+			printk(KERN_ERR
+			       "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+			       pool->name, vaddr, (unsigned long long)dma);
 		return;
 	}
-	if (page->bitmap [map] & (1UL << block)) {
+	if (page->bitmap[map] & (1UL << block)) {
 		if (pool->dev)
-			dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+			dev_err(pool->dev,
+				"dma_pool_free %s, dma %Lx already free\n",
 				pool->name, (unsigned long long)dma);
 		else
-			printk (KERN_ERR "dma_pool_free %s, dma %Lx already free\n",
-				pool->name, (unsigned long long)dma);
+			printk(KERN_ERR
+			       "dma_pool_free %s, dma %Lx already free\n",
+			       pool->name, (unsigned long long)dma);
 		return;
 	}
-	memset (vaddr, POOL_POISON_FREED, pool->size);
+	memset(vaddr, POOL_POISON_FREED, pool->size);
 #endif
 
-	spin_lock_irqsave (&pool->lock, flags);
+	spin_lock_irqsave(&pool->lock, flags);
 	page->in_use--;
-	set_bit (block, &page->bitmap [map]);
-	if (waitqueue_active (&pool->waitq))
-		wake_up (&pool->waitq);
+	set_bit(block, &page->bitmap[map]);
+	if (waitqueue_active(&pool->waitq))
+		wake_up(&pool->waitq);
 	/*
 	 * Resist a temptation to do
 	 *    if (!is_page_busy(bpp, page->bitmap)) pool_free_page(pool, page);
 	 * Better have a few empty pages hang around.
 	 */
-	spin_unlock_irqrestore (&pool->lock, flags);
+	spin_unlock_irqrestore(&pool->lock, flags);
 }
+EXPORT_SYMBOL(dma_pool_free);
 
 /*
  * Managed DMA pool
@@ -458,6 +459,7 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
 
 	return pool;
 }
+EXPORT_SYMBOL(dmam_pool_create);
 
 /**
  * dmam_pool_destroy - Managed dma_pool_destroy()
@@ -472,10 +474,4 @@ void dmam_pool_destroy(struct dma_pool *pool)
 	dma_pool_destroy(pool);
 	WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
 }
-
-EXPORT_SYMBOL (dma_pool_create);
-EXPORT_SYMBOL (dma_pool_destroy);
-EXPORT_SYMBOL (dma_pool_alloc);
-EXPORT_SYMBOL (dma_pool_free);
-EXPORT_SYMBOL (dmam_pool_create);
-EXPORT_SYMBOL (dmam_pool_destroy);
+EXPORT_SYMBOL(dmam_pool_destroy);
-- 
cgit v1.1


From 2cae367e4854ff055c4f5e8aacd56b0eeec9f6cb Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 12:09:33 -0500
Subject: Avoid taking waitqueue lock in dmapool

With one trivial change (taking the lock slightly earlier on wakeup
from schedule), all uses of the waitq are under the pool lock, so we
can use the locked (or __) versions of the wait queue functions, and
avoid the extra spinlock.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 mm/dmapool.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 92e886d..b5ff9ce 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -275,8 +275,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	size_t offset;
 	void *retval;
 
- restart:
 	spin_lock_irqsave(&pool->lock, flags);
+ restart:
 	list_for_each_entry(page, &pool->page_list, page_list) {
 		int i;
 		/* only cachable accesses here ... */
@@ -299,12 +299,13 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 			DECLARE_WAITQUEUE(wait, current);
 
 			__set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue(&pool->waitq, &wait);
+			__add_wait_queue(&pool->waitq, &wait);
 			spin_unlock_irqrestore(&pool->lock, flags);
 
 			schedule_timeout(POOL_TIMEOUT_JIFFIES);
 
-			remove_wait_queue(&pool->waitq, &wait);
+			spin_lock_irqsave(&pool->lock, flags);
+			__remove_wait_queue(&pool->waitq, &wait);
 			goto restart;
 		}
 		retval = NULL;
@@ -406,7 +407,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	page->in_use--;
 	set_bit(block, &page->bitmap[map]);
 	if (waitqueue_active(&pool->waitq))
-		wake_up(&pool->waitq);
+		wake_up_locked(&pool->waitq);
 	/*
 	 * Resist a temptation to do
 	 *    if (!is_page_busy(bpp, page->bitmap)) pool_free_page(pool, page);
-- 
cgit v1.1


From 399154be2dcb6a58dbde9682162c38113cf3e40b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 12:10:24 -0500
Subject: dmapool: Validate parameters to dma_pool_create

Check that 'align' is a power of two, like the API specifies.
Align 'size' to 'align' correctly -- the current code has an off-by-one.
The ALIGN macro in kernel.h doesn't.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 mm/dmapool.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index b5ff9ce..744d541 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -106,17 +106,18 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 {
 	struct dma_pool *retval;
 
-	if (align == 0)
+	if (align == 0) {
 		align = 1;
-	if (size == 0)
+	} else if (align & (align - 1)) {
 		return NULL;
-	else if (size < align)
-		size = align;
-	else if ((size % align) != 0) {
-		size += align + 1;
-		size &= ~(align - 1);
 	}
 
+	if (size == 0)
+		return NULL;
+
+	if ((size % align) != 0)
+		size = ALIGN(size, align);
+
 	if (allocation == 0) {
 		if (PAGE_SIZE < size)
 			allocation = size;
-- 
cgit v1.1


From 6182a0943af2235756836ed7e021fa22b93ec68b Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 12:16:57 -0500
Subject: dmapool: Tidy up includes and add comments

We were missing a copyright statement and license, so add GPLv2, David
Brownell's copyright and my copyright.

The asm/io.h include was superfluous, but we were missing a few other
necessary includes.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 mm/dmapool.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 744d541..e2ea454 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -1,19 +1,39 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ *   Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device.  It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple.  The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages.  Each page in the page_list is split into blocks of at
+ * least 'size' bytes.
+ */
 
 #include <linux/device.h>
-#include <linux/mm.h>
-#include <asm/io.h>		/* Needed for i386 to build */
 #include <linux/dma-mapping.h>
 #include <linux/dmapool.h>
-#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/poison.h>
 #include <linux/sched.h>
-
-/*
- * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
- * small blocks are easily used by drivers for bus mastering controllers.
- * This should probably be sharing the guts of the slab allocator.
- */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
 
 struct dma_pool {		/* the pool */
 	struct list_head page_list;
@@ -265,7 +285,7 @@ EXPORT_SYMBOL(dma_pool_destroy);
  *
  * This returns the kernel virtual address of a currently unused block,
  * and reports its dma address through the handle.
- * If such a memory block can't be allocated, null is returned.
+ * If such a memory block can't be allocated, %NULL is returned.
  */
 void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 		     dma_addr_t *handle)
-- 
cgit v1.1


From a35a3455142976e3fffdf27027f3082cbaba6e8c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 14:08:28 -0500
Subject: Change dmapool free block management

Use a list of free blocks within a page instead of using a bitmap.
Update documentation to reflect this.  As well as being a slight
reduction in memory allocation, locked ops and lines of code, it speeds
up a transaction processing benchmark by 0.4%.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 mm/dmapool.c | 119 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 58 insertions(+), 61 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index e2ea454..72e7ece 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -17,7 +17,9 @@
  * The current design of this allocator is fairly simple.  The pool is
  * represented by the 'struct dma_pool' which keeps a doubly-linked list of
  * allocated pages.  Each page in the page_list is split into blocks of at
- * least 'size' bytes.
+ * least 'size' bytes.  Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page.  Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
  */
 
 #include <linux/device.h>
@@ -38,7 +40,6 @@
 struct dma_pool {		/* the pool */
 	struct list_head page_list;
 	spinlock_t lock;
-	size_t blocks_per_page;
 	size_t size;
 	struct device *dev;
 	size_t allocation;
@@ -51,8 +52,8 @@ struct dma_page {		/* cacheable header for 'allocation' bytes */
 	struct list_head page_list;
 	void *vaddr;
 	dma_addr_t dma;
-	unsigned in_use;
-	unsigned long bitmap[0];
+	unsigned int in_use;
+	unsigned int offset;
 };
 
 #define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
@@ -87,8 +88,8 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
 
 		/* per-pool info, no real statistics yet */
 		temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
-				 pool->name,
-				 blocks, pages * pool->blocks_per_page,
+				 pool->name, blocks,
+				 pages * (pool->allocation / pool->size),
 				 pool->size, pages);
 		size -= temp;
 		next += temp;
@@ -132,8 +133,11 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 		return NULL;
 	}
 
-	if (size == 0)
+	if (size == 0) {
 		return NULL;
+	} else if (size < 4) {
+		size = 4;
+	}
 
 	if ((size % align) != 0)
 		size = ALIGN(size, align);
@@ -160,7 +164,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	spin_lock_init(&retval->lock);
 	retval->size = size;
 	retval->allocation = allocation;
-	retval->blocks_per_page = allocation / size;
 	init_waitqueue_head(&retval->waitq);
 
 	if (dev) {
@@ -186,28 +189,36 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 }
 EXPORT_SYMBOL(dma_pool_create);
 
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+	unsigned int offset = 0;
+
+	do {
+		unsigned int next = offset + pool->size;
+		if (unlikely((next + pool->size) >= pool->allocation))
+			next = pool->allocation;
+		*(int *)(page->vaddr + offset) = next;
+		offset = next;
+	} while (offset < pool->allocation);
+}
+
 static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 {
 	struct dma_page *page;
-	int mapsize;
-
-	mapsize = pool->blocks_per_page;
-	mapsize = (mapsize + BITS_PER_LONG - 1) / BITS_PER_LONG;
-	mapsize *= sizeof(long);
 
-	page = kmalloc(mapsize + sizeof *page, mem_flags);
+	page = kmalloc(sizeof(*page), mem_flags);
 	if (!page)
 		return NULL;
-	page->vaddr = dma_alloc_coherent(pool->dev,
-					 pool->allocation,
+	page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
 					 &page->dma, mem_flags);
 	if (page->vaddr) {
-		memset(page->bitmap, 0xff, mapsize);	/* bit set == free */
 #ifdef	CONFIG_DEBUG_SLAB
 		memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
 #endif
+		pool_initialise_page(pool, page);
 		list_add(&page->page_list, &pool->page_list);
 		page->in_use = 0;
+		page->offset = 0;
 	} else {
 		kfree(page);
 		page = NULL;
@@ -215,14 +226,9 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 	return page;
 }
 
-static inline int is_page_busy(int blocks, unsigned long *bitmap)
+static inline int is_page_busy(struct dma_page *page)
 {
-	while (blocks > 0) {
-		if (*bitmap++ != ~0UL)
-			return 1;
-		blocks -= BITS_PER_LONG;
-	}
-	return 0;
+	return page->in_use != 0;
 }
 
 static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
@@ -257,7 +263,7 @@ void dma_pool_destroy(struct dma_pool *pool)
 		struct dma_page *page;
 		page = list_entry(pool->page_list.next,
 				  struct dma_page, page_list);
-		if (is_page_busy(pool->blocks_per_page, page->bitmap)) {
+		if (is_page_busy(page)) {
 			if (pool->dev)
 				dev_err(pool->dev,
 					"dma_pool_destroy %s, %p busy\n",
@@ -292,27 +298,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 {
 	unsigned long flags;
 	struct dma_page *page;
-	int map, block;
 	size_t offset;
 	void *retval;
 
 	spin_lock_irqsave(&pool->lock, flags);
  restart:
 	list_for_each_entry(page, &pool->page_list, page_list) {
-		int i;
-		/* only cachable accesses here ... */
-		for (map = 0, i = 0;
-		     i < pool->blocks_per_page; i += BITS_PER_LONG, map++) {
-			if (page->bitmap[map] == 0)
-				continue;
-			block = ffz(~page->bitmap[map]);
-			if ((i + block) < pool->blocks_per_page) {
-				clear_bit(block, &page->bitmap[map]);
-				offset = (BITS_PER_LONG * map) + block;
-				offset *= pool->size;
-				goto ready;
-			}
-		}
+		if (page->offset < pool->allocation)
+			goto ready;
 	}
 	page = pool_alloc_page(pool, GFP_ATOMIC);
 	if (!page) {
@@ -333,10 +326,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 		goto done;
 	}
 
-	clear_bit(0, &page->bitmap[0]);
-	offset = 0;
  ready:
 	page->in_use++;
+	offset = page->offset;
+	page->offset = *(int *)(page->vaddr + offset);
 	retval = offset + page->vaddr;
 	*handle = offset + page->dma;
 #ifdef	CONFIG_DEBUG_SLAB
@@ -379,7 +372,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 {
 	struct dma_page *page;
 	unsigned long flags;
-	int map, block;
+	unsigned int offset;
 
 	page = pool_find_page(pool, dma);
 	if (!page) {
@@ -393,13 +386,9 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 		return;
 	}
 
-	block = dma - page->dma;
-	block /= pool->size;
-	map = block / BITS_PER_LONG;
-	block %= BITS_PER_LONG;
-
+	offset = vaddr - page->vaddr;
 #ifdef	CONFIG_DEBUG_SLAB
-	if (((dma - page->dma) + (void *)page->vaddr) != vaddr) {
+	if ((dma - page->dma) != offset) {
 		if (pool->dev)
 			dev_err(pool->dev,
 				"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -410,28 +399,36 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 			       pool->name, vaddr, (unsigned long long)dma);
 		return;
 	}
-	if (page->bitmap[map] & (1UL << block)) {
-		if (pool->dev)
-			dev_err(pool->dev,
-				"dma_pool_free %s, dma %Lx already free\n",
-				pool->name, (unsigned long long)dma);
-		else
-			printk(KERN_ERR
-			       "dma_pool_free %s, dma %Lx already free\n",
-			       pool->name, (unsigned long long)dma);
-		return;
+	{
+		unsigned int chain = page->offset;
+		while (chain < pool->allocation) {
+			if (chain != offset) {
+				chain = *(int *)(page->vaddr + chain);
+				continue;
+			}
+			if (pool->dev)
+				dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+					"already free\n", pool->name,
+					(unsigned long long)dma);
+			else
+				printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+					"already free\n", pool->name,
+					(unsigned long long)dma);
+			return;
+		}
 	}
 	memset(vaddr, POOL_POISON_FREED, pool->size);
 #endif
 
 	spin_lock_irqsave(&pool->lock, flags);
 	page->in_use--;
-	set_bit(block, &page->bitmap[map]);
+	*(int *)vaddr = page->offset;
+	page->offset = offset;
 	if (waitqueue_active(&pool->waitq))
 		wake_up_locked(&pool->waitq);
 	/*
 	 * Resist a temptation to do
-	 *    if (!is_page_busy(bpp, page->bitmap)) pool_free_page(pool, page);
+	 *    if (!is_page_busy(page)) pool_free_page(pool, page);
 	 * Better have a few empty pages hang around.
 	 */
 	spin_unlock_irqrestore(&pool->lock, flags);
-- 
cgit v1.1


From e34f44b3517fe545f7fd45a8c2f6ee1e5e4432d3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Mon, 3 Dec 2007 14:16:24 -0500
Subject: pool: Improve memory usage for devices which can't cross boundaries

The previous implementation simply refused to allocate more than a
boundary's worth of data from an entire page.  Some users didn't know
this, so specified things like SMP_CACHE_BYTES, not realising the
horrible waste of memory that this was.  It's fairly easy to correct
this problem, just by ensuring we don't cross a boundary within a page.
This even helps drivers like EHCI (which can't cross a 4k boundary)
on machines with larger page sizes.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 mm/dmapool.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 72e7ece..34aaac4 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -43,6 +43,7 @@ struct dma_pool {		/* the pool */
 	size_t size;
 	struct device *dev;
 	size_t allocation;
+	size_t boundary;
 	char name[32];
 	wait_queue_head_t waitq;
 	struct list_head pools;
@@ -107,7 +108,7 @@ static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
  * @dev: device that will be doing the DMA
  * @size: size of the blocks in this pool.
  * @align: alignment requirement for blocks; must be a power of two
- * @allocation: returned blocks won't cross this boundary (or zero)
+ * @boundary: returned blocks won't cross this power of two boundary
  * Context: !in_interrupt()
  *
  * Returns a dma allocation pool with the requested characteristics, or
@@ -117,15 +118,16 @@ static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
  * cache flushing primitives.  The actual size of blocks allocated may be
  * larger than requested because of alignment.
  *
- * If allocation is nonzero, objects returned from dma_pool_alloc() won't
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
  * cross that size boundary.  This is useful for devices which have
  * addressing restrictions on individual DMA transfers, such as not crossing
  * boundaries of 4KBytes.
  */
 struct dma_pool *dma_pool_create(const char *name, struct device *dev,
-				 size_t size, size_t align, size_t allocation)
+				 size_t size, size_t align, size_t boundary)
 {
 	struct dma_pool *retval;
+	size_t allocation;
 
 	if (align == 0) {
 		align = 1;
@@ -142,27 +144,26 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	if ((size % align) != 0)
 		size = ALIGN(size, align);
 
-	if (allocation == 0) {
-		if (PAGE_SIZE < size)
-			allocation = size;
-		else
-			allocation = PAGE_SIZE;
-		/* FIXME: round up for less fragmentation */
-	} else if (allocation < size)
+	allocation = max_t(size_t, size, PAGE_SIZE);
+
+	if (!boundary) {
+		boundary = allocation;
+	} else if ((boundary < size) || (boundary & (boundary - 1))) {
 		return NULL;
+	}
 
-	if (!
-	    (retval =
-	     kmalloc_node(sizeof *retval, GFP_KERNEL, dev_to_node(dev))))
+	retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+	if (!retval)
 		return retval;
 
-	strlcpy(retval->name, name, sizeof retval->name);
+	strlcpy(retval->name, name, sizeof(retval->name));
 
 	retval->dev = dev;
 
 	INIT_LIST_HEAD(&retval->page_list);
 	spin_lock_init(&retval->lock);
 	retval->size = size;
+	retval->boundary = boundary;
 	retval->allocation = allocation;
 	init_waitqueue_head(&retval->waitq);
 
@@ -192,11 +193,14 @@ EXPORT_SYMBOL(dma_pool_create);
 static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
 {
 	unsigned int offset = 0;
+	unsigned int next_boundary = pool->boundary;
 
 	do {
 		unsigned int next = offset + pool->size;
-		if (unlikely((next + pool->size) >= pool->allocation))
-			next = pool->allocation;
+		if (unlikely((next + pool->size) >= next_boundary)) {
+			next = next_boundary;
+			next_boundary += pool->boundary;
+		}
 		*(int *)(page->vaddr + offset) = next;
 		offset = next;
 	} while (offset < pool->allocation);
-- 
cgit v1.1


From 0df29025fd0379d5950d206314d0b10a2c8a9607 Mon Sep 17 00:00:00 2001
From: Doug Chapman <doug.chapman@hp.com>
Date: Mon, 28 Jan 2008 15:33:28 -0800
Subject: [IA64] fix userspace compile error in gcc_intrin.h

Fixes userspace build errors when linux/ipv6.h is included such as in the
dhcpv6 package under fedora.  Likely causes other userspace build errors as
well.  I found this in akpm's tree from 2.6.18 but could not find any case
of anyone proposing it for the main tree.

Signed-off-by: Doug Chapman <doug.chapman@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/gcc_intrin.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/asm-ia64/gcc_intrin.h b/include/asm-ia64/gcc_intrin.h
index 5b6665c..de2ed2c 100644
--- a/include/asm-ia64/gcc_intrin.h
+++ b/include/asm-ia64/gcc_intrin.h
@@ -24,7 +24,9 @@
 extern void ia64_bad_param_for_setreg (void);
 extern void ia64_bad_param_for_getreg (void);
 
+#ifdef __KERNEL__
 register unsigned long ia64_r13 asm ("r13") __used;
+#endif
 
 #define ia64_setreg(regnum, val)						\
 ({										\
-- 
cgit v1.1


From a23fe55e132cd85108ab55b3fafb4b5060d847c7 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Tue, 22 Jan 2008 20:42:07 +0100
Subject: [IA64] constify function pointer tables

Signed-off-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/hp/common/sba_iommu.c     | 2 +-
 arch/ia64/kernel/perfmon.c          | 2 +-
 arch/ia64/kernel/setup.c            | 2 +-
 arch/ia64/sn/kernel/sn2/sn2_smp.c   | 2 +-
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 45bf04e..8c0ae4f 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1871,7 +1871,7 @@ ioc_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static struct seq_operations ioc_seq_ops = {
+static const struct seq_operations ioc_seq_ops = {
 	.start = ioc_start,
 	.next  = ioc_next,
 	.stop  = ioc_stop,
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5ae177f..48e5609 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5795,7 +5795,7 @@ pfm_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-struct seq_operations pfm_seq_ops = {
+const struct seq_operations pfm_seq_ops = {
 	.start =	pfm_proc_start,
  	.next =		pfm_proc_next,
  	.stop =		pfm_proc_stop,
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 86028c6..ebd1a09 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -654,7 +654,7 @@ c_stop (struct seq_file *m, void *v)
 {
 }
 
-struct seq_operations cpuinfo_op = {
+const struct seq_operations cpuinfo_op = {
 	.start =	c_start,
 	.next =		c_next,
 	.stop =		c_stop,
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index f3c6932..dfc6bf1 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -523,7 +523,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
 	return count;
 }
 
-static struct seq_operations sn2_ptc_seq_ops = {
+static const struct seq_operations sn2_ptc_seq_ops = {
 	.start = sn2_ptc_seq_start,
 	.next = sn2_ptc_seq_next,
 	.stop = sn2_ptc_seq_stop,
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 1a8e496..ca88274 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -577,7 +577,7 @@ static void sn_topology_stop(struct seq_file *m, void *v)
 /*
  * /proc/sgi_sn/sn_topology, read-only using seq_file
  */
-static struct seq_operations sn_topology_seq_ops = {
+static const struct seq_operations sn_topology_seq_ops = {
 	.start = sn_topology_start,
 	.next = sn_topology_next,
 	.stop = sn_topology_stop,
-- 
cgit v1.1


From 97075c4b3b7fdd6a083eea075c3a4a601f0d64d8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 18 Jan 2008 11:20:46 -0500
Subject: [IA64] Fix the order of atomic operations in restore_previous_kprobes
 on ia64

Fix the order of atomic operations to prevent overwriting prev_kprobe[0].
To pop values from stack, we must decrement stack index right AFTER
reading values.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/kprobes.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index fc4d267..b618487 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -381,9 +381,10 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	unsigned int i;
-	i = atomic_sub_return(1, &kcb->prev_kprobe_index);
-	__get_cpu_var(current_kprobe) = kcb->prev_kprobe[i].kp;
-	kcb->kprobe_status = kcb->prev_kprobe[i].status;
+	i = atomic_read(&kcb->prev_kprobe_index);
+	__get_cpu_var(current_kprobe) = kcb->prev_kprobe[i-1].kp;
+	kcb->kprobe_status = kcb->prev_kprobe[i-1].status;
+	atomic_sub(1, &kcb->prev_kprobe_index);
 }
 
 static void __kprobes set_current_kprobe(struct kprobe *p,
-- 
cgit v1.1


From 5302ac5019367470e123cb91844a28d6941e6912 Mon Sep 17 00:00:00 2001
From: Zoltan Menyhart <Zoltan.Menyhart@bull.net>
Date: Mon, 4 Feb 2008 15:19:16 -0800
Subject: [IA64] Slim-down __clear_bit_unlock

- I removed the unnecessary barrier() from __clear_bit_unlock().
  ia64_st4_rel_nta() makes sure all the modifications are globally
  seen before the bit is seen to be off.
- I made __clear_bit() modeled after __set_bit() and __change_bit().
- I corrected some comments sating that a memory barrier is provided,
  yet in reality, it is the acquisition side of the memory barrier only.
- I corrected some comments, e.g. test_and_clear_bit() was peaking
  about "bit to set".

Signed-off-by: Zoltan Menyhart, <Zoltan.Menyhart@bull.net>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/bitops.h | 50 ++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index a1b9719..953d3df 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -122,38 +122,40 @@ clear_bit_unlock (int nr, volatile void *addr)
 }
 
 /**
- * __clear_bit_unlock - Non-atomically clear a bit with release
+ * __clear_bit_unlock - Non-atomically clears a bit in memory with release
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
  *
- * This is like clear_bit_unlock, but the implementation uses a store
+ * Similarly to clear_bit_unlock, the implementation uses a store
  * with release semantics. See also __raw_spin_unlock().
  */
 static __inline__ void
-__clear_bit_unlock(int nr, volatile void *addr)
+__clear_bit_unlock(int nr, void *addr)
 {
-	__u32 mask, new;
-	volatile __u32 *m;
+	__u32 * const m = (__u32 *) addr + (nr >> 5);
+	__u32 const new = *m & ~(1 << (nr & 31));
 
-	m = (volatile __u32 *)addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	new = *m & mask;
-	barrier();
 	ia64_st4_rel_nta(m, new);
 }
 
 /**
  * __clear_bit - Clears a bit in memory (non-atomic version)
+ * @nr: the bit to clear
+ * @addr: the address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
  */
 static __inline__ void
 __clear_bit (int nr, volatile void *addr)
 {
-	volatile __u32 *p = (__u32 *) addr + (nr >> 5);
-	__u32 m = 1 << (nr & 31);
-	*p &= ~m;
+	*((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31));
 }
 
 /**
  * change_bit - Toggle a bit in memory
- * @nr: Bit to clear
+ * @nr: Bit to toggle
  * @addr: Address to start counting from
  *
  * change_bit() is atomic and may not be reordered.
@@ -178,7 +180,7 @@ change_bit (int nr, volatile void *addr)
 
 /**
  * __change_bit - Toggle a bit in memory
- * @nr: the bit to set
+ * @nr: the bit to toggle
  * @addr: the address to start counting from
  *
  * Unlike change_bit(), this function is non-atomic and may be reordered.
@@ -197,7 +199,7 @@ __change_bit (int nr, volatile void *addr)
  * @addr: Address to count from
  *
  * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
+ * It also implies the acquisition side of the memory barrier.
  */
 static __inline__ int
 test_and_set_bit (int nr, volatile void *addr)
@@ -247,11 +249,11 @@ __test_and_set_bit (int nr, volatile void *addr)
 
 /**
  * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to set
+ * @nr: Bit to clear
  * @addr: Address to count from
  *
  * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
+ * It also implies the acquisition side of the memory barrier.
  */
 static __inline__ int
 test_and_clear_bit (int nr, volatile void *addr)
@@ -272,7 +274,7 @@ test_and_clear_bit (int nr, volatile void *addr)
 
 /**
  * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to set
+ * @nr: Bit to clear
  * @addr: Address to count from
  *
  * This operation is non-atomic and can be reordered.  
@@ -292,11 +294,11 @@ __test_and_clear_bit(int nr, volatile void * addr)
 
 /**
  * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to set
+ * @nr: Bit to change
  * @addr: Address to count from
  *
  * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
+ * It also implies the acquisition side of the memory barrier.
  */
 static __inline__ int
 test_and_change_bit (int nr, volatile void *addr)
@@ -315,8 +317,12 @@ test_and_change_bit (int nr, volatile void *addr)
 	return (old & bit) != 0;
 }
 
-/*
- * WARNING: non atomic version.
+/**
+ * __test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
  */
 static __inline__ int
 __test_and_change_bit (int nr, void *addr)
-- 
cgit v1.1


From cdef24c9cd38ae236065409c4a6289f165639e55 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 4 Feb 2008 15:23:10 -0800
Subject: [IA64] aliasing-test: fix gcc warnings on non-ia64

Eliminate all build warnings.  OK, these build warnings are from
a build on x86_64.  When I build on ia64, I don't see warnings.

Now builds cleanly on ia64 and x86_64.

Documentation/ia64/aliasing-test.c: In function 'map_mem':
Documentation/ia64/aliasing-test.c:39: warning: implicit declaration of function 'ioctl'
Documentation/ia64/aliasing-test.c: In function 'scan_rom':
Documentation/ia64/aliasing-test.c:183: warning: format '%ld' expects type 'long int', but argument 4 has type 'int'
Documentation/ia64/aliasing-test.c: At top level:
Documentation/ia64/aliasing-test.c:208: warning: function declaration isn't a prototype
Documentation/ia64/aliasing-test.c: In function 'main':
Documentation/ia64/aliasing-test.c:259: warning: control reaches end of non-void function
Documentation/ia64/aliasing-test.c: In function 'scan_rom':
Documentation/ia64/aliasing-test.c:152: warning: 'rc' may be used uninitialized in this function
Documentation/ia64/aliasing-test.c: In function 'scan_tree':
Documentation/ia64/aliasing-test.c:68: warning: 'rc' may be used uninitialized in this function

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/ia64/aliasing-test.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c
index 773a814..d23610f 100644
--- a/Documentation/ia64/aliasing-test.c
+++ b/Documentation/ia64/aliasing-test.c
@@ -16,6 +16,7 @@
 #include <fcntl.h>
 #include <fnmatch.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
@@ -65,7 +66,7 @@ int scan_tree(char *path, char *file, off_t offset, size_t length, int touch)
 {
 	struct dirent **namelist;
 	char *name, *path2;
-	int i, n, r, rc, result = 0;
+	int i, n, r, rc = 0, result = 0;
 	struct stat buf;
 
 	n = scandir(path, &namelist, 0, alphasort);
@@ -113,7 +114,7 @@ skip:
 		free(namelist[i]);
 	}
 	free(namelist);
-	return rc;
+	return result;
 }
 
 char buf[1024];
@@ -149,7 +150,7 @@ int scan_rom(char *path, char *file)
 {
 	struct dirent **namelist;
 	char *name, *path2;
-	int i, n, r, rc, result = 0;
+	int i, n, r, rc = 0, result = 0;
 	struct stat buf;
 
 	n = scandir(path, &namelist, 0, alphasort);
@@ -180,7 +181,7 @@ int scan_rom(char *path, char *file)
 			 * important thing is that no MCA happened.
 			 */
 			if (rc > 0)
-				fprintf(stderr, "PASS: %s read %ld bytes\n", path2, rc);
+				fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
 			else {
 				fprintf(stderr, "PASS: %s not readable\n", path2);
 				return rc;
@@ -201,10 +202,10 @@ skip:
 		free(namelist[i]);
 	}
 	free(namelist);
-	return rc;
+	return result;
 }
 
-int main()
+int main(void)
 {
 	int rc;
 
@@ -256,4 +257,6 @@ int main()
 	scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0);
 	scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1);
 	scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0);
+
+	return rc;
 }
-- 
cgit v1.1


From 7d9aed26ed11d7a472104b7078b0c5e4fd416059 Mon Sep 17 00:00:00 2001
From: Aron Griffis <aron@hp.com>
Date: Mon, 4 Feb 2008 15:31:49 -0800
Subject: [IA64] Make efi.c mostly fit in 80 columns

This patch is purely whitespace changes to make the code fit in 80
columns, plus fix some inconsistent indentation.  The efi_guidcmp()
tests remain wider than 80-columns since that seems to be the most
clear.

Signed-off-by: Aron Griffis <aron@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/efi.c | 489 +++++++++++++++++++++++++++----------------------
 1 file changed, 266 insertions(+), 223 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 242d793..9e59109 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1,7 +1,8 @@
 /*
  * Extensible Firmware Interface
  *
- * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999
+ * Based on Extensible Firmware Interface Specification version 0.9
+ * April 30, 1999
  *
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
@@ -48,145 +49,157 @@ static unsigned long mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
 
 #define efi_call_virt(f, args...)	(*(f))(args)
 
-#define STUB_GET_TIME(prefix, adjust_arg)							  \
-static efi_status_t										  \
-prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)						  \
-{												  \
-	struct ia64_fpreg fr[6];								  \
-	efi_time_cap_t *atc = NULL;								  \
-	efi_status_t ret;									  \
-												  \
-	if (tc)											  \
-		atc = adjust_arg(tc);								  \
-	ia64_save_scratch_fpregs(fr);								  \
-	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \
-	ia64_load_scratch_fpregs(fr);								  \
-	return ret;										  \
+#define STUB_GET_TIME(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)			       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_time_cap_t *atc = NULL;					       \
+	efi_status_t ret;						       \
+									       \
+	if (tc)								       \
+		atc = adjust_arg(tc);					       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time),    \
+				adjust_arg(tm), atc);			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_SET_TIME(prefix, adjust_arg)							\
-static efi_status_t										\
-prefix##_set_time (efi_time_t *tm)								\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
+#define STUB_SET_TIME(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_set_time (efi_time_t *tm)					       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time),    \
+				adjust_arg(tm));			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm)		\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),	\
-				adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
+#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending,	       \
+			  efi_time_t *tm)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),      \
+		adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));     \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)					\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_time_t *atm = NULL;									\
-	efi_status_t ret;									\
-												\
-	if (tm)											\
-		atm = adjust_arg(tm);								\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),	\
-				enabled, atm);							\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
+#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)		       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_time_t *atm = NULL;						       \
+	efi_status_t ret;						       \
+									       \
+	if (tm)								       \
+		atm = adjust_arg(tm);					       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),      \
+		enabled, atm);						       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_GET_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t									\
-prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,		\
-		       unsigned long *data_size, void *data)				\
-{											\
-	struct ia64_fpreg fr[6];							\
-	u32 *aattr = NULL;									\
-	efi_status_t ret;								\
-											\
-	if (attr)									\
-		aattr = adjust_arg(attr);						\
-	ia64_save_scratch_fpregs(fr);							\
-	ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable),	\
-				adjust_arg(name), adjust_arg(vendor), aattr,		\
-				adjust_arg(data_size), adjust_arg(data));		\
-	ia64_load_scratch_fpregs(fr);							\
-	return ret;									\
+#define STUB_GET_VARIABLE(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,      \
+		       unsigned long *data_size, void *data)		       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	u32 *aattr = NULL;						       \
+	efi_status_t ret;						       \
+									       \
+	if (attr)							       \
+		aattr = adjust_arg(attr);				       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_variable_t *) __va(runtime->get_variable),	       \
+		adjust_arg(name), adjust_arg(vendor), aattr,		       \
+		adjust_arg(data_size), adjust_arg(data));		       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t										\
-prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor)	\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable),	\
-				adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
+#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)			       \
+static efi_status_t							       \
+prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name,      \
+			    efi_guid_t *vendor)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_get_next_variable_t *) __va(runtime->get_next_variable),  \
+		adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));  \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_SET_VARIABLE(prefix, adjust_arg)						\
-static efi_status_t									\
-prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr,	\
-		       unsigned long data_size, void *data)				\
-{											\
-	struct ia64_fpreg fr[6];							\
-	efi_status_t ret;								\
-											\
-	ia64_save_scratch_fpregs(fr);							\
-	ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable),	\
-				adjust_arg(name), adjust_arg(vendor), attr, data_size,	\
-				adjust_arg(data));					\
-	ia64_load_scratch_fpregs(fr);							\
-	return ret;									\
+#define STUB_SET_VARIABLE(prefix, adjust_arg)				       \
+static efi_status_t							       \
+prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor,		       \
+		       unsigned long attr, unsigned long data_size,	       \
+		       void *data)					       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix(					       \
+		(efi_set_variable_t *) __va(runtime->set_variable),	       \
+		adjust_arg(name), adjust_arg(vendor), attr, data_size,	       \
+		adjust_arg(data));					       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)					\
-static efi_status_t										\
-prefix##_get_next_high_mono_count (u32 *count)							\
-{												\
-	struct ia64_fpreg fr[6];								\
-	efi_status_t ret;									\
-												\
-	ia64_save_scratch_fpregs(fr);								\
-	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)				\
-				__va(runtime->get_next_high_mono_count), adjust_arg(count));	\
-	ia64_load_scratch_fpregs(fr);								\
-	return ret;										\
+#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)		       \
+static efi_status_t							       \
+prefix##_get_next_high_mono_count (u32 *count)				       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_status_t ret;						       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)	       \
+				__va(runtime->get_next_high_mono_count),       \
+				adjust_arg(count));			       \
+	ia64_load_scratch_fpregs(fr);					       \
+	return ret;							       \
 }
 
-#define STUB_RESET_SYSTEM(prefix, adjust_arg)					\
-static void									\
-prefix##_reset_system (int reset_type, efi_status_t status,			\
-		       unsigned long data_size, efi_char16_t *data)		\
-{										\
-	struct ia64_fpreg fr[6];						\
-	efi_char16_t *adata = NULL;						\
-										\
-	if (data)								\
-		adata = adjust_arg(data);					\
-										\
-	ia64_save_scratch_fpregs(fr);						\
-	efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system),	\
-			  reset_type, status, data_size, adata);		\
-	/* should not return, but just in case... */				\
-	ia64_load_scratch_fpregs(fr);						\
+#define STUB_RESET_SYSTEM(prefix, adjust_arg)				       \
+static void								       \
+prefix##_reset_system (int reset_type, efi_status_t status,		       \
+		       unsigned long data_size, efi_char16_t *data)	       \
+{									       \
+	struct ia64_fpreg fr[6];					       \
+	efi_char16_t *adata = NULL;					       \
+									       \
+	if (data)							       \
+		adata = adjust_arg(data);				       \
+									       \
+	ia64_save_scratch_fpregs(fr);					       \
+	efi_call_##prefix(						       \
+		(efi_reset_system_t *) __va(runtime->reset_system),	       \
+		reset_type, status, data_size, adata);			       \
+	/* should not return, but just in case... */			       \
+	ia64_load_scratch_fpregs(fr);					       \
 }
 
 #define phys_ptr(arg)	((__typeof__(arg)) ia64_tpa(arg))
@@ -223,7 +236,8 @@ efi_gettimeofday (struct timespec *ts)
 		return;
 	}
 
-	ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second);
+	ts->tv_sec = mktime(tm.year, tm.month, tm.day,
+			    tm.hour, tm.minute, tm.second);
 	ts->tv_nsec = tm.nanosecond;
 }
 
@@ -297,8 +311,8 @@ walk (efi_freemem_callback_t callback, void *arg, u64 attr)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
- * has memory that is available for OS use.
+ * Walks the EFI memory map and calls CALLBACK once for each EFI memory
+ * descriptor that has memory that is available for OS use.
  */
 void
 efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
@@ -307,8 +321,8 @@ efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
- * has memory that is available for uncached allocator.
+ * Walks the EFI memory map and calls CALLBACK once for each EFI memory
+ * descriptor that has memory that is available for uncached allocator.
  */
 void
 efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
@@ -321,7 +335,6 @@ efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
  * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
  * Abstraction Layer chapter 11 in ADAG
  */
-
 void *
 efi_get_pal_addr (void)
 {
@@ -341,32 +354,33 @@ efi_get_pal_addr (void)
 			continue;
 
 		if (++pal_code_count > 1) {
-			printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n",
-			       md->phys_addr);
+			printk(KERN_ERR "Too many EFI Pal Code memory ranges, "
+			       "dropped @ %lx\n", md->phys_addr);
 			continue;
 		}
 		/*
-		 * The only ITLB entry in region 7 that is used is the one installed by
-		 * __start().  That entry covers a 64MB range.
+		 * The only ITLB entry in region 7 that is used is the one
+		 * installed by __start().  That entry covers a 64MB range.
 		 */
 		mask  = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
 		vaddr = PAGE_OFFSET + md->phys_addr;
 
 		/*
-		 * We must check that the PAL mapping won't overlap with the kernel
-		 * mapping.
+		 * We must check that the PAL mapping won't overlap with the
+		 * kernel mapping.
 		 *
-		 * PAL code is guaranteed to be aligned on a power of 2 between 4k and
-		 * 256KB and that only one ITR is needed to map it. This implies that the
-		 * PAL code is always aligned on its size, i.e., the closest matching page
-		 * size supported by the TLB. Therefore PAL code is guaranteed never to
-		 * cross a 64MB unless it is bigger than 64MB (very unlikely!).  So for
-		 * now the following test is enough to determine whether or not we need a
-		 * dedicated ITR for the PAL code.
+		 * PAL code is guaranteed to be aligned on a power of 2 between
+		 * 4k and 256KB and that only one ITR is needed to map it. This
+		 * implies that the PAL code is always aligned on its size,
+		 * i.e., the closest matching page size supported by the TLB.
+		 * Therefore PAL code is guaranteed never to cross a 64MB unless
+		 * it is bigger than 64MB (very unlikely!).  So for now the
+		 * following test is enough to determine whether or not we need
+		 * a dedicated ITR for the PAL code.
 		 */
 		if ((vaddr & mask) == (KERNEL_START & mask)) {
-			printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
-			       __FUNCTION__);
+			printk(KERN_INFO "%s: no need to install ITR for "
+			       "PAL code\n", __FUNCTION__);
 			continue;
 		}
 
@@ -376,10 +390,11 @@ efi_get_pal_addr (void)
 #if EFI_DEBUG
 		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
 
-		printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
-			smp_processor_id(), md->phys_addr,
-			md->phys_addr + efi_md_size(md),
-			vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
+		printk(KERN_INFO "CPU %d: mapping PAL code "
+                       "[0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
+                       smp_processor_id(), md->phys_addr,
+                       md->phys_addr + efi_md_size(md),
+                       vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
 #endif
 		return __va(md->phys_addr);
 	}
@@ -401,7 +416,8 @@ efi_map_pal_code (void)
 	 * Cannot write to CRx with PSR.ic=1
 	 */
 	psr = ia64_clear_ic();
-	ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr),
+	ia64_itr(0x1, IA64_TR_PALCODE,
+		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
 		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
 		 IA64_GRANULE_SHIFT);
 	ia64_set_psr(psr);		/* restore psr */
@@ -418,7 +434,10 @@ efi_init (void)
 	char *cp, vendor[100] = "unknown";
 	int i;
 
-	/* it's too early to be able to use the standard kernel command line support... */
+	/*
+	 * it's too early to be able to use the standard kernel command line
+	 * support...
+	 */
 	for (cp = boot_command_line; *cp; ) {
 		if (memcmp(cp, "mem=", 4) == 0) {
 			mem_limit = memparse(cp + 4, &cp);
@@ -434,9 +453,11 @@ efi_init (void)
 		}
 	}
 	if (min_addr != 0UL)
-		printk(KERN_INFO "Ignoring memory below %luMB\n", min_addr >> 20);
+		printk(KERN_INFO "Ignoring memory below %luMB\n",
+		       min_addr >> 20);
 	if (max_addr != ~0UL)
-		printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
+		printk(KERN_INFO "Ignoring memory above %luMB\n",
+		       max_addr >> 20);
 
 	efi.systab = __va(ia64_boot_param->efi_systab);
 
@@ -464,7 +485,8 @@ efi_init (void)
 	}
 
 	printk(KERN_INFO "EFI v%u.%.02u by %s:",
-	       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
+	       efi.systab->hdr.revision >> 16,
+	       efi.systab->hdr.revision & 0xffff, vendor);
 
 	efi.mps        = EFI_INVALID_TABLE_ADDR;
 	efi.acpi       = EFI_INVALID_TABLE_ADDR;
@@ -519,9 +541,12 @@ efi_init (void)
 		efi_memory_desc_t *md;
 		void *p;
 
-		for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) {
+		for (i = 0, p = efi_map_start; p < efi_map_end;
+		     ++i, p += efi_desc_size)
+		{
 			md = p;
-			printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n",
+			printk("mem%02u: type=%u, attr=0x%lx, "
+			       "range=[0x%016lx-0x%016lx) (%luMB)\n",
 			       i, md->type, md->attribute, md->phys_addr,
 			       md->phys_addr + efi_md_size(md),
 			       md->num_pages >> (20 - EFI_PAGE_SHIFT));
@@ -549,8 +574,8 @@ efi_enter_virtual_mode (void)
 		md = p;
 		if (md->attribute & EFI_MEMORY_RUNTIME) {
 			/*
-			 * Some descriptors have multiple bits set, so the order of
-			 * the tests is relevant.
+			 * Some descriptors have multiple bits set, so the
+			 * order of the tests is relevant.
 			 */
 			if (md->attribute & EFI_MEMORY_WB) {
 				md->virt_addr = (u64) __va(md->phys_addr);
@@ -558,21 +583,26 @@ efi_enter_virtual_mode (void)
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
 			} else if (md->attribute & EFI_MEMORY_WC) {
 #if 0
-				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
-									   | _PAGE_D
-									   | _PAGE_MA_WC
-									   | _PAGE_PL_0
-									   | _PAGE_AR_RW));
+				md->virt_addr = ia64_remap(md->phys_addr,
+							   (_PAGE_A |
+							    _PAGE_P |
+							    _PAGE_D |
+							    _PAGE_MA_WC |
+							    _PAGE_PL_0 |
+							    _PAGE_AR_RW));
 #else
 				printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
 #endif
 			} else if (md->attribute & EFI_MEMORY_WT) {
 #if 0
-				md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
-									   | _PAGE_D | _PAGE_MA_WT
-									   | _PAGE_PL_0
-									   | _PAGE_AR_RW));
+				md->virt_addr = ia64_remap(md->phys_addr,
+							   (_PAGE_A |
+							    _PAGE_P |
+							    _PAGE_D |
+							    _PAGE_MA_WT |
+							    _PAGE_PL_0 |
+							    _PAGE_AR_RW));
 #else
 				printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
 				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
@@ -583,16 +613,18 @@ efi_enter_virtual_mode (void)
 
 	status = efi_call_phys(__va(runtime->set_virtual_address_map),
 			       ia64_boot_param->efi_memmap_size,
-			       efi_desc_size, ia64_boot_param->efi_memdesc_version,
+			       efi_desc_size,
+			       ia64_boot_param->efi_memdesc_version,
 			       ia64_boot_param->efi_memmap);
 	if (status != EFI_SUCCESS) {
-		printk(KERN_WARNING "warning: unable to switch EFI into virtual mode "
-		       "(status=%lu)\n", status);
+		printk(KERN_WARNING "warning: unable to switch EFI into "
+		       "virtual mode (status=%lu)\n", status);
 		return;
 	}
 
 	/*
-	 * Now that EFI is in virtual mode, we call the EFI functions more efficiently:
+	 * Now that EFI is in virtual mode, we call the EFI functions more
+	 * efficiently:
 	 */
 	efi.get_time = virt_get_time;
 	efi.set_time = virt_set_time;
@@ -606,8 +638,8 @@ efi_enter_virtual_mode (void)
 }
 
 /*
- * Walk the EFI memory map looking for the I/O port range.  There can only be one entry of
- * this type, other I/O port ranges should be described via ACPI.
+ * Walk the EFI memory map looking for the I/O port range.  There can only be
+ * one entry of this type, other I/O port ranges should be described via ACPI.
  */
 u64
 efi_get_iobase (void)
@@ -678,7 +710,6 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
 
 	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
 		md = p;
-
 		if (md->phys_addr < end && efi_md_end(md) > phys_addr)
 			return 1;
 	}
@@ -883,7 +914,7 @@ efi_uart_console_only(void)
 				return 1;
 			uart = 0;
 		}
-		hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length);
+		hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length);
 	}
 	printk(KERN_ERR "Malformed %s value\n", name);
 	return 0;
@@ -921,10 +952,12 @@ find_memmap_space (void)
 		if (!efi_wb(md)) {
 			continue;
 		}
-		if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) {
+		if (pmd == NULL || !efi_wb(pmd) ||
+		    efi_md_end(pmd) != md->phys_addr) {
 			contig_low = GRANULEROUNDUP(md->phys_addr);
 			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) {
+			for (q = p + efi_desc_size; q < efi_map_end;
+			     q += efi_desc_size) {
 				check_md = q;
 				if (!efi_wb(check_md))
 					break;
@@ -988,8 +1021,9 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
 		md = p;
 		if (!efi_wb(md)) {
-			if (efi_uc(md) && (md->type == EFI_CONVENTIONAL_MEMORY ||
-				    	   md->type == EFI_BOOT_SERVICES_DATA)) {
+			if (efi_uc(md) &&
+			    (md->type == EFI_CONVENTIONAL_MEMORY ||
+			     md->type == EFI_BOOT_SERVICES_DATA)) {
 				k->attribute = EFI_MEMORY_UC;
 				k->start = md->phys_addr;
 				k->num_pages = md->num_pages;
@@ -997,10 +1031,12 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 			}
 			continue;
 		}
-		if (pmd == NULL || !efi_wb(pmd) || efi_md_end(pmd) != md->phys_addr) {
+		if (pmd == NULL || !efi_wb(pmd) ||
+		    efi_md_end(pmd) != md->phys_addr) {
 			contig_low = GRANULEROUNDUP(md->phys_addr);
 			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end; q += efi_desc_size) {
+			for (q = p + efi_desc_size; q < efi_map_end;
+			     q += efi_desc_size) {
 				check_md = q;
 				if (!efi_wb(check_md))
 					break;
@@ -1025,13 +1061,17 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 		if (md->phys_addr < contig_low) {
 			lim = min(efi_md_end(md), contig_low);
 			if (efi_uc(md)) {
-				if (k > kern_memmap && (k-1)->attribute == EFI_MEMORY_UC &&
+				if (k > kern_memmap &&
+				    (k-1)->attribute == EFI_MEMORY_UC &&
 				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages += (lim - md->phys_addr) >> EFI_PAGE_SHIFT;
+					(k-1)->num_pages +=
+						(lim - md->phys_addr)
+						>> EFI_PAGE_SHIFT;
 				} else {
 					k->attribute = EFI_MEMORY_UC;
 					k->start = md->phys_addr;
-					k->num_pages = (lim - md->phys_addr) >> EFI_PAGE_SHIFT;
+					k->num_pages = (lim - md->phys_addr)
+						>> EFI_PAGE_SHIFT;
 					k++;
 				}
 			}
@@ -1049,7 +1089,8 @@ efi_memmap_init(unsigned long *s, unsigned long *e)
 				} else {
 					k->attribute = EFI_MEMORY_UC;
 					k->start = lim;
-					k->num_pages = (efi_md_end(md) - lim) >> EFI_PAGE_SHIFT;
+					k->num_pages = (efi_md_end(md) - lim)
+						>> EFI_PAGE_SHIFT;
 					k++;
 				}
 			}
@@ -1151,8 +1192,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 				break;
 		}
 
-		if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL) {
-			printk(KERN_ERR "failed to alocate resource for iomem\n");
+		if ((res = kzalloc(sizeof(struct resource),
+				   GFP_KERNEL)) == NULL) {
+			printk(KERN_ERR
+			       "failed to alocate resource for iomem\n");
 			return;
 		}
 
@@ -1187,44 +1230,44 @@ efi_initialize_iomem_resources(struct resource *code_resource,
    rsvd_regions are sorted
  */
 unsigned long __init
-kdump_find_rsvd_region (unsigned long size,
-		struct rsvd_region *r, int n)
+kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n)
 {
-  int i;
-  u64 start, end;
-  u64 alignment = 1UL << _PAGE_SIZE_64M;
-  void *efi_map_start, *efi_map_end, *p;
-  efi_memory_desc_t *md;
-  u64 efi_desc_size;
-
-  efi_map_start = __va(ia64_boot_param->efi_memmap);
-  efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-  efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-  for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-	  md = p;
-	  if (!efi_wb(md))
-		  continue;
-	  start = ALIGN(md->phys_addr, alignment);
-	  end = efi_md_end(md);
-	  for (i = 0; i < n; i++) {
-		if (__pa(r[i].start) >= start && __pa(r[i].end) < end) {
-			if (__pa(r[i].start) > start + size)
-				return start;
-			start = ALIGN(__pa(r[i].end), alignment);
-			if (i < n-1 && __pa(r[i+1].start) < start + size)
-				continue;
-			else
-				break;
+	int i;
+	u64 start, end;
+	u64 alignment = 1UL << _PAGE_SIZE_64M;
+	void *efi_map_start, *efi_map_end, *p;
+	efi_memory_desc_t *md;
+	u64 efi_desc_size;
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		md = p;
+		if (!efi_wb(md))
+			continue;
+		start = ALIGN(md->phys_addr, alignment);
+		end = efi_md_end(md);
+		for (i = 0; i < n; i++) {
+			if (__pa(r[i].start) >= start && __pa(r[i].end) < end) {
+				if (__pa(r[i].start) > start + size)
+					return start;
+				start = ALIGN(__pa(r[i].end), alignment);
+				if (i < n-1 &&
+				    __pa(r[i+1].start) < start + size)
+					continue;
+				else
+					break;
+			}
 		}
-	  }
-	  if (end > start + size)
-		return start;
-  }
-
-  printk(KERN_WARNING "Cannot reserve 0x%lx byte of memory for crashdump\n",
-	size);
-  return ~0UL;
+		if (end > start + size)
+			return start;
+	}
+
+	printk(KERN_WARNING
+	       "Cannot reserve 0x%lx byte of memory for crashdump\n", size);
+	return ~0UL;
 }
 #endif
 
-- 
cgit v1.1


From 965e7c8affeca27f7e5de75c97954e74d3b8052d Mon Sep 17 00:00:00 2001
From: Aron Griffis <aron@hp.com>
Date: Tue, 8 Jan 2008 22:29:37 -0500
Subject: [IA64] efi.c Spelling/punctuation fixes

Incorporates the suggestions from Peter Chubb the last time I submitted
this.  This called for using the same verb tense in the couple of preceding
comments as well.

Signed-off-by: Aron Griffis <aron@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/efi.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 9e59109..269f4f4 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -311,7 +311,7 @@ walk (efi_freemem_callback_t callback, void *arg, u64 attr)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory
+ * Walk the EFI memory map and call CALLBACK once for each EFI memory
  * descriptor that has memory that is available for OS use.
  */
 void
@@ -321,7 +321,7 @@ efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
 }
 
 /*
- * Walks the EFI memory map and calls CALLBACK once for each EFI memory
+ * Walk the EFI memory map and call CALLBACK once for each EFI memory
  * descriptor that has memory that is available for uncached allocator.
  */
 void
@@ -331,7 +331,7 @@ efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
 }
 
 /*
- * Look for the PAL_CODE region reported by EFI and maps it using an
+ * Look for the PAL_CODE region reported by EFI and map it using an
  * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
  * Abstraction Layer chapter 11 in ADAG
  */
@@ -385,7 +385,7 @@ efi_get_pal_addr (void)
 		}
 
 		if (efi_md_size(md) > IA64_GRANULE_SIZE)
-			panic("Woah!  PAL code size bigger than a granule!");
+			panic("Whoa!  PAL code size bigger than a granule!");
 
 #if EFI_DEBUG
 		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
@@ -435,7 +435,7 @@ efi_init (void)
 	int i;
 
 	/*
-	 * it's too early to be able to use the standard kernel command line
+	 * It's too early to be able to use the standard kernel command line
 	 * support...
 	 */
 	for (cp = boot_command_line; *cp; ) {
@@ -465,9 +465,9 @@ efi_init (void)
 	 * Verify the EFI Table
 	 */
 	if (efi.systab == NULL)
-		panic("Woah! Can't find EFI system table.\n");
+		panic("Whoa! Can't find EFI system table.\n");
 	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-		panic("Woah! EFI system table signature incorrect\n");
+		panic("Whoa! EFI system table signature incorrect\n");
 	if ((efi.systab->hdr.revision >> 16) == 0)
 		printk(KERN_WARNING "Warning: EFI system table version "
 		       "%d.%02d, expected 1.00 or greater\n",
@@ -1195,7 +1195,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 		if ((res = kzalloc(sizeof(struct resource),
 				   GFP_KERNEL)) == NULL) {
 			printk(KERN_ERR
-			       "failed to alocate resource for iomem\n");
+			       "failed to allocate resource for iomem\n");
 			return;
 		}
 
-- 
cgit v1.1


From 410ab512e5c5716287a399145df0905c1dcddb04 Mon Sep 17 00:00:00 2001
From: Aron Griffis <aron@hp.com>
Date: Tue, 8 Jan 2008 22:29:38 -0500
Subject: [IA64] efi.c Add /* never reached */ annotation

As written, this loop could be for (;;) instead of do while (md).  The tests
inside the loop always result in a return so the loop never terminates normally.

Signed-off-by: Aron Griffis <aron@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/efi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 269f4f4..d59134d 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -762,7 +762,7 @@ efi_mem_attribute (unsigned long phys_addr, unsigned long size)
 		if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr)
 			return 0;
 	} while (md);
-	return 0;
+	return 0;	/* never reached */
 }
 
 u64
@@ -798,7 +798,7 @@ kern_mem_attribute (unsigned long phys_addr, unsigned long size)
 		if (!md || md->attribute != attr)
 			return 0;
 	} while (md);
-	return 0;
+	return 0;	/* never reached */
 }
 EXPORT_SYMBOL(kern_mem_attribute);
 
-- 
cgit v1.1


From acffc84ad9c1c8a79b8c1d6b9f4f530f37cba9ab Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 30 Jan 2008 12:17:42 +0900
Subject: [IA64] generalize attribute of fsyscall_gtod_data

In an ordinary way,

> } __attribute__ ((aligned (L1_CACHE_BYTES)));

should be

> } ____cacheline_aligned;

to save some bytes on an uni-processor.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/fsyscall_gtod_data.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h
index 490dab5..57d2ee6 100644
--- a/arch/ia64/kernel/fsyscall_gtod_data.h
+++ b/arch/ia64/kernel/fsyscall_gtod_data.h
@@ -14,10 +14,10 @@ struct fsyscall_gtod_data_t {
 	u32		clk_shift;
 	void		*clk_fsys_mmio;
 	cycle_t		clk_cycle_last;
-} __attribute__ ((aligned (L1_CACHE_BYTES)));
+} ____cacheline_aligned;
 
 struct itc_jitter_data_t {
 	int		itc_jitter;
 	cycle_t		itc_lastcycle;
-} __attribute__ ((aligned (L1_CACHE_BYTES)));
+} ____cacheline_aligned;
 
-- 
cgit v1.1


From 0245583ab454ba56131921e26ad966b56bcc0a5b Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Thu, 10 Jan 2008 20:31:51 -0800
Subject: [IA64] sn_hwperf semaphore to mutex

Really simple mutex style semaphore user. The new API is struct mutex which is
what I've converted it to with this change.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index ca88274..4b0d153 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -33,6 +33,7 @@
 #include <linux/smp_lock.h>
 #include <linux/nodemask.h>
 #include <linux/smp.h>
+#include <linux/mutex.h>
 
 #include <asm/processor.h>
 #include <asm/topology.h>
@@ -50,7 +51,7 @@ static void *sn_hwperf_salheap = NULL;
 static int sn_hwperf_obj_cnt = 0;
 static nasid_t sn_hwperf_master_nasid = INVALID_NASID;
 static int sn_hwperf_init(void);
-static DECLARE_MUTEX(sn_hwperf_init_mutex);
+static DEFINE_MUTEX(sn_hwperf_init_mutex);
 
 #define cnode_possible(n)	((n) < num_cnodes)
 
@@ -884,10 +885,10 @@ static int sn_hwperf_init(void)
 	int e = 0;
 
 	/* single threaded, once-only initialization */
-	down(&sn_hwperf_init_mutex);
+	mutex_lock(&sn_hwperf_init_mutex);
 
 	if (sn_hwperf_salheap) {
-		up(&sn_hwperf_init_mutex);
+		mutex_unlock(&sn_hwperf_init_mutex);
 		return e;
 	}
 
@@ -936,7 +937,7 @@ out:
 		sn_hwperf_salheap = NULL;
 		sn_hwperf_obj_cnt = 0;
 	}
-	up(&sn_hwperf_init_mutex);
+	mutex_unlock(&sn_hwperf_init_mutex);
 	return e;
 }
 
-- 
cgit v1.1


From fe77efb8b7e80128b914044c175d5dcd75e9fff7 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Mon, 7 Jan 2008 10:11:57 +0900
Subject: [IA64] mca style cleanup

Unified changelog, 80 columns rule, and address form fix.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/mca.c         | 66 +++++++++++++++++++++++-------------------
 arch/ia64/kernel/mca_asm.S     | 46 +++++++++++++++--------------
 arch/ia64/kernel/mca_drv.c     |  2 +-
 arch/ia64/kernel/mca_drv.h     |  2 +-
 arch/ia64/kernel/mca_drv_asm.S |  2 +-
 include/asm-ia64/mca.h         |  6 ++--
 include/asm-ia64/mca_asm.h     |  3 +-
 7 files changed, 70 insertions(+), 57 deletions(-)

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 6dbf591..846e7e0 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -2,61 +2,69 @@
  * File:	mca.c
  * Purpose:	Generic MCA handling layer
  *
- * Updated for latest kernel
  * Copyright (C) 2003 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  *
  * Copyright (C) 2002 Dell Inc.
- * Copyright (C) Matt Domsch (Matt_Domsch@dell.com)
+ * Copyright (C) Matt Domsch <Matt_Domsch@dell.com>
  *
  * Copyright (C) 2002 Intel
- * Copyright (C) Jenna Hall (jenna.s.hall@intel.com)
+ * Copyright (C) Jenna Hall <jenna.s.hall@intel.com>
  *
  * Copyright (C) 2001 Intel
- * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com)
+ * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com>
  *
  * Copyright (C) 2000 Intel
- * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com)
+ * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com>
  *
  * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander(vijay@engr.sgi.com)
+ * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
  *
- * 03/04/15 D. Mosberger Added INIT backtrace support.
- * 02/03/25 M. Domsch	GUID cleanups
+ * Copyright (C) 2006 FUJITSU LIMITED
+ * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  *
- * 02/01/04 J. Hall	Aligned MCA stack to 16 bytes, added platform vs. CPU
- *			error flag, set SAL default return values, changed
- *			error record structure to linked list, added init call
- *			to sal_get_state_info_size().
+ * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
+ *	      Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
+ *	      added min save state dump, added INIT handler.
  *
- * 01/01/03 F. Lewis    Added setup of CMCI and CPEI IRQs, logging of corrected
- *                      platform errors, completed code for logging of
- *                      corrected & uncorrected machine check errors, and
- *                      updated for conformance with Nov. 2000 revision of the
- *                      SAL 3.0 spec.
- * 00/03/29 C. Fleckenstein  Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
- *                           added min save state dump, added INIT handler.
+ * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com>
+ *	      Added setup of CMCI and CPEI IRQs, logging of corrected platform
+ *	      errors, completed code for logging of corrected & uncorrected
+ *	      machine check errors, and updated for conformance with Nov. 2000
+ *	      revision of the SAL 3.0 spec.
+ *
+ * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com>
+ *	      Aligned MCA stack to 16 bytes, added platform vs. CPU error flag,
+ *	      set SAL default return values, changed error record structure to
+ *	      linked list, added init call to sal_get_state_info_size().
+ *
+ * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com>
+ *	      GUID cleanups.
+ *
+ * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com>
+ *	      Added INIT backtrace support.
  *
  * 2003-12-08 Keith Owens <kaos@sgi.com>
- *            smp_call_function() must not be called from interrupt context (can
- *            deadlock on tasklist_lock).  Use keventd to call smp_call_function().
+ *	      smp_call_function() must not be called from interrupt context
+ *	      (can deadlock on tasklist_lock).
+ *	      Use keventd to call smp_call_function().
  *
  * 2004-02-01 Keith Owens <kaos@sgi.com>
- *            Avoid deadlock when using printk() for MCA and INIT records.
- *            Delete all record printing code, moved to salinfo_decode in user space.
- *            Mark variables and functions static where possible.
- *            Delete dead variables and functions.
- *            Reorder to remove the need for forward declarations and to consolidate
- *            related code.
+ *	      Avoid deadlock when using printk() for MCA and INIT records.
+ *	      Delete all record printing code, moved to salinfo_decode in user
+ *	      space.  Mark variables and functions static where possible.
+ *	      Delete dead variables and functions.  Reorder to remove the need
+ *	      for forward declarations and to consolidate related code.
  *
  * 2005-08-12 Keith Owens <kaos@sgi.com>
- *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS state.
+ *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS
+ *	      state.
  *
  * 2005-10-07 Keith Owens <kaos@sgi.com>
  *	      Add notify_die() hooks.
  *
  * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- * 	      Add printing support for MCA/INIT.
+ *	      Add printing support for MCA/INIT.
  *
  * 2007-04-27 Russ Anderson <rja@sgi.com>
  *	      Support multiple cpus going through OS_MCA in the same event.
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
index 0f5965f..8bc7d25 100644
--- a/arch/ia64/kernel/mca_asm.S
+++ b/arch/ia64/kernel/mca_asm.S
@@ -1,24 +1,28 @@
-//
-// assembly portion of the IA64 MCA handling
-//
-// Mods by cfleck to integrate into kernel build
-// 00/03/15 davidm Added various stop bits to get a clean compile
-//
-// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp
-//		   kstack, switch modes, jump to C INIT handler
-//
-// 02/01/04 J.Hall <jenna.s.hall@intel.com>
-//		   Before entering virtual mode code:
-//		   1. Check for TLB CPU error
-//		   2. Restore current thread pointer to kr6
-//		   3. Move stack ptr 16 bytes to conform to C calling convention
-//
-// 04/11/12 Russ Anderson <rja@sgi.com>
-//		   Added per cpu MCA/INIT stack save areas.
-//
-// 12/08/05 Keith Owens <kaos@sgi.com>
-//		   Use per cpu MCA/INIT stacks for all data.
-//
+/*
+ * File:	mca_asm.S
+ * Purpose:	assembly portion of the IA64 MCA handling
+ *
+ * Mods by cfleck to integrate into kernel build
+ *
+ * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com>
+ *		Added various stop bits to get a clean compile
+ *
+ * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
+ *		Added code to save INIT handoff state in pt_regs format,
+ *		switch to temp kstack, switch modes, jump to C INIT handler
+ *
+ * 2002-01-04 J.Hall <jenna.s.hall@intel.com>
+ *		Before entering virtual mode code:
+ *		 1. Check for TLB CPU error
+ *		 2. Restore current thread pointer to kr6
+ *		 3. Move stack ptr 16 bytes to conform to C calling convention
+ *
+ * 2004-11-12 Russ Anderson <rja@sgi.com>
+ *		Added per cpu MCA/INIT stack save areas.
+ *
+ * 2005-12-08 Keith Owens <kaos@sgi.com>
+ *		Use per cpu MCA/INIT stacks for all data.
+ */
 #include <linux/threads.h>
 
 #include <asm/asmmacro.h>
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
index aba813c2..fab1d21 100644
--- a/arch/ia64/kernel/mca_drv.c
+++ b/arch/ia64/kernel/mca_drv.c
@@ -3,7 +3,7 @@
  * Purpose:	Generic MCA handling layer
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  * Copyright (C) 2005 Silicon Graphics, Inc
  * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
  * Copyright (C) 2006 Russ Anderson <rja@sgi.com>
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
index 485e34d..53b8ecb 100644
--- a/arch/ia64/kernel/mca_drv.h
+++ b/arch/ia64/kernel/mca_drv.h
@@ -3,7 +3,7 @@
  * Purpose:	Define helpers for Generic MCA handling
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  */
 /*
  * Processor error section:
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S
index 3bccb06..767ac2c 100644
--- a/arch/ia64/kernel/mca_drv_asm.S
+++ b/arch/ia64/kernel/mca_drv_asm.S
@@ -3,7 +3,7 @@
  * Purpose:     Assembly portion of Generic MCA handling
  *
  * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
  */
 #include <linux/threads.h>
 
diff --git a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h
index 823553b..f1663aa 100644
--- a/include/asm-ia64/mca.h
+++ b/include/asm-ia64/mca.h
@@ -3,9 +3,9 @@
  * Purpose:	Machine check handling specific defines
  *
  * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander (vijay@engr.sgi.com)
- * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com)
- * Copyright (C) Russ Anderson (rja@sgi.com)
+ * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
+ * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
+ * Copyright (C) Russ Anderson <rja@sgi.com>
  */
 
 #ifndef _ASM_IA64_MCA_H
diff --git a/include/asm-ia64/mca_asm.h b/include/asm-ia64/mca_asm.h
index 76203f9..dd2a5b1 100644
--- a/include/asm-ia64/mca_asm.h
+++ b/include/asm-ia64/mca_asm.h
@@ -1,8 +1,9 @@
 /*
  * File:	mca_asm.h
+ * Purpose:	Machine check handling specific defines
  *
  * Copyright (C) 1999 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander (vijay@engr.sgi.com)
+ * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
  * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
  * Copyright (C) 2000 Hewlett-Packard Co.
  * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
-- 
cgit v1.1


From a7d57ecf4216ed29328f8e701bd65ebb66a0284c Mon Sep 17 00:00:00 2001
From: "Zhang, Xiantao" <xiantao.zhang@intel.com>
Date: Mon, 4 Feb 2008 15:46:23 -0800
Subject: [IA64] Export three symbols for module use

Since kvm/module needs to use some unexported functions in kernel,
so export them with this patch.

Signed-off-by: Zhang Xiantao <xiantao.zhang@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/ia64_ksyms.c |  3 +++
 arch/ia64/kernel/sal.c        | 14 ++++++++++++++
 include/asm-ia64/sal.h        | 14 +++-----------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index c3b4412..8e7193d 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -12,6 +12,9 @@ EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(strlen);
 
+#include<asm/pgtable.h>
+EXPORT_SYMBOL_GPL(empty_zero_page);
+
 #include <asm/checksum.h>
 EXPORT_SYMBOL(ip_fast_csum);		/* hand-coded assembly */
 EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
index 27c2ef4..f44fe84 100644
--- a/arch/ia64/kernel/sal.c
+++ b/arch/ia64/kernel/sal.c
@@ -284,6 +284,7 @@ ia64_sal_cache_flush (u64 cache_type)
 	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
 	return isrv.status;
 }
+EXPORT_SYMBOL_GPL(ia64_sal_cache_flush);
 
 void __init
 ia64_sal_init (struct ia64_sal_systab *systab)
@@ -372,3 +373,16 @@ ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc,
 	return 0;
 }
 EXPORT_SYMBOL(ia64_sal_oemcall_reentrant);
+
+long
+ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
+		    unsigned long *drift_info)
+{
+	struct ia64_sal_retval isrv;
+
+	SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
+	*ticks_per_second = isrv.v0;
+	*drift_info = isrv.v1;
+	return isrv.status;
+}
+EXPORT_SYMBOL_GPL(ia64_sal_freq_base);
diff --git a/include/asm-ia64/sal.h b/include/asm-ia64/sal.h
index 1f5412d..2251118 100644
--- a/include/asm-ia64/sal.h
+++ b/include/asm-ia64/sal.h
@@ -649,17 +649,6 @@ typedef struct err_rec {
  * Now define a couple of inline functions for improved type checking
  * and convenience.
  */
-static inline long
-ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
-		    unsigned long *drift_info)
-{
-	struct ia64_sal_retval isrv;
-
-	SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
-	*ticks_per_second = isrv.v0;
-	*drift_info = isrv.v1;
-	return isrv.status;
-}
 
 extern s64 ia64_sal_cache_flush (u64 cache_type);
 extern void __init check_sal_cache_flush (void);
@@ -841,6 +830,9 @@ extern int ia64_sal_oemcall_nolock(struct ia64_sal_retval *, u64, u64, u64,
 				   u64, u64, u64, u64, u64);
 extern int ia64_sal_oemcall_reentrant(struct ia64_sal_retval *, u64, u64, u64,
 				      u64, u64, u64, u64, u64);
+extern long
+ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
+		    unsigned long *drift_info);
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * System Abstraction Layer Specification
-- 
cgit v1.1


From f00c2d36bf6d7efece79713930763d9a0460283e Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 31 Jan 2008 17:46:09 +0800
Subject: [IA64] ia64_set_psr should use srlz.i

The only in kernel use of ia64_set_psr() needs to follow
it with a srlz.i (since it is changing state for PSR.ic).
So it is pointless to issue srlz.d inside this function.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/efi.c       | 1 -
 include/asm-ia64/processor.h | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index d59134d..919070a 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -421,7 +421,6 @@ efi_map_pal_code (void)
 		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
 		 IA64_GRANULE_SHIFT);
 	ia64_set_psr(psr);		/* restore psr */
-	ia64_srlz_i();
 }
 
 void __init
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index be3b0ae..038642f 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -472,7 +472,7 @@ ia64_set_psr (__u64 psr)
 {
 	ia64_stop();
 	ia64_setreg(_IA64_REG_PSR_L, psr);
-	ia64_srlz_d();
+	ia64_srlz_i();
 }
 
 /*
-- 
cgit v1.1


From 920ed9f194effd10c273a4040114d3d8e6eed95f Mon Sep 17 00:00:00 2001
From: Zhang Xiantao <xiantao.zhang@intel.com>
Date: Thu, 31 Jan 2008 12:03:39 +0800
Subject: [IA64] Appoint kvm/ia64 Maintainers

Anthony and Xiantao are working on KVM for ia64.

Signed-off-by Anthony Xu <anthony.xu@intel.com>
Signed-off-by Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 MAINTAINERS | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index da30a72..263ceae 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2249,6 +2249,15 @@ L:	kvm-devel@lists.sourceforge.net
 W:	kvm.sourceforge.net
 S:	Supported
 
+KERNEL VIRTUAL MACHINE For Itanium(KVM/IA64)
+P:	Anthony Xu
+M:	anthony.xu@intel.com
+P:	Xiantao Zhang
+M:	xiantao.zhang@intel.com
+L:	kvm-ia64-devel@lists.sourceforge.net
+W:	kvm.sourceforge.net
+S:	Supported
+
 KEXEC
 P:	Eric Biederman
 M:	ebiederm@xmission.com
-- 
cgit v1.1


From e8f9b2ed9882874ca96716597bd8c7113289e77b Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 4 Feb 2008 20:20:41 -0800
Subject: mlx4_core: Fix more section mismatches

Commit 3d73c288 ("mlx4_core: Fix section mismatches") fixed some of
the section mismatches introduced when error recovery was added, but
there were still more cases of errory recovery code calling into
__devinit code from regular .text.  Fix this by getting rid of the
now-incorrect __devinit annotations.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/main.c | 8 ++++----
 drivers/net/mlx4/mr.c   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 89b3f0b..a028d8a 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -163,7 +163,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	return 0;
 }
 
-static int __devinit mlx4_load_fw(struct mlx4_dev *dev)
+static int mlx4_load_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
@@ -197,8 +197,8 @@ err_free:
 	return err;
 }
 
-static int __devinit mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
-					  int cmpt_entry_sz)
+static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+				int cmpt_entry_sz)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
@@ -688,7 +688,7 @@ err_uar_table_free:
 	return err;
 }
 
-static void __devinit mlx4_enable_msi_x(struct mlx4_dev *dev)
+static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct msix_entry entries[MLX4_NUM_EQ];
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index 0c05a10..9c9e308 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -122,7 +122,7 @@ static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
 	spin_unlock(&buddy->lock);
 }
 
-static int __devinit mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
 {
 	int i, s;
 
-- 
cgit v1.1


From 3971c9f6dbf26f077b929dbe14ced60a697ebcf0 Mon Sep 17 00:00:00 2001
From: Sean Hefty <sean.hefty@intel.com>
Date: Mon, 10 Dec 2007 15:53:25 -0800
Subject: IB/cm: Add interim support for routed paths

Paths with hop_limit > 1 indicate that the connection will be routed
between IB subnets.  Update the subnet local field in the CM REQ based
on the hop_limit value.  In addition, if the path is routed, then set
the LIDs in the REQ to the permissive LIDs.  This is used to indicate
to the passive side that it should use the LIDs in the received local
route header (LRH) associated with the REQ when programming the QP.

This is a temporary work-around to the IB CM to support IB router
development until the IB router specification is completed.  It is not
anticipated that this work-around will cause any interoperability
issues with existing stacks or future stacks that will properly
support IB routers when defined.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/core/cm.c | 89 ++++++++++++++++++++++++++++++++------------
 1 file changed, 66 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index c015014..638b727 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -974,6 +974,9 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 			  struct cm_id_private *cm_id_priv,
 			  struct ib_cm_req_param *param)
 {
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+
 	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
 			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
 
@@ -997,35 +1000,46 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
 	cm_req_set_srq(req_msg, param->srq);
 
-	req_msg->primary_local_lid = param->primary_path->slid;
-	req_msg->primary_remote_lid = param->primary_path->dlid;
-	req_msg->primary_local_gid = param->primary_path->sgid;
-	req_msg->primary_remote_gid = param->primary_path->dgid;
-	cm_req_set_primary_flow_label(req_msg, param->primary_path->flow_label);
-	cm_req_set_primary_packet_rate(req_msg, param->primary_path->rate);
-	req_msg->primary_traffic_class = param->primary_path->traffic_class;
-	req_msg->primary_hop_limit = param->primary_path->hop_limit;
-	cm_req_set_primary_sl(req_msg, param->primary_path->sl);
-	cm_req_set_primary_subnet_local(req_msg, 1); /* local only... */
+	if (pri_path->hop_limit <= 1) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
 	cm_req_set_primary_local_ack_timeout(req_msg,
 		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-			       param->primary_path->packet_life_time));
+			       pri_path->packet_life_time));
 
-	if (param->alternate_path) {
-		req_msg->alt_local_lid = param->alternate_path->slid;
-		req_msg->alt_remote_lid = param->alternate_path->dlid;
-		req_msg->alt_local_gid = param->alternate_path->sgid;
-		req_msg->alt_remote_gid = param->alternate_path->dgid;
+	if (alt_path) {
+		if (alt_path->hop_limit <= 1) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
 		cm_req_set_alt_flow_label(req_msg,
-					  param->alternate_path->flow_label);
-		cm_req_set_alt_packet_rate(req_msg, param->alternate_path->rate);
-		req_msg->alt_traffic_class = param->alternate_path->traffic_class;
-		req_msg->alt_hop_limit = param->alternate_path->hop_limit;
-		cm_req_set_alt_sl(req_msg, param->alternate_path->sl);
-		cm_req_set_alt_subnet_local(req_msg, 1); /* local only... */
+					  alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
 		cm_req_set_alt_local_ack_timeout(req_msg,
 			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-				       param->alternate_path->packet_life_time));
+				       alt_path->packet_life_time));
 	}
 
 	if (param->private_data && param->private_data_len)
@@ -1441,6 +1455,34 @@ out:
 	return listen_cm_id_priv;
 }
 
+/*
+ * Work-around for inter-subnet connections.  If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	if (!cm_req_get_primary_subnet_local(req_msg)) {
+		if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_primary_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+
+	if (!cm_req_get_alt_subnet_local(req_msg)) {
+		if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_alt_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+}
+
 static int cm_req_handler(struct cm_work *work)
 {
 	struct ib_cm_id *cm_id;
@@ -1481,6 +1523,7 @@ static int cm_req_handler(struct cm_work *work)
 	cm_id_priv->id.service_id = req_msg->service_id;
 	cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL);
 
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
 	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
 	if (ret) {
-- 
cgit v1.1


From 2b7274c39228d7a8c81a411dc3969763e9069c56 Mon Sep 17 00:00:00 2001
From: Joachim Fenkes <fenkes@de.ibm.com>
Date: Thu, 24 Jan 2008 17:59:08 +0100
Subject: IB/ehca: Prevent sending UD packets to QP0

The IB spec doesn't allow packets to QP0 sent on any other VL than VL15.
Hardware doesn't filter those packets on the send side, so we need to do
this in the driver and firmware.

As eHCA doesn't support QP0, we can just filter out all traffic going to
QP0, regardless of SL or VL.

Signed-off-by: Joachim Fenkes <fenkes@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_reqs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index 3aacc8c..2ce8cff 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -209,6 +209,10 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 			ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
 			return -EINVAL;
 		}
+		if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+			ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+			return -EINVAL;
+		}
 		my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
 		wqe_p->u.ud_av.ud_av = my_av->av;
 
-- 
cgit v1.1


From 528b03f73247c30750b740dcad16ad1914e56e89 Mon Sep 17 00:00:00 2001
From: Joachim Fenkes <fenkes@de.ibm.com>
Date: Fri, 25 Jan 2008 21:12:39 +0100
Subject: IB/ehca: Update sma_attr also in case of disruptive config change

Signed-off-by: Joachim Fenkes <fenkes@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_irq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 863b34f..b5ca94c 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -403,6 +403,8 @@ static void parse_ec(struct ehca_shca *shca, u64 eqe)
 			sport->port_state = IB_PORT_ACTIVE;
 			dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
 					    "is active");
+			ehca_query_sma_attr(shca, port,
+					    &sport->saved_attr);
 		} else
 			notify_port_conf_change(shca, port);
 		break;
-- 
cgit v1.1


From 2b5e6b120e58d44cace68e6c7204b541a8b0b43f Mon Sep 17 00:00:00 2001
From: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
Date: Fri, 25 Jan 2008 21:18:27 +0100
Subject: IB/ehca: Add PMA support

This patch enables ehca to redirect any PMA queries to the
actual PMA QP.

Signed-off-by: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
Reviewed-by: Joachim Fenkes <fenkes@de.ibm.com>
Reviewed-by: Christoph Raisch <raisch@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_classes.h |  1 +
 drivers/infiniband/hw/ehca/ehca_iverbs.h  |  5 ++
 drivers/infiniband/hw/ehca/ehca_main.c    |  2 +-
 drivers/infiniband/hw/ehca/ehca_sqp.c     | 91 +++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index f281d16..92cce8a 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -101,6 +101,7 @@ struct ehca_sport {
 	spinlock_t mod_sqp_lock;
 	enum ib_port_state port_state;
 	struct ehca_sma_attr saved_attr;
+	u32 pma_qp_nr;
 };
 
 #define HCA_CAP_MR_PGSIZE_4K  0x80000000
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
index c469bfd..a8a2ea5 100644
--- a/drivers/infiniband/hw/ehca/ehca_iverbs.h
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -187,6 +187,11 @@ int ehca_dealloc_ucontext(struct ib_ucontext *context);
 
 int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		     struct ib_wc *in_wc, struct ib_grh *in_grh,
+		     struct ib_mad *in_mad,
+		     struct ib_mad *out_mad);
+
 void ehca_poll_eqs(unsigned long data);
 
 int ehca_calc_ipd(struct ehca_shca *shca, int port,
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 84c9b7b..a86ebcc 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -472,7 +472,7 @@ int ehca_init_device(struct ehca_shca *shca)
 	shca->ib_device.dealloc_fmr	    = ehca_dealloc_fmr;
 	shca->ib_device.attach_mcast	    = ehca_attach_mcast;
 	shca->ib_device.detach_mcast	    = ehca_detach_mcast;
-	/* shca->ib_device.process_mad	    = ehca_process_mad;	    */
+	shca->ib_device.process_mad	    = ehca_process_mad;
 	shca->ib_device.mmap		    = ehca_mmap;
 
 	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
index 79e72b2..706d97a 100644
--- a/drivers/infiniband/hw/ehca/ehca_sqp.c
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -39,12 +39,18 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <rdma/ib_mad.h>
 
 #include "ehca_classes.h"
 #include "ehca_tools.h"
 #include "ehca_iverbs.h"
 #include "hcp_if.h"
 
+#define IB_MAD_STATUS_REDIRECT		__constant_htons(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION	__constant_htons(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD	__constant_htons(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO		__constant_htons(0x0001)
 
 /**
  * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
@@ -83,6 +89,9 @@ u64 ehca_define_sqp(struct ehca_shca *shca,
 				 port, ret);
 			return ret;
 		}
+		shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+		ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+			 port, pma_qp_nr);
 		break;
 	default:
 		ehca_err(&shca->ib_device, "invalid qp_type=%x",
@@ -109,3 +118,85 @@ u64 ehca_define_sqp(struct ehca_shca *shca,
 
 	return H_SUCCESS;
 }
+
+struct ib_perf {
+	struct ib_mad_hdr mad_hdr;
+	u8 reserved[40];
+	u8 data[192];
+} __attribute__ ((packed));
+
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+			     struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	struct ib_perf *in_perf = (struct ib_perf *)in_mad;
+	struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+	struct ib_class_port_info *poi =
+		(struct ib_class_port_info *)out_perf->data;
+	struct ehca_shca *shca =
+		container_of(ibdev, struct ehca_shca, ib_device);
+	struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+	ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+	*out_mad = *in_mad;
+
+	if (in_perf->mad_hdr.class_version != 1) {
+		ehca_warn(ibdev, "Unsupported class_version=%x",
+			  in_perf->mad_hdr.class_version);
+		out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+		goto perf_reply;
+	}
+
+	switch (in_perf->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+	case IB_MGMT_METHOD_SET:
+		/* set class port info for redirection */
+		out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+		out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+		memset(poi, 0, sizeof(*poi));
+		poi->base_version = 1;
+		poi->class_version = 1;
+		poi->resp_time_value = 18;
+		poi->redirect_lid = sport->saved_attr.lid;
+		poi->redirect_qp = sport->pma_qp_nr;
+		poi->redirect_qkey = IB_QP1_QKEY;
+		poi->redirect_pkey = IB_DEFAULT_PKEY_FULL;
+
+		ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+			 sport->saved_attr.lid, sport->pma_qp_nr);
+		break;
+
+	case IB_MGMT_METHOD_GET_RESP:
+		return IB_MAD_RESULT_FAILURE;
+
+	default:
+		out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+		break;
+	}
+
+perf_reply:
+	out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		     struct ib_wc *in_wc, struct ib_grh *in_grh,
+		     struct ib_mad *in_mad,
+		     struct ib_mad *out_mad)
+{
+	int ret;
+
+	if (!port_num || port_num > ibdev->phys_port_cnt)
+		return IB_MAD_RESULT_FAILURE;
+
+	/* accept only pma request */
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+		return IB_MAD_RESULT_SUCCESS;
+
+	ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+	ret = ehca_process_perf(ibdev, port_num, in_mad, out_mad);
+
+	return ret;
+}
-- 
cgit v1.1


From 0d89fe2c0ca12ad2ee4e35a0661319746af6e94a Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 4 Feb 2008 20:20:42 -0800
Subject: IB/mthca: Fix and simplify page size calculation in
 mthca_reg_phys_mr()

In mthca_reg_phys_mr(), we calculate the page size for the HCA
hardware to use to map the buffer list passed in by the consumer.
For example, if the consumer passes in

    [0] addr 0x1000, size 0x1000
    [1] addr 0x2000, size 0x1000

then the algorithm would come up with a page size of 0x2000 and a list
of two pages, at 0x0000 and 0x2000.  Usually, this would work fine
since the memory region would start at an offset of 0x1000 and have a
length of 0x2000.

However, the old code did not take into account the alignment of the
IO virtual address passed in.  For example, if the consumer passed in
a virtual address of 0x6000 for the above, then the offset of 0x1000
would not be used correctly because the page mask of 0x1fff would
result in an offset of 0.

We can fix this quite neatly by making sure that the page shift we use
is no bigger than the first bit where the start of the first buffer
and the IO virtual address differ.  Also, we can further simplify the
code by removing the special case for a single buffer by noticing that
it doesn't matter if we use a page size that is too big.  This allows
the loop to compute the page shift to be replaced with __ffs().

Thanks to Bryan S Rosenburg <rosnbrg@us.ibm.com> for pointing out the
original bug and suggesting several ways to improve this patch.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_provider.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 6bcde1c..19b7f61 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -923,17 +923,13 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
 	struct mthca_mr *mr;
 	u64 *page_list;
 	u64 total_size;
-	u64 mask;
+	unsigned long mask;
 	int shift;
 	int npages;
 	int err;
 	int i, j, n;
 
-	/* First check that we have enough alignment */
-	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK))
-		return ERR_PTR(-EINVAL);
-
-	mask = 0;
+	mask = buffer_list[0].addr ^ *iova_start;
 	total_size = 0;
 	for (i = 0; i < num_phys_buf; ++i) {
 		if (i != 0)
@@ -947,17 +943,7 @@ static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
 	if (mask & ~PAGE_MASK)
 		return ERR_PTR(-EINVAL);
 
-	/* Find largest page shift we can use to cover buffers */
-	for (shift = PAGE_SHIFT; shift < 31; ++shift)
-		if (num_phys_buf > 1) {
-			if ((1ULL << shift) & mask)
-				break;
-		} else {
-			if (1ULL << shift >=
-			    buffer_list[0].size +
-			    (buffer_list[0].addr & ((1ULL << shift) - 1)))
-				break;
-		}
+	shift = __ffs(mask | 1 << 31);
 
 	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
 	buffer_list[0].addr &= ~0ull << shift;
-- 
cgit v1.1


From bafff9741704959e99fb65a7327c017251019a19 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Thu, 17 Jan 2008 17:03:45 +0200
Subject: IPoIB: Handle bonding failover race for connected neighbours too

Move up the code that checks for a situation where the remote GID
stored in the ipoib_neigh is different than the one present in the
neighbour (handle gratuitous ARP) or that a bonding fail over has
happened but the neighbour still has a pointer to an ipoib_neigh
created by a different device than the current slave.  This will cause
the driver to apply the check also for connected mode neighbours.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index a082466..886a08cc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -680,12 +680,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-		if (ipoib_cm_get(neigh)) {
-			if (ipoib_cm_up(neigh)) {
-				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
-				goto out;
-			}
-		} else if (neigh->ah) {
+		if (neigh->ah)
 			if (unlikely((memcmp(&neigh->dgid.raw,
 					    skb->dst->neighbour->ha + 4,
 					    sizeof(union ib_gid))) ||
@@ -706,6 +701,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				goto out;
 			}
 
+		if (ipoib_cm_get(neigh)) {
+			if (ipoib_cm_up(neigh)) {
+				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+				goto out;
+			}
+		} else if (neigh->ah) {
 			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
 			goto out;
 		}
-- 
cgit v1.1


From 7bc531dd883b955e6198c8e202161f22d2e8c472 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Tue, 29 Jan 2008 12:57:56 +0200
Subject: IPoIB: Remove a misleading debug print

Commit 732a2170 ("IB/ipoib: Bound the net device to the ipoib_neigh
structue") left a misleading debug print (n->dev would be a bond
device only if boding is used).  Clean it up.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 886a08cc..09f5371 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -814,11 +814,9 @@ static void ipoib_neigh_cleanup(struct neighbour *n)
 	struct ipoib_ah *ah = NULL;
 
 	neigh = *to_ipoib_neigh(n);
-	if (neigh) {
+	if (neigh)
 		priv = netdev_priv(neigh->dev);
-		ipoib_dbg(priv, "neigh_destructor for bonding device: %s\n",
-			  n->dev->name);
-	} else
+	else
 		return;
 	ipoib_dbg(priv,
 		  "neigh_cleanup for %06x " IPOIB_GID_FMT "\n",
-- 
cgit v1.1


From 6ccef1de2c1718729dd1c7ee8bd98473519eb3b3 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Sun, 27 Jan 2008 18:13:20 +0200
Subject: IB/mthca: Don't read reserved fields in mthca_QUERY_ADAPTER()

For memfree devices, the firmware QUERY_ADAPTER command does not
return vendor_id, device_id, and revision_id; do not return these
fields in the QUERY_ADAPTER function for memfree devices.

Instead, for memfree devices, initialize the rev_id field of the mthca
device via init_node_data (MAD IFC query), as is done in the
query_device verb implementation.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_cmd.c      | 11 ++++++++---
 drivers/infiniband/hw/mthca/mthca_main.c     |  3 ++-
 drivers/infiniband/hw/mthca/mthca_provider.c |  2 ++
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 6966f94..09a30dd 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -1255,9 +1255,14 @@ int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
 	if (err)
 		goto out;
 
-	MTHCA_GET(adapter->vendor_id, outbox,   QUERY_ADAPTER_VENDOR_ID_OFFSET);
-	MTHCA_GET(adapter->device_id, outbox,   QUERY_ADAPTER_DEVICE_ID_OFFSET);
-	MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
+	if (!mthca_is_memfree(dev)) {
+		MTHCA_GET(adapter->vendor_id, outbox,
+			  QUERY_ADAPTER_VENDOR_ID_OFFSET);
+		MTHCA_GET(adapter->device_id, outbox,
+			  QUERY_ADAPTER_DEVICE_ID_OFFSET);
+		MTHCA_GET(adapter->revision_id, outbox,
+			  QUERY_ADAPTER_REVISION_ID_OFFSET);
+	}
 	MTHCA_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
 
 	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 5cf8250..e3bd71a3 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -735,7 +735,8 @@ static int mthca_init_hca(struct mthca_dev *mdev)
 	}
 
 	mdev->eq_table.inta_pin = adapter.inta_pin;
-	mdev->rev_id            = adapter.revision_id;
+	if (!mthca_is_memfree(mdev))
+		mdev->rev_id = adapter.revision_id;
 	memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id);
 
 	return 0;
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 19b7f61..9e491df 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1256,6 +1256,8 @@ static int mthca_init_node_data(struct mthca_dev *dev)
 		goto out;
 	}
 
+	if (mthca_is_memfree(dev))
+		dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
 	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
 
 out:
-- 
cgit v1.1


From 893da75956ab48545e8732b46e1cf4350bd25f9c Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Sun, 27 Jan 2008 18:13:25 +0200
Subject: mlx4_core: Don't read reserved fields in mlx4_QUERY_ADAPTER()

The firmware QUERY_ADAPTER command does not return vendor_id,
device_id, and revision_id; eliminate these fields from the query.

Initialize the rev_id field of the mlx4 device via init_node_data (MAD
IFC query), as is done in the query_device verb implementation.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/main.c | 1 +
 drivers/net/mlx4/fw.c             | 6 ------
 drivers/net/mlx4/fw.h             | 3 ---
 drivers/net/mlx4/main.c           | 1 -
 4 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d8287d9..d2f50b6 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -468,6 +468,7 @@ static int init_node_data(struct mlx4_ib_dev *dev)
 	if (err)
 		goto out;
 
+	dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
 	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
 
 out:
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 535a446..61dc495 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -617,9 +617,6 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
 	int err;
 
 #define QUERY_ADAPTER_OUT_SIZE             0x100
-#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
-#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
-#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
 #define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
 #define QUERY_ADAPTER_VSD_OFFSET           0x20
 
@@ -633,9 +630,6 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
 	if (err)
 		goto out;
 
-	MLX4_GET(adapter->vendor_id, outbox,   QUERY_ADAPTER_VENDOR_ID_OFFSET);
-	MLX4_GET(adapter->device_id, outbox,   QUERY_ADAPTER_DEVICE_ID_OFFSET);
-	MLX4_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
 	MLX4_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
 
 	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 7e1dd9e..e16dec8 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -99,9 +99,6 @@ struct mlx4_dev_cap {
 };
 
 struct mlx4_adapter {
-	u32  vendor_id;
-	u32  device_id;
-	u32  revision_id;
 	char board_id[MLX4_BOARD_ID_LEN];
 	u8   inta_pin;
 };
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index a028d8a..859617d 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -534,7 +534,6 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 	}
 
 	priv->eq_table.inta_pin = adapter.inta_pin;
-	dev->rev_id		= adapter.revision_id;
 	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
 
 	return 0;
-- 
cgit v1.1


From 9fe4bcf45ece0b0081031edaaa41581c85ef7049 Mon Sep 17 00:00:00 2001
From: David Dillow <dillowda@ornl.gov>
Date: Tue, 8 Jan 2008 17:08:52 -0500
Subject: IB/srp: Retry stale connections

When a host just goes away (crash, power loss, etc.) without tearing
down its IB connections, it can get stale connection errors when it
tries to reconnect to targets upon rebooting.  Retrying the connection
a few times will prevent sysadmins from playing the "which disk(s)
went missing?" game.

This would have made things slightly quicker when tracking down some
of the recent bugs, but it also helps quite a bit when you've got a
large number of targets hanging off a wedged server.

Signed-off-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 53 ++++++++++++++++++++++++++++---------
 drivers/infiniband/ulp/srp/ib_srp.h |  1 +
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 195ce7c..fd4a49f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -204,6 +204,22 @@ out:
 	return ret;
 }
 
+static int srp_new_cm_id(struct srp_target_port *target)
+{
+	struct ib_cm_id *new_cm_id;
+
+	new_cm_id = ib_create_cm_id(target->srp_host->dev->dev,
+				    srp_cm_handler, target);
+	if (IS_ERR(new_cm_id))
+		return PTR_ERR(new_cm_id);
+
+	if (target->cm_id)
+		ib_destroy_cm_id(target->cm_id);
+	target->cm_id = new_cm_id;
+
+	return 0;
+}
+
 static int srp_create_target_ib(struct srp_target_port *target)
 {
 	struct ib_qp_init_attr *init_attr;
@@ -436,6 +452,7 @@ static void srp_remove_work(struct work_struct *work)
 
 static int srp_connect_target(struct srp_target_port *target)
 {
+	int retries = 3;
 	int ret;
 
 	ret = srp_lookup_path(target);
@@ -468,6 +485,21 @@ static int srp_connect_target(struct srp_target_port *target)
 		case SRP_DLID_REDIRECT:
 			break;
 
+		case SRP_STALE_CONN:
+			/* Our current CM id was stale, and is now in timewait.
+			 * Try to reconnect with a new one.
+			 */
+			if (!retries-- || srp_new_cm_id(target)) {
+				shost_printk(KERN_ERR, target->scsi_host, PFX
+					     "giving up on stale connection\n");
+				target->status = -ECONNRESET;
+				return target->status;
+			}
+
+			shost_printk(KERN_ERR, target->scsi_host, PFX
+				     "retrying stale connection\n");
+			break;
+
 		default:
 			return target->status;
 		}
@@ -507,7 +539,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re
 
 static int srp_reconnect_target(struct srp_target_port *target)
 {
-	struct ib_cm_id *new_cm_id;
 	struct ib_qp_attr qp_attr;
 	struct srp_request *req, *tmp;
 	struct ib_wc wc;
@@ -526,14 +557,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
 	 * Now get a new local CM ID so that we avoid confusing the
 	 * target in case things are really fouled up.
 	 */
-	new_cm_id = ib_create_cm_id(target->srp_host->dev->dev,
-				    srp_cm_handler, target);
-	if (IS_ERR(new_cm_id)) {
-		ret = PTR_ERR(new_cm_id);
+	ret = srp_new_cm_id(target);
+	if (ret)
 		goto err;
-	}
-	ib_destroy_cm_id(target->cm_id);
-	target->cm_id = new_cm_id;
 
 	qp_attr.qp_state = IB_QPS_RESET;
 	ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE);
@@ -1171,6 +1197,11 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 		target->status = -ECONNRESET;
 		break;
 
+	case IB_CM_REJ_STALE_CONN:
+		shost_printk(KERN_WARNING, shost, "  REJ reason: stale connection\n");
+		target->status = SRP_STALE_CONN;
+		break;
+
 	default:
 		shost_printk(KERN_WARNING, shost, "  REJ reason 0x%x\n",
 			     event->param.rej_rcvd.reason);
@@ -1862,11 +1893,9 @@ static ssize_t srp_create_target(struct class_device *class_dev,
 	if (ret)
 		goto err;
 
-	target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target);
-	if (IS_ERR(target->cm_id)) {
-		ret = PTR_ERR(target->cm_id);
+	ret = srp_new_cm_id(target);
+	if (ret)
 		goto err_free;
-	}
 
 	target->qp_in_error = 0;
 	ret = srp_connect_target(target);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 4a3c1f3..cb6eb81 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -54,6 +54,7 @@ enum {
 
 	SRP_PORT_REDIRECT	= 1,
 	SRP_DLID_REDIRECT	= 2,
+	SRP_STALE_CONN		= 3,
 
 	SRP_MAX_LUN		= 512,
 	SRP_DEF_SG_TABLESIZE	= 12,
-- 
cgit v1.1


From 1d96354e617990799b1cb5d7ff8f7c467b8767c8 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Tue, 29 Jan 2008 12:56:18 +0200
Subject: IB/fmr_pool: Allocate page list for pool FMRs only when caching
 enabled

Allocate memory for the page_list field of struct ib_pool_fmr only
when caching is enabled for the FMR pool, since the field is not used
otherwise.  This can save significant amounts of memory for large
pools with caching turned off.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/core/fmr_pool.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 6c7aa59..7f00347 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -320,10 +320,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 			.max_maps   = pool->max_remaps,
 			.page_shift = params->page_shift
 		};
+		int bytes_per_fmr = sizeof *fmr;
+
+		if (pool->cache_bucket)
+			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
 
 		for (i = 0; i < params->pool_size; ++i) {
-			fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64),
-				      GFP_KERNEL);
+			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
 			if (!fmr) {
 				printk(KERN_WARNING PFX "failed to allocate fmr "
 				       "struct for FMR %d\n", i);
-- 
cgit v1.1


From 1203c42e7be1aa0be641b701f42b6d38c2d94b39 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.co.il>
Date: Mon, 4 Feb 2008 20:20:44 -0800
Subject: IB/mthca: Remove checks for srq->first_free < 0

The SRQ receive posting functions make sure that srq->first_free never
becomes negative, so we can remove tests of whether it is negative.

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_srq.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 553d681..ec63adc 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -475,11 +475,7 @@ void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
 
 	spin_lock(&srq->lock);
 
-	if (likely(srq->first_free >= 0))
-		*wqe_to_link(get_wqe(srq, srq->last_free)) = ind;
-	else
-		srq->first_free = ind;
-
+	*wqe_to_link(get_wqe(srq, srq->last_free)) = ind;
 	*wqe_to_link(get_wqe(srq, ind)) = -1;
 	srq->last_free = ind;
 
@@ -506,15 +502,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 	first_ind = srq->first_free;
 
 	for (nreq = 0; wr; wr = wr->next) {
-		ind = srq->first_free;
-
-		if (unlikely(ind < 0)) {
-			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
-			err = -ENOMEM;
-			*bad_wr = wr;
-			break;
-		}
-
+		ind       = srq->first_free;
 		wqe       = get_wqe(srq, ind);
 		next_ind  = *wqe_to_link(wqe);
 
@@ -614,15 +602,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 	spin_lock_irqsave(&srq->lock, flags);
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
-		ind = srq->first_free;
-
-		if (unlikely(ind < 0)) {
-			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
-			err = -ENOMEM;
-			*bad_wr = wr;
-			break;
-		}
-
+		ind       = srq->first_free;
 		wqe       = get_wqe(srq, ind);
 		next_ind  = *wqe_to_link(wqe);
 
-- 
cgit v1.1


From 1d368c546566e249da8181e933c53788093965cf Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli at mellanox.co.il>
Date: Thu, 24 Jan 2008 06:38:06 -0800
Subject: IB/ib_mthca: Pre-link receive WQEs in Tavor mode

We have recently discovered that Tavor mode requires each WQE in a
posted list of receive WQEs to have a valid NDA field at all times.
This requirement holds true for regular QPs as well as for SRQs.  This
patch prelinks the receive queue in a regular QP and keeps the free
list in SRQ always properly linked.

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Reviewed-by: Jack Morgenstein <jackm@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_qp.c  | 13 ++++++++-----
 drivers/infiniband/hw/mthca/mthca_srq.c | 23 ++++++++++++++---------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 0e5461c..db5595b 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1175,6 +1175,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
 {
 	int ret;
 	int i;
+	struct mthca_next_seg *next;
 
 	qp->refcount = 1;
 	init_waitqueue_head(&qp->wait);
@@ -1217,7 +1218,6 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
 	}
 
 	if (mthca_is_memfree(dev)) {
-		struct mthca_next_seg *next;
 		struct mthca_data_seg *scatter;
 		int size = (sizeof (struct mthca_next_seg) +
 			    qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16;
@@ -1240,6 +1240,13 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
 						    qp->sq.wqe_shift) +
 						   qp->send_wqe_offset);
 		}
+	} else {
+		for (i = 0; i < qp->rq.max; ++i) {
+			next = get_recv_wqe(qp, i);
+			next->nda_op = htonl((((i + 1) % qp->rq.max) <<
+					      qp->rq.wqe_shift) | 1);
+		}
+
 	}
 
 	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
@@ -1863,7 +1870,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		prev_wqe = qp->rq.last;
 		qp->rq.last = wqe;
 
-		((struct mthca_next_seg *) wqe)->nda_op = 0;
 		((struct mthca_next_seg *) wqe)->ee_nds =
 			cpu_to_be32(MTHCA_NEXT_DBD);
 		((struct mthca_next_seg *) wqe)->flags = 0;
@@ -1885,9 +1891,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 
 		qp->wrid[ind] = wr->wr_id;
 
-		((struct mthca_next_seg *) prev_wqe)->nda_op =
-			cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
-		wmb();
 		((struct mthca_next_seg *) prev_wqe)->ee_nds =
 			cpu_to_be32(MTHCA_NEXT_DBD | size);
 
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index ec63adc..a5ffff6 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -175,9 +175,17 @@ static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
 	 * scatter list L_Keys to the sentry value of 0x100.
 	 */
 	for (i = 0; i < srq->max; ++i) {
-		wqe = get_wqe(srq, i);
+		struct mthca_next_seg *next;
 
-		*wqe_to_link(wqe) = i < srq->max - 1 ? i + 1 : -1;
+		next = wqe = get_wqe(srq, i);
+
+		if (i < srq->max - 1) {
+			*wqe_to_link(wqe) = i + 1;
+			next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1);
+		} else {
+			*wqe_to_link(wqe) = -1;
+			next->nda_op = 0;
+		}
 
 		for (scatter = wqe + sizeof (struct mthca_next_seg);
 		     (void *) scatter < wqe + (1 << srq->wqe_shift);
@@ -470,12 +478,15 @@ out:
 void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
 {
 	int ind;
+	struct mthca_next_seg *last_free;
 
 	ind = wqe_addr >> srq->wqe_shift;
 
 	spin_lock(&srq->lock);
 
-	*wqe_to_link(get_wqe(srq, srq->last_free)) = ind;
+	last_free = get_wqe(srq, srq->last_free);
+	*wqe_to_link(last_free) = ind;
+	last_free->nda_op = htonl((ind << srq->wqe_shift) | 1);
 	*wqe_to_link(get_wqe(srq, ind)) = -1;
 	srq->last_free = ind;
 
@@ -516,7 +527,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 		prev_wqe  = srq->last;
 		srq->last = wqe;
 
-		((struct mthca_next_seg *) wqe)->nda_op = 0;
 		((struct mthca_next_seg *) wqe)->ee_nds = 0;
 		/* flags field will always remain 0 */
 
@@ -537,9 +547,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 		if (i < srq->max_gs)
 			mthca_set_data_seg_inval(wqe);
 
-		((struct mthca_next_seg *) prev_wqe)->nda_op =
-			cpu_to_be32((ind << srq->wqe_shift) | 1);
-		wmb();
 		((struct mthca_next_seg *) prev_wqe)->ee_nds =
 			cpu_to_be32(MTHCA_NEXT_DBD);
 
@@ -613,8 +620,6 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 			break;
 		}
 
-		((struct mthca_next_seg *) wqe)->nda_op =
-			cpu_to_be32((next_ind << srq->wqe_shift) | 1);
 		((struct mthca_next_seg *) wqe)->ee_nds = 0;
 		/* flags field will always remain 0 */
 
-- 
cgit v1.1


From 68f3948dab39249d178eb007c071f87fb6481fc6 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 4 Feb 2008 20:20:44 -0800
Subject: IB/mlx4: Actually print out the driver version

The string mlx4_ib_version was defined, but never used.  Print out the
version once when the first device is initialized.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d2f50b6..96a39b5 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -52,7 +52,7 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
-static const char mlx4_ib_version[] __devinitdata =
+static const char mlx4_ib_version[] =
 	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
@@ -517,9 +517,16 @@ static struct class_device_attribute *mlx4_class_attributes[] = {
 
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
+	static int mlx4_ib_version_printed;
 	struct mlx4_ib_dev *ibdev;
 	int i;
 
+
+	if (!mlx4_ib_version_printed) {
+		printk(KERN_INFO "%s", mlx4_ib_version);
+		++mlx4_ib_version_printed;
+	}
+
 	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
 	if (!ibdev) {
 		dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
-- 
cgit v1.1


From f33afc26dc03e6e0513e2e300f2aa0ad5463c2d2 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Mon, 4 Feb 2008 20:20:44 -0800
Subject: IB: Avoid marking __devinitdata as const

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_main.c | 2 +-
 drivers/net/mlx4/main.c                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index e3bd71a3..cd3d8ad 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -126,7 +126,7 @@ module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444);
 MODULE_PARM_DESC(fmr_reserved_mtts,
 		 "number of memory translation table segments reserved for FMR");
 
-static const char mthca_version[] __devinitdata =
+static char mthca_version[] __devinitdata =
 	DRV_NAME ": Mellanox InfiniBand HCA driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 859617d..08bfc13 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -71,7 +71,7 @@ MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
 
 #endif /* CONFIG_PCI_MSI */
 
-static const char mlx4_version[] __devinitdata =
+static char mlx4_version[] __devinitdata =
 	DRV_NAME ": Mellanox ConnectX core driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
-- 
cgit v1.1


From 2c78853472a36c7cf51a84a34edc370e21c93ce4 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <olaf.kirch@oracle.com>
Date: Mon, 4 Feb 2008 20:20:44 -0800
Subject: IB/mthca: Return proper error codes from mthca_fmr_alloc()

If the allocation of the MTT or the mailbox failed, mthca_fmr_alloc()
would return 0 (success) no matter what. This leads to crashes a
little down the road, when we try to dereference eg mr->mtt, which was
really ERR_PTR(-Ewhatever).

Signed-off-by: Olaf Kirch <olaf.kirch@oracle.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mthca/mthca_mr.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index aa6c70a..3b69855 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -613,8 +613,10 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 			sizeof *(mr->mem.tavor.mpt) * idx;
 
 	mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
-	if (IS_ERR(mr->mtt))
+	if (IS_ERR(mr->mtt)) {
+		err = PTR_ERR(mr->mtt);
 		goto err_out_table;
+	}
 
 	mtt_seg = mr->mtt->first_seg * MTHCA_MTT_SEG_SIZE;
 
@@ -627,8 +629,10 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 		mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
 
 	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
-	if (IS_ERR(mailbox))
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
 		goto err_out_free_mtt;
+	}
 
 	mpt_entry = mailbox->buf;
 
-- 
cgit v1.1


From 3c2d774cad5bf4fad576363da77870e9e6530b7a Mon Sep 17 00:00:00 2001
From: Glenn Streiff <gstreiff@neteffect.com>
Date: Mon, 4 Feb 2008 20:20:45 -0800
Subject: RDMA/nes: Add a driver for NetEffect RNICs

Add a standard NIC and RDMA/iWARP driver for NetEffect 1/10Gb ethernet adapters.

Signed-off-by: Glenn Streiff <gstreiff@neteffect.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 MAINTAINERS                             |   10 +
 drivers/infiniband/Kconfig              |    2 +-
 drivers/infiniband/Makefile             |    1 +
 drivers/infiniband/hw/nes/Kconfig       |   16 +
 drivers/infiniband/hw/nes/Makefile      |    3 +
 drivers/infiniband/hw/nes/nes.c         | 1152 +++++++++
 drivers/infiniband/hw/nes/nes.h         |  560 +++++
 drivers/infiniband/hw/nes/nes_cm.c      | 3088 ++++++++++++++++++++++++
 drivers/infiniband/hw/nes/nes_cm.h      |  433 ++++
 drivers/infiniband/hw/nes/nes_context.h |  193 ++
 drivers/infiniband/hw/nes/nes_hw.c      | 3080 ++++++++++++++++++++++++
 drivers/infiniband/hw/nes/nes_hw.h      | 1206 ++++++++++
 drivers/infiniband/hw/nes/nes_nic.c     | 1703 ++++++++++++++
 drivers/infiniband/hw/nes/nes_user.h    |  112 +
 drivers/infiniband/hw/nes/nes_utils.c   |  917 ++++++++
 drivers/infiniband/hw/nes/nes_verbs.c   | 3917 +++++++++++++++++++++++++++++++
 drivers/infiniband/hw/nes/nes_verbs.h   |  169 ++
 17 files changed, 16561 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/hw/nes/Kconfig
 create mode 100644 drivers/infiniband/hw/nes/Makefile
 create mode 100644 drivers/infiniband/hw/nes/nes.c
 create mode 100644 drivers/infiniband/hw/nes/nes.h
 create mode 100644 drivers/infiniband/hw/nes/nes_cm.c
 create mode 100644 drivers/infiniband/hw/nes/nes_cm.h
 create mode 100644 drivers/infiniband/hw/nes/nes_context.h
 create mode 100644 drivers/infiniband/hw/nes/nes_hw.c
 create mode 100644 drivers/infiniband/hw/nes/nes_hw.h
 create mode 100644 drivers/infiniband/hw/nes/nes_nic.c
 create mode 100644 drivers/infiniband/hw/nes/nes_user.h
 create mode 100644 drivers/infiniband/hw/nes/nes_utils.c
 create mode 100644 drivers/infiniband/hw/nes/nes_verbs.c
 create mode 100644 drivers/infiniband/hw/nes/nes_verbs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index da30a72..548df4b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2681,6 +2681,16 @@ M:	James.Bottomley@HansenPartnership.com
 L:	linux-scsi@vger.kernel.org
 S:	Maintained
 
+NETEFFECT IWARP RNIC DRIVER (IW_NES)
+P:	Faisal Latif
+M:	flatif@neteffect.com
+P:	Glenn Streiff
+M:	gstreiff@neteffect.com
+L:	general@lists.openfabrics.org
+W:	http://www.neteffect.com
+S:	Supported
+F:	drivers/infiniband/hw/nes/
+
 NETEM NETWORK EMULATOR
 P:	Stephen Hemminger
 M:	shemminger@linux-foundation.org
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index a193dfb..a5dc78a 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -44,8 +44,8 @@ source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
-
 source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/nes/Kconfig"
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index 75f325e4..ed35e44 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_INFINIBAND_EHCA)		+= hw/ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)	+= hw/amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
+obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)		+= ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)		+= ulp/iser/
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
new file mode 100644
index 0000000..2aeb7ac
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_NES
+	tristate "NetEffect RNIC Driver"
+	depends on PCI && INET && INFINIBAND
+	select LIBCRC32C
+	---help---
+	  This is a low-level driver for NetEffect RDMA enabled
+	  Network Interface Cards (RNIC).
+
+config INFINIBAND_NES_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_NES
+	default n
+	---help---
+	  This option causes the NetEffect RNIC driver to produce debug
+	  messages.  Select this if you are developing the driver
+	  or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile
new file mode 100644
index 0000000..3514851
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o
+
+iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
new file mode 100644
index 0000000..7f8853b
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -0,0 +1,1152 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/iw_cm.h>
+
+#include "nes.h"
+
+#include <net/netevent.h>
+#include <net/neighbour.h>
+#include <linux/route.h>
+#include <net/ip_fib.h>
+
+MODULE_AUTHOR("NetEffect");
+MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int max_mtu = 9000;
+int nics_per_function = 1;
+int interrupt_mod_interval = 0;
+
+
+/* Interoperability */
+int mpa_version = 1;
+module_param(mpa_version, int, 0);
+MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)");
+
+/* Interoperability */
+int disable_mpa_crc = 0;
+module_param(disable_mpa_crc, int, 0);
+MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC");
+
+unsigned int send_first = 0;
+module_param(send_first, int, 0);
+MODULE_PARM_DESC(send_first, "Send RDMA Message First on Active Connection");
+
+
+unsigned int nes_drv_opt = 0;
+module_param(nes_drv_opt, int, 0);
+MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters");
+
+unsigned int nes_debug_level = 0;
+module_param_named(debug_level, nes_debug_level, uint, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug output level");
+
+LIST_HEAD(nes_adapter_list);
+LIST_HEAD(nes_dev_list);
+
+atomic_t qps_destroyed;
+atomic_t cqp_reqs_allocated;
+atomic_t cqp_reqs_freed;
+atomic_t cqp_reqs_dynallocated;
+atomic_t cqp_reqs_dynfreed;
+atomic_t cqp_reqs_queued;
+atomic_t cqp_reqs_redriven;
+
+static void nes_print_macaddr(struct net_device *netdev);
+static irqreturn_t nes_interrupt(int, void *);
+static int __devinit nes_probe(struct pci_dev *, const struct pci_device_id *);
+static void __devexit nes_remove(struct pci_dev *);
+static int __init nes_init_module(void);
+static void __exit nes_exit_module(void);
+static unsigned int ee_flsh_adapter;
+static unsigned int sysfs_nonidx_addr;
+static unsigned int sysfs_idx_addr;
+
+static struct pci_device_id nes_pci_table[] = {
+	{PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020, PCI_ANY_ID, PCI_ANY_ID},
+	{0}
+};
+
+MODULE_DEVICE_TABLE(pci, nes_pci_table);
+
+static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);
+static int nes_net_event(struct notifier_block *, unsigned long, void *);
+static int nes_notifiers_registered;
+
+
+static struct notifier_block nes_inetaddr_notifier = {
+	.notifier_call = nes_inetaddr_event
+};
+
+static struct notifier_block nes_net_notifier = {
+	.notifier_call = nes_net_event
+};
+
+
+
+
+/**
+ * nes_inetaddr_event
+ */
+static int nes_inetaddr_event(struct notifier_block *notifier,
+		unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *event_netdev = ifa->ifa_dev->dev;
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	struct nes_vnic *nesvnic;
+	unsigned int addr;
+	unsigned int mask;
+
+	addr = ntohl(ifa->ifa_address);
+	mask = ntohl(ifa->ifa_mask);
+	nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %08X, netmask %08X.\n",
+			addr, mask);
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n",
+				nesdev, nesdev->netdev[0]->name);
+		netdev = nesdev->netdev[0];
+		nesvnic = netdev_priv(netdev);
+		if (netdev == event_netdev) {
+			if (nesvnic->rdma_enabled == 0) {
+				nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since"
+						" RDMA is not enabled.\n",
+						netdev->name);
+				return NOTIFY_OK;
+			}
+			/* we have ifa->ifa_address/mask here if we need it */
+			switch (event) {
+				case NETDEV_DOWN:
+					nes_debug(NES_DBG_NETDEV, "event:DOWN\n");
+					nes_write_indexed(nesdev,
+							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0);
+
+					nes_manage_arp_cache(netdev, netdev->dev_addr,
+							ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE);
+					nesvnic->local_ipaddr = 0;
+					return NOTIFY_OK;
+					break;
+				case NETDEV_UP:
+					nes_debug(NES_DBG_NETDEV, "event:UP\n");
+
+					if (nesvnic->local_ipaddr != 0) {
+						nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n");
+						return NOTIFY_OK;
+					}
+					/* Add the address to the IP table */
+					nesvnic->local_ipaddr = ifa->ifa_address;
+
+					nes_write_indexed(nesdev,
+							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)),
+							ntohl(ifa->ifa_address));
+					nes_manage_arp_cache(netdev, netdev->dev_addr,
+							ntohl(nesvnic->local_ipaddr), NES_ARP_ADD);
+					return NOTIFY_OK;
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_net_event
+ */
+static int nes_net_event(struct notifier_block *notifier,
+		unsigned long event, void *ptr)
+{
+	struct neighbour *neigh = ptr;
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	struct nes_vnic *nesvnic;
+
+	switch (event) {
+		case NETEVENT_NEIGH_UPDATE:
+			list_for_each_entry(nesdev, &nes_dev_list, list) {
+				/* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */
+				netdev = nesdev->netdev[0];
+				nesvnic = netdev_priv(netdev);
+				if (netdev == neigh->dev) {
+					if (nesvnic->rdma_enabled == 0) {
+						nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n",
+								netdev->name);
+					} else {
+						if (neigh->nud_state & NUD_VALID) {
+							nes_manage_arp_cache(neigh->dev, neigh->ha,
+									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD);
+						} else {
+							nes_manage_arp_cache(neigh->dev, neigh->ha,
+									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE);
+						}
+					}
+					return NOTIFY_OK;
+				}
+			}
+			break;
+		default:
+			nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event);
+			break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_add_ref
+ */
+void nes_add_ref(struct ib_qp *ibqp)
+{
+	struct nes_qp *nesqp;
+
+	nesqp = to_nesqp(ibqp);
+	nes_debug(NES_DBG_QP, "Bumping refcount for QP%u.  Pre-inc value = %u\n",
+			ibqp->qp_num, atomic_read(&nesqp->refcount));
+	atomic_inc(&nesqp->refcount);
+}
+
+static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+	unsigned long flags;
+	struct nes_qp *nesqp = cqp_request->cqp_callback_pointer;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 qp_id;
+
+	atomic_inc(&qps_destroyed);
+
+	/* Free the control structures */
+
+	qp_id = nesqp->hwqp.qp_id;
+	if (nesqp->pbl_vbase) {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+		nesqp->pbl_vbase = NULL;
+
+	} else {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+	}
+	nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id);
+
+	kfree(nesqp->allocated_buffer);
+
+}
+
+/**
+ * nes_rem_ref
+ */
+void nes_rem_ref(struct ib_qp *ibqp)
+{
+	u64 u64temp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	u32 opcode;
+
+	nesqp = to_nesqp(ibqp);
+
+	if (atomic_read(&nesqp->refcount) == 0) {
+		printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n",
+				__FUNCTION__, ibqp->qp_num, nesqp->last_aeq);
+		BUG();
+	}
+
+	if (atomic_dec_and_test(&nesqp->refcount)) {
+		nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL;
+
+		/* Destroy the QP */
+		cqp_request = nes_get_cqp_request(nesdev);
+		if (cqp_request == NULL) {
+			nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+			return;
+		}
+		cqp_request->waiting = 0;
+		cqp_request->callback = 1;
+		cqp_request->cqp_callback = nes_cqp_rem_ref_callback;
+		cqp_request->cqp_callback_pointer = nesqp;
+		cqp_wqe = &cqp_request->cqp_wqe;
+
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+		opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP;
+
+		if (nesqp->hte_added) {
+			opcode  |= NES_CQP_QP_DEL_HTE;
+			nesqp->hte_added = 0;
+		}
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+		u64temp = (u64)nesqp->nesqp_context_pbase;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+		nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+	}
+}
+
+
+/**
+ * nes_get_qp
+ */
+struct ib_qp *nes_get_qp(struct ib_device *device, int qpn)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp)))
+		return NULL;
+
+	return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp;
+}
+
+
+/**
+ * nes_print_macaddr
+ */
+static void nes_print_macaddr(struct net_device *netdev)
+{
+	nes_debug(NES_DBG_INIT, "%s: MAC %02X:%02X:%02X:%02X:%02X:%02X, IRQ %u\n",
+			netdev->name,
+			netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2],
+			netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5],
+			netdev->irq);
+}
+
+
+/**
+ * nes_interrupt - handle interrupts
+ */
+static irqreturn_t nes_interrupt(int irq, void *dev_id)
+{
+	struct nes_device *nesdev = (struct nes_device *)dev_id;
+	int handled = 0;
+	u32 int_mask;
+	u32 int_req;
+	u32 int_stat;
+	u32 intf_int_stat;
+	u32 timer_stat;
+
+	if (nesdev->msi_enabled) {
+		/* No need to read the interrupt pending register if msi is enabled */
+		handled = 1;
+	} else {
+		if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) {
+			/* Master interrupt enable provides synchronization for kicking off bottom half
+			  when interrupt sharing is going on */
+			int_mask = nes_read32(nesdev->regs + NES_INT_MASK);
+			if (int_mask & 0x80000000) {
+				/* Check interrupt status to see if this might be ours */
+				int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+				int_req = nesdev->int_req;
+				if (int_stat&int_req) {
+					/* if interesting CEQ or AEQ is pending, claim the interrupt */
+					if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) {
+						handled = 1;
+					} else {
+						if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) {
+							/* Timer might be running but might be for another function */
+							timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+							if ((timer_stat & nesdev->timer_int_req) != 0) {
+								handled = 1;
+							}
+						}
+						if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) &&
+								(handled == 0)) {
+							intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+							if ((intf_int_stat & nesdev->intf_int_req) != 0) {
+								handled = 1;
+							}
+						}
+					}
+					if (handled) {
+						nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000));
+						int_mask = nes_read32(nesdev->regs+NES_INT_MASK);
+						/* Save off the status to save an additional read */
+						nesdev->int_stat = int_stat;
+						nesdev->napi_isr_ran = 1;
+					}
+				}
+			}
+		} else {
+			handled = nes_read32(nesdev->regs+NES_INT_PENDING);
+		}
+	}
+
+	if (handled) {
+
+		if (nes_napi_isr(nesdev) == 0) {
+			tasklet_schedule(&nesdev->dpc_tasklet);
+
+		}
+		return IRQ_HANDLED;
+	} else {
+		return IRQ_NONE;
+	}
+}
+
+
+/**
+ * nes_probe - Device initialization
+ */
+static int __devinit nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+	struct net_device *netdev = NULL;
+	struct nes_device *nesdev = NULL;
+	int ret = 0;
+	struct nes_vnic *nesvnic = NULL;
+	void __iomem *mmio_regs = NULL;
+	u8 hw_rev;
+
+	assert(pcidev != NULL);
+	assert(ent != NULL);
+
+	printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n",
+			DRV_VERSION, pci_name(pcidev));
+
+	ret = pci_enable_device(pcidev);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev));
+		goto bail0;
+	}
+
+	nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n",
+			(long unsigned int)pci_resource_start(pcidev, BAR_0),
+			(long unsigned int)pci_resource_len(pcidev, BAR_0));
+	nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n",
+			(long unsigned int)pci_resource_start(pcidev, BAR_1),
+			(long unsigned int)pci_resource_len(pcidev, BAR_1));
+
+	/* Make sure PCI base addr are MMIO */
+	if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) ||
+			!(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) {
+		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Reserve PCI I/O and memory resources */
+	ret = pci_request_regions(pcidev, DRV_NAME);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev));
+		goto bail1;
+	}
+
+	if ((sizeof(dma_addr_t) > 4)) {
+		ret = pci_set_dma_mask(pcidev, DMA_64BIT_MASK);
+		if (ret < 0) {
+			printk(KERN_ERR PFX "64b DMA mask configuration failed\n");
+			goto bail2;
+		}
+		ret = pci_set_consistent_dma_mask(pcidev, DMA_64BIT_MASK);
+		if (ret) {
+			printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n");
+			goto bail2;
+		}
+	} else {
+		ret = pci_set_dma_mask(pcidev, DMA_32BIT_MASK);
+		if (ret < 0) {
+			printk(KERN_ERR PFX "32b DMA mask configuration failed\n");
+			goto bail2;
+		}
+		ret = pci_set_consistent_dma_mask(pcidev, DMA_32BIT_MASK);
+		if (ret) {
+			printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n");
+			goto bail2;
+		}
+	}
+
+	pci_set_master(pcidev);
+
+	/* Allocate hardware structure */
+	nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL);
+	if (!nesdev) {
+		printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev));
+		ret = -ENOMEM;
+		goto bail2;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev);
+	nesdev->pcidev = pcidev;
+	pci_set_drvdata(pcidev, nesdev);
+
+	pci_read_config_byte(pcidev, 0x0008, &hw_rev);
+	nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev);
+
+	spin_lock_init(&nesdev->indexed_regs_lock);
+
+	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
+	mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), sizeof(mmio_regs));
+	if (mmio_regs == NULL) {
+		printk(KERN_ERR PFX "Unable to remap BAR0\n");
+		ret = -EIO;
+		goto bail3;
+	}
+	nesdev->regs = mmio_regs;
+	nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs;
+
+	/* Ensure interrupts are disabled */
+	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+	if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) {
+		if (!pci_enable_msi(nesdev->pcidev)) {
+			nesdev->msi_enabled = 1;
+			nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n",
+					pci_name(pcidev));
+		} else {
+			nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n",
+					pci_name(pcidev));
+		}
+	} else {
+		nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n",
+				pci_name(pcidev));
+	}
+
+	nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0);
+	nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1);
+
+	/* Init the adapter */
+	nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev);
+	nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+	if (!nesdev->nesadapter) {
+		printk(KERN_ERR PFX "Unable to initialize adapter.\n");
+		ret = -ENOMEM;
+		goto bail5;
+	}
+
+	/* nesdev->base_doorbell_index =
+			nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */
+	nesdev->base_doorbell_index = 1;
+	nesdev->doorbell_start = nesdev->nesadapter->doorbell_start;
+	nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % nesdev->nesadapter->port_count;
+
+	tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev);
+
+	/* bring up the Control QP */
+	if (nes_init_cqp(nesdev)) {
+		ret = -ENODEV;
+		goto bail6;
+	}
+
+	/* Arm the CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			PCI_FUNC(nesdev->pcidev->devfn));
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+	/* Enable the interrupts */
+	nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) |
+			(1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+	if (PCI_FUNC(nesdev->pcidev->devfn) < 4) {
+		nesdev->int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+24));
+	}
+
+	/* TODO: This really should be the first driver to load, not function 0 */
+	if (PCI_FUNC(nesdev->pcidev->devfn) == 0) {
+		/* pick up PCI and critical errors if the first driver to load */
+		nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR;
+		nesdev->int_req |= NES_INT_INTF;
+	} else {
+		nesdev->intf_int_req = 0;
+	}
+	nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804);
+
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790);
+
+	/* deal with both periodic and one_shot */
+	nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn);
+	nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req;
+	nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n",
+			PCI_FUNC(nesdev->pcidev->devfn),
+			nesdev->timer_int_req, nesdev->nesadapter->timer_int_req);
+
+	nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+
+	list_add_tail(&nesdev->list, &nes_dev_list);
+
+	/* Request an interrupt line for the driver */
+	ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+				pci_name(pcidev), pcidev->irq);
+		goto bail65;
+	}
+
+	nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+
+	if (nes_notifiers_registered == 0) {
+		register_inetaddr_notifier(&nes_inetaddr_notifier);
+		register_netevent_notifier(&nes_net_notifier);
+	}
+	nes_notifiers_registered++;
+
+	/* Initialize network devices */
+		if ((netdev = nes_netdev_init(nesdev, mmio_regs)) == NULL) {
+			goto bail7;
+		}
+
+		/* Register network device */
+		ret = register_netdev(netdev);
+		if (ret) {
+			printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret);
+			nes_netdev_destroy(netdev);
+			goto bail7;
+		}
+
+		nes_print_macaddr(netdev);
+		/* create a CM core for this netdev */
+		nesvnic = netdev_priv(netdev);
+
+		nesdev->netdev_count++;
+		nesdev->nesadapter->netdev_count++;
+
+
+	printk(KERN_ERR PFX "%s: NetEffect RNIC driver successfully loaded.\n",
+			pci_name(pcidev));
+	return 0;
+
+	bail7:
+	printk(KERN_ERR PFX "bail7\n");
+	while (nesdev->netdev_count > 0) {
+		nesdev->netdev_count--;
+		nesdev->nesadapter->netdev_count--;
+
+		unregister_netdev(nesdev->netdev[nesdev->netdev_count]);
+		nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]);
+	}
+
+	nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",
+			nesdev->netdev_count, nesdev->nesadapter->netdev_count);
+
+	nes_notifiers_registered--;
+	if (nes_notifiers_registered == 0) {
+		unregister_netevent_notifier(&nes_net_notifier);
+		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+	}
+
+	list_del(&nesdev->list);
+	nes_destroy_cqp(nesdev);
+
+	bail65:
+	printk(KERN_ERR PFX "bail65\n");
+	free_irq(pcidev->irq, nesdev);
+	if (nesdev->msi_enabled) {
+		pci_disable_msi(pcidev);
+	}
+	bail6:
+	printk(KERN_ERR PFX "bail6\n");
+	tasklet_kill(&nesdev->dpc_tasklet);
+	/* Deallocate the Adapter Structure */
+	nes_destroy_adapter(nesdev->nesadapter);
+
+	bail5:
+	printk(KERN_ERR PFX "bail5\n");
+	iounmap(nesdev->regs);
+
+	bail3:
+	printk(KERN_ERR PFX "bail3\n");
+	kfree(nesdev);
+
+	bail2:
+	pci_release_regions(pcidev);
+
+	bail1:
+	pci_disable_device(pcidev);
+
+	bail0:
+	return ret;
+}
+
+
+/**
+ * nes_remove - unload from kernel
+ */
+static void __devexit nes_remove(struct pci_dev *pcidev)
+{
+	struct nes_device *nesdev = pci_get_drvdata(pcidev);
+	struct net_device *netdev;
+	int netdev_index = 0;
+
+		if (nesdev->netdev_count) {
+			netdev = nesdev->netdev[netdev_index];
+			if (netdev) {
+				netif_stop_queue(netdev);
+				unregister_netdev(netdev);
+				nes_netdev_destroy(netdev);
+
+				nesdev->netdev[netdev_index] = NULL;
+				nesdev->netdev_count--;
+				nesdev->nesadapter->netdev_count--;
+			}
+		}
+
+	nes_notifiers_registered--;
+	if (nes_notifiers_registered == 0) {
+		unregister_netevent_notifier(&nes_net_notifier);
+		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+	}
+
+	list_del(&nesdev->list);
+	nes_destroy_cqp(nesdev);
+	tasklet_kill(&nesdev->dpc_tasklet);
+
+	/* Deallocate the Adapter Structure */
+	nes_destroy_adapter(nesdev->nesadapter);
+
+	free_irq(pcidev->irq, nesdev);
+
+	if (nesdev->msi_enabled) {
+		pci_disable_msi(pcidev);
+	}
+
+	iounmap(nesdev->regs);
+	kfree(nesdev);
+
+	/* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */
+	pci_release_regions(pcidev);
+	pci_disable_device(pcidev);
+	pci_set_drvdata(pcidev, NULL);
+}
+
+
+static struct pci_driver nes_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = nes_pci_table,
+	.probe = nes_probe,
+	.remove = __devexit_p(nes_remove),
+};
+
+static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf)
+{
+	unsigned int  devfn = 0xffffffff;
+	unsigned char bus_number = 0xff;
+	unsigned int  i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			devfn      = nesdev->nesadapter->devfn;
+			bus_number = nesdev->nesadapter->bus_number;
+			break;
+		}
+		i++;
+	}
+
+	return snprintf(buf, PAGE_SIZE, "%x:%x", bus_number, devfn);
+}
+
+static ssize_t nes_store_adapter(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	ee_flsh_adapter = simple_strtoul(p, &p, 10);
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf)
+{
+	u32 eeprom_cmd = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND);
+			break;
+		}
+		i++;
+	}
+	return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd);
+}
+
+static ssize_t nes_store_ee_cmd(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf)
+{
+	u32 eeprom_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data);
+}
+
+static ssize_t nes_store_ee_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_EEPROM_DATA, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf)
+{
+	u32 flash_cmd = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd);
+}
+
+static ssize_t nes_store_flash_cmd(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_FLASH_COMMAND, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf)
+{
+	u32 flash_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data);
+}
+
+static ssize_t nes_store_flash_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_FLASH_DATA, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf)
+{
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr);
+}
+
+static ssize_t nes_store_nonidx_addr(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+		sysfs_nonidx_addr = simple_strtoul(p, &p, 16);
+
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf)
+{
+	u32 nonidx_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data);
+}
+
+static ssize_t nes_store_nonidx_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + sysfs_nonidx_addr, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf)
+{
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr);
+}
+
+static ssize_t nes_store_idx_addr(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+		sysfs_idx_addr = simple_strtoul(p, &p, 16);
+
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf)
+{
+	u32 idx_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			idx_data = nes_read_indexed(nesdev, sysfs_idx_addr);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data);
+}
+
+static ssize_t nes_store_idx_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write_indexed(nesdev, sysfs_idx_addr, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR,
+		   nes_show_adapter, nes_store_adapter);
+static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR,
+		   nes_show_ee_cmd, nes_store_ee_cmd);
+static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR,
+		   nes_show_ee_data, nes_store_ee_data);
+static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR,
+		   nes_show_flash_cmd, nes_store_flash_cmd);
+static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR,
+		   nes_show_flash_data, nes_store_flash_data);
+static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR,
+		   nes_show_nonidx_addr, nes_store_nonidx_addr);
+static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR,
+		   nes_show_nonidx_data, nes_store_nonidx_data);
+static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR,
+		   nes_show_idx_addr, nes_store_idx_addr);
+static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR,
+		   nes_show_idx_data, nes_store_idx_data);
+
+static int nes_create_driver_sysfs(struct pci_driver *drv)
+{
+	int error;
+	error  = driver_create_file(&drv->driver, &driver_attr_adapter);
+	error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd);
+	error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd);
+	error |= driver_create_file(&drv->driver, &driver_attr_flash_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr);
+	error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_idx_addr);
+	error |= driver_create_file(&drv->driver, &driver_attr_idx_data);
+	return error;
+}
+
+static void nes_remove_driver_sysfs(struct pci_driver *drv)
+{
+	driver_remove_file(&drv->driver, &driver_attr_adapter);
+	driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd);
+	driver_remove_file(&drv->driver, &driver_attr_eeprom_data);
+	driver_remove_file(&drv->driver, &driver_attr_flash_cmd);
+	driver_remove_file(&drv->driver, &driver_attr_flash_data);
+	driver_remove_file(&drv->driver, &driver_attr_nonidx_addr);
+	driver_remove_file(&drv->driver, &driver_attr_nonidx_data);
+	driver_remove_file(&drv->driver, &driver_attr_idx_addr);
+	driver_remove_file(&drv->driver, &driver_attr_idx_data);
+}
+
+/**
+ * nes_init_module - module initialization entry point
+ */
+static int __init nes_init_module(void)
+{
+	int retval;
+	int retval1;
+
+	retval = nes_cm_start();
+	if (retval) {
+		printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n");
+		return retval;
+	}
+	retval = pci_register_driver(&nes_pci_driver);
+	if (retval >= 0) {
+		retval1 = nes_create_driver_sysfs(&nes_pci_driver);
+		if (retval1 < 0)
+			printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n");
+	}
+	return retval;
+}
+
+
+/**
+ * nes_exit_module - module unload entry point
+ */
+static void __exit nes_exit_module(void)
+{
+	nes_cm_stop();
+	nes_remove_driver_sysfs(&nes_pci_driver);
+
+	pci_unregister_driver(&nes_pci_driver);
+}
+
+
+module_init(nes_init_module);
+module_exit(nes_exit_module);
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
new file mode 100644
index 0000000..fd57e8a
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NES_H
+#define __NES_H
+
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <asm/semaphore.h>
+#include <linux/version.h>
+#include <asm/io.h>
+#include <linux/crc32c.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/iw_cm.h>
+
+#define NES_SEND_FIRST_WRITE
+
+#define QUEUE_DISCONNECTS
+
+#define DRV_BUILD   "1"
+
+#define DRV_NAME    "iw_nes"
+#define DRV_VERSION "1.0 KO Build " DRV_BUILD
+#define PFX         DRV_NAME ": "
+
+/*
+ * NetEffect PCI vendor id and NE010 PCI device id.
+ */
+#ifndef PCI_VENDOR_ID_NETEFFECT	/* not in pci.ids yet */
+#define PCI_VENDOR_ID_NETEFFECT       0x1678
+#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100
+#endif
+
+#define NE020_REV   4
+#define NE020_REV1  5
+
+#define BAR_0       0
+#define BAR_1       2
+
+#define RX_BUF_SIZE             (1536 + 8)
+#define NES_REG0_SIZE           (4 * 1024)
+#define NES_TX_TIMEOUT          (6*HZ)
+#define NES_FIRST_QPN           64
+#define NES_SW_CONTEXT_ALIGN    1024
+
+#define NES_NIC_MAX_NICS        16
+#define NES_MAX_ARP_TABLE_SIZE  4096
+
+#define NES_NIC_CEQ_SIZE        8
+/* NICs will be on a separate CQ */
+#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32)
+
+#define NES_MAX_PORT_COUNT 4
+
+#define MAX_DPC_ITERATIONS               128
+
+#define NES_CQP_REQUEST_NO_DOORBELL_RING 0
+#define NES_CQP_REQUEST_RING_DOORBELL    1
+
+#define NES_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
+#define NES_DRV_OPT_DISABLE_MPA_CRC      0x00000002
+#define NES_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
+#define NES_DRV_OPT_DISABLE_INTF         0x00000008
+#define NES_DRV_OPT_ENABLE_MSI           0x00000010
+#define NES_DRV_OPT_DUAL_LOGICAL_PORT    0x00000020
+#define NES_DRV_OPT_SUPRESS_OPTION_BC    0x00000040
+#define NES_DRV_OPT_NO_INLINE_DATA       0x00000080
+#define NES_DRV_OPT_DISABLE_INT_MOD      0x00000100
+#define NES_DRV_OPT_DISABLE_VIRT_WQ      0x00000200
+
+#define NES_AEQ_EVENT_TIMEOUT         2500
+#define NES_DISCONNECT_EVENT_TIMEOUT  2000
+
+/* debug levels */
+/* must match userspace */
+#define NES_DBG_HW          0x00000001
+#define NES_DBG_INIT        0x00000002
+#define NES_DBG_ISR         0x00000004
+#define NES_DBG_PHY         0x00000008
+#define NES_DBG_NETDEV      0x00000010
+#define NES_DBG_CM          0x00000020
+#define NES_DBG_CM1         0x00000040
+#define NES_DBG_NIC_RX      0x00000080
+#define NES_DBG_NIC_TX      0x00000100
+#define NES_DBG_CQP         0x00000200
+#define NES_DBG_MMAP        0x00000400
+#define NES_DBG_MR          0x00000800
+#define NES_DBG_PD          0x00001000
+#define NES_DBG_CQ          0x00002000
+#define NES_DBG_QP          0x00004000
+#define NES_DBG_MOD_QP      0x00008000
+#define NES_DBG_AEQ         0x00010000
+#define NES_DBG_IW_RX       0x00020000
+#define NES_DBG_IW_TX       0x00040000
+#define NES_DBG_SHUTDOWN    0x00080000
+#define NES_DBG_RSVD1       0x10000000
+#define NES_DBG_RSVD2       0x20000000
+#define NES_DBG_RSVD3       0x40000000
+#define NES_DBG_RSVD4       0x80000000
+#define NES_DBG_ALL         0xffffffff
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define nes_debug(level, fmt, args...) \
+	if (level & nes_debug_level) \
+		printk(KERN_ERR PFX "%s[%u]: " fmt, __FUNCTION__, __LINE__, ##args)
+
+#define assert(expr)                                                \
+if (!(expr)) {                                                       \
+	printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n",  \
+		   #expr, __FILE__, __FUNCTION__, __LINE__);                \
+}
+
+#define NES_EVENT_TIMEOUT   1200000
+#else
+#define nes_debug(level, fmt, args...)
+#define assert(expr)          do {} while (0)
+
+#define NES_EVENT_TIMEOUT   100000
+#endif
+
+#include "nes_hw.h"
+#include "nes_verbs.h"
+#include "nes_context.h"
+#include "nes_user.h"
+#include "nes_cm.h"
+
+extern int max_mtu;
+extern int nics_per_function;
+#define max_frame_len (max_mtu+ETH_HLEN)
+extern int interrupt_mod_interval;
+extern int nes_if_count;
+extern int mpa_version;
+extern int disable_mpa_crc;
+extern unsigned int send_first;
+extern unsigned int nes_drv_opt;
+extern unsigned int nes_debug_level;
+
+extern struct list_head nes_adapter_list;
+extern struct list_head nes_dev_list;
+
+extern struct nes_cm_core *g_cm_core;
+
+extern atomic_t cm_connects;
+extern atomic_t cm_accepts;
+extern atomic_t cm_disconnects;
+extern atomic_t cm_closes;
+extern atomic_t cm_connecteds;
+extern atomic_t cm_connect_reqs;
+extern atomic_t cm_rejects;
+extern atomic_t mod_qp_timouts;
+extern atomic_t qps_created;
+extern atomic_t qps_destroyed;
+extern atomic_t sw_qps_destroyed;
+extern u32 mh_detected;
+extern u32 mh_pauses_sent;
+extern u32 cm_packets_sent;
+extern u32 cm_packets_bounced;
+extern u32 cm_packets_created;
+extern u32 cm_packets_received;
+extern u32 cm_packets_dropped;
+extern u32 cm_packets_retrans;
+extern u32 cm_listens_created;
+extern u32 cm_listens_destroyed;
+extern u32 cm_backlog_drops;
+extern atomic_t cm_loopbacks;
+extern atomic_t cm_nodes_created;
+extern atomic_t cm_nodes_destroyed;
+extern atomic_t cm_accel_dropped_pkts;
+extern atomic_t cm_resets_recvd;
+
+extern u32 crit_err_count;
+extern u32 int_mod_timer_init;
+extern u32 int_mod_cq_depth_256;
+extern u32 int_mod_cq_depth_128;
+extern u32 int_mod_cq_depth_32;
+extern u32 int_mod_cq_depth_24;
+extern u32 int_mod_cq_depth_16;
+extern u32 int_mod_cq_depth_4;
+extern u32 int_mod_cq_depth_1;
+
+extern atomic_t cqp_reqs_allocated;
+extern atomic_t cqp_reqs_freed;
+extern atomic_t cqp_reqs_dynallocated;
+extern atomic_t cqp_reqs_dynfreed;
+extern atomic_t cqp_reqs_queued;
+extern atomic_t cqp_reqs_redriven;
+
+
+struct nes_device {
+	struct nes_adapter	   *nesadapter;
+	void __iomem           *regs;
+	void __iomem           *index_reg;
+	struct pci_dev         *pcidev;
+	struct net_device      *netdev[NES_NIC_MAX_NICS];
+	u64                    link_status_interrupts;
+	struct tasklet_struct  dpc_tasklet;
+	spinlock_t             indexed_regs_lock;
+	unsigned long          csr_start;
+	unsigned long          doorbell_region;
+	unsigned long          doorbell_start;
+	unsigned long          mac_tx_errors;
+	unsigned long          mac_pause_frames_sent;
+	unsigned long          mac_pause_frames_received;
+	unsigned long          mac_rx_errors;
+	unsigned long          mac_rx_crc_errors;
+	unsigned long          mac_rx_symbol_err_frames;
+	unsigned long          mac_rx_jabber_frames;
+	unsigned long          mac_rx_oversized_frames;
+	unsigned long          mac_rx_short_frames;
+	unsigned long          port_rx_discards;
+	unsigned long          port_tx_discards;
+	unsigned int           mac_index;
+	unsigned int           nes_stack_start;
+
+	/* Control Structures */
+	void                   *cqp_vbase;
+	dma_addr_t             cqp_pbase;
+	u32                    cqp_mem_size;
+	u8                     ceq_index;
+	u8                     nic_ceq_index;
+	struct nes_hw_cqp      cqp;
+	struct nes_hw_cq       ccq;
+	struct list_head       cqp_avail_reqs;
+	struct list_head       cqp_pending_reqs;
+	struct nes_cqp_request *nes_cqp_requests;
+
+	u32                    int_req;
+	u32                    int_stat;
+	u32                    timer_int_req;
+	u32                    timer_only_int_count;
+	u32                    intf_int_req;
+	u32                    last_mac_tx_pauses;
+	u32                    last_used_chunks_tx;
+	struct list_head       list;
+
+	u16                    base_doorbell_index;
+	u16                    currcq_count;
+	u16                    deepcq_count;
+	u8                     msi_enabled;
+	u8                     netdev_count;
+	u8                     napi_isr_ran;
+	u8                     disable_rx_flow_control;
+	u8                     disable_tx_flow_control;
+};
+
+
+static inline void
+set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value)
+{
+	wqe_words[index]     = cpu_to_le32((u32) ((unsigned long)value));
+	wqe_words[index + 1] = cpu_to_le32((u32)(upper_32_bits((unsigned long)value)));
+}
+
+static inline void
+set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value)
+{
+	wqe_words[index] = cpu_to_le32(value);
+}
+
+static inline void
+nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev)
+{
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_CTX_LOW_IDX,
+			(u64)((unsigned long) &nesdev->cqp));
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]   = 0;
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]  = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX]       = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX]       = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX]        = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX]       = 0;
+}
+
+static inline void
+nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head)
+{
+	u32 value;
+	value = ((u32)((unsigned long) nesqp)) | head;
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX,
+			(u32)(upper_32_bits((unsigned long)(nesqp))));
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value);
+}
+
+/* Read from memory-mapped device */
+static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index)
+{
+	unsigned long flags;
+	void __iomem *addr = nesdev->index_reg;
+	u32 value;
+
+	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+	writel(reg_index, addr);
+	value = readl((void __iomem *)addr + 4);
+
+	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+	return value;
+}
+
+static inline u32 nes_read32(const void __iomem *addr)
+{
+	return readl(addr);
+}
+
+static inline u16 nes_read16(const void __iomem *addr)
+{
+	return readw(addr);
+}
+
+static inline u8 nes_read8(const void __iomem *addr)
+{
+	return readb(addr);
+}
+
+/* Write to memory-mapped device */
+static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val)
+{
+	unsigned long flags;
+	void __iomem *addr = nesdev->index_reg;
+
+	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+	writel(reg_index, addr);
+	writel(val, (void __iomem *)addr + 4);
+
+	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+}
+
+static inline void nes_write32(void __iomem *addr, u32 val)
+{
+	writel(val, addr);
+}
+
+static inline void nes_write16(void __iomem *addr, u16 val)
+{
+	writew(val, addr);
+}
+
+static inline void nes_write8(void __iomem *addr, u8 val)
+{
+	writeb(val, addr);
+}
+
+
+
+static inline int nes_alloc_resource(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 max_resources,
+		u32 *req_resource_num, u32 *next)
+{
+	unsigned long flags;
+	u32 resource_num;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+	resource_num = find_next_zero_bit(resource_array, max_resources, *next);
+	if (resource_num >= max_resources) {
+		resource_num = find_first_zero_bit(resource_array, max_resources);
+		if (resource_num >= max_resources) {
+			printk(KERN_ERR PFX "%s: No available resourcess.\n", __FUNCTION__);
+			spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+			return -EMFILE;
+		}
+	}
+	set_bit(resource_num, resource_array);
+	*next = resource_num+1;
+	if (*next == max_resources) {
+		*next = 0;
+	}
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+	*req_resource_num = resource_num;
+
+	return 0;
+}
+
+static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 resource_num)
+{
+	unsigned long flags;
+	int bit_is_set;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+	bit_is_set = test_bit(resource_num, resource_array);
+	nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n",
+			resource_num, (bit_is_set ? "": " not"));
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+
+	return bit_is_set;
+}
+
+static inline void nes_free_resource(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 resource_num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+	clear_bit(resource_num, resource_array);
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+}
+
+static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic;
+}
+
+static inline struct nes_pd *to_nespd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct nes_pd, ibpd);
+}
+
+static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct nes_ucontext, ibucontext);
+}
+
+static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct nes_mr, ibmr);
+}
+
+static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct nes_mr, ibfmr);
+}
+
+static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct nes_mr, ibmw);
+}
+
+static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr)
+{
+	return container_of(nesmr, struct nes_fmr, nesmr);
+}
+
+static inline struct nes_cq *to_nescq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct nes_cq, ibcq);
+}
+
+static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct nes_qp, ibqp);
+}
+
+
+
+/* nes.c */
+void nes_add_ref(struct ib_qp *);
+void nes_rem_ref(struct ib_qp *);
+struct ib_qp *nes_get_qp(struct ib_device *, int);
+
+
+/* nes_hw.c */
+struct nes_adapter *nes_init_adapter(struct nes_device *, u8);
+void  nes_nic_init_timer_defaults(struct nes_device *, u8);
+unsigned int nes_reset_adapter_ne020(struct nes_device *, u8 *);
+int nes_init_serdes(struct nes_device *, u8, u8, u8);
+void nes_init_csr_ne020(struct nes_device *, u8, u8);
+void nes_destroy_adapter(struct nes_adapter *);
+int nes_init_cqp(struct nes_device *);
+int nes_init_phy(struct nes_device *);
+int nes_init_nic_qp(struct nes_device *, struct net_device *);
+void nes_destroy_nic_qp(struct nes_vnic *);
+int nes_napi_isr(struct nes_device *);
+void nes_dpc(unsigned long);
+void nes_process_ceq(struct nes_device *, struct nes_hw_ceq *);
+void nes_process_aeq(struct nes_device *, struct nes_hw_aeq *);
+void nes_process_mac_intr(struct nes_device *, u32);
+void nes_nic_napi_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
+void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
+void nes_cqp_ce_handler(struct nes_device *, struct nes_hw_cq *);
+void nes_process_iwarp_aeqe(struct nes_device *, struct nes_hw_aeqe *);
+void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *);
+int nes_destroy_cqp(struct nes_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+
+/* nes_nic.c */
+void nes_netdev_set_multicast_list(struct net_device *);
+void nes_netdev_exit(struct nes_vnic *);
+struct net_device *nes_netdev_init(struct nes_device *, void __iomem *);
+void nes_netdev_destroy(struct net_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+
+/* nes_cm.c */
+void *nes_cm_create(struct net_device *);
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+void nes_update_arp(unsigned char *, u32, u32, u16, u16);
+void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32);
+void nes_sock_release(struct nes_qp *, unsigned long *);
+struct nes_cm_core *nes_cm_alloc_core(void);
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32);
+int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32);
+int nes_cm_disconn(struct nes_qp *);
+void nes_cm_disconn_worker(void *);
+
+/* nes_verbs.c */
+int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32);
+int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
+struct nes_ib_device *nes_init_ofa_device(struct net_device *);
+void nes_destroy_ofa_device(struct nes_ib_device *);
+int nes_register_ofa_device(struct nes_ib_device *);
+void nes_unregister_ofa_device(struct nes_ib_device *);
+
+/* nes_util.c */
+int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *);
+void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16);
+void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
+void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16);
+void nes_read_10G_phy_reg(struct nes_device *, u16, u8);
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
+void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *, int);
+int nes_arp_table(struct nes_device *, u32, u8 *, u32);
+void nes_mh_fix(unsigned long);
+void nes_clc(unsigned long);
+void nes_dump_mem(unsigned int, void *, int);
+u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32);
+
+#endif	/* __NES_H */
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
new file mode 100644
index 0000000..bd5cfea
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -0,0 +1,3088 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#define TCPOPT_TIMESTAMP 8
+
+#include <asm/atomic.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/notifier.h>
+#include <linux/net.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/list.h>
+#include <linux/threads.h>
+
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+#include "nes.h"
+
+u32 cm_packets_sent;
+u32 cm_packets_bounced;
+u32 cm_packets_dropped;
+u32 cm_packets_retrans;
+u32 cm_packets_created;
+u32 cm_packets_received;
+u32 cm_listens_created;
+u32 cm_listens_destroyed;
+u32 cm_backlog_drops;
+atomic_t cm_loopbacks;
+atomic_t cm_nodes_created;
+atomic_t cm_nodes_destroyed;
+atomic_t cm_accel_dropped_pkts;
+atomic_t cm_resets_recvd;
+
+static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *);
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *,
+		struct nes_vnic *, struct nes_cm_info *);
+static int add_ref_cm_node(struct nes_cm_node *);
+static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *);
+
+
+/* External CM API Interface */
+/* instance of function pointers for client API */
+/* set address of this instance to cm_core->cm_ops at cm_core alloc */
+static struct nes_cm_ops nes_cm_api = {
+	mini_cm_accelerated,
+	mini_cm_listen,
+	mini_cm_del_listen,
+	mini_cm_connect,
+	mini_cm_close,
+	mini_cm_accept,
+	mini_cm_reject,
+	mini_cm_recv_pkt,
+	mini_cm_dealloc_core,
+	mini_cm_get,
+	mini_cm_set
+};
+
+struct nes_cm_core *g_cm_core;
+
+atomic_t cm_connects;
+atomic_t cm_accepts;
+atomic_t cm_disconnects;
+atomic_t cm_closes;
+atomic_t cm_connecteds;
+atomic_t cm_connect_reqs;
+atomic_t cm_rejects;
+
+
+/**
+ * create_event
+ */
+static struct nes_cm_event *create_event(struct nes_cm_node *cm_node,
+		enum nes_cm_event_type type)
+{
+	struct nes_cm_event *event;
+
+	if (!cm_node->cm_id)
+		return NULL;
+
+	/* allocate an empty event */
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+
+	if (!event)
+		return NULL;
+
+	event->type = type;
+	event->cm_node = cm_node;
+	event->cm_info.rem_addr = cm_node->rem_addr;
+	event->cm_info.loc_addr = cm_node->loc_addr;
+	event->cm_info.rem_port = cm_node->rem_port;
+	event->cm_info.loc_port = cm_node->loc_port;
+	event->cm_info.cm_id = cm_node->cm_id;
+
+	nes_debug(NES_DBG_CM, "Created event=%p, type=%u, dst_addr=%08x[%x],"
+			" src_addr=%08x[%x]\n",
+			event, type,
+			event->cm_info.loc_addr, event->cm_info.loc_port,
+			event->cm_info.rem_addr, event->cm_info.rem_port);
+
+	nes_cm_post_event(event);
+	return event;
+}
+
+
+/**
+ * send_mpa_request
+ */
+int send_mpa_request(struct nes_cm_node *cm_node)
+{
+	struct sk_buff *skb;
+	int ret;
+
+	skb = get_free_pkt(cm_node);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	/* send an MPA Request frame */
+	form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame,
+			cm_node->mpa_frame_size, SET_ACK);
+
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+	if (ret < 0) {
+		return ret;
+	}
+
+	return 0;
+}
+
+
+/**
+ * recv_mpa - process a received TCP pkt, we are expecting an
+ * IETF MPA frame
+ */
+static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 len)
+{
+	struct ietf_mpa_frame *mpa_frame;
+
+	/* assume req frame is in tcp data payload */
+	if (len < sizeof(struct ietf_mpa_frame)) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
+		return -1;
+	}
+
+	mpa_frame = (struct ietf_mpa_frame *)buffer;
+	cm_node->mpa_frame_size = ntohs(mpa_frame->priv_data_len);
+
+	if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer was not right"
+				" complete (%x + %x != %x)\n",
+				cm_node->mpa_frame_size, (u32)sizeof(struct ietf_mpa_frame), len);
+		return -1;
+	}
+
+	/* copy entire MPA frame to our cm_node's frame */
+	memcpy(cm_node->mpa_frame_buf, buffer + sizeof(struct ietf_mpa_frame),
+			cm_node->mpa_frame_size);
+
+	return 0;
+}
+
+
+/**
+ * handle_exception_pkt - process an exception packet.
+ * We have been in a TSA state, and we have now received SW
+ * TCP/IP traffic should be a FIN request or IP pkt with options
+ */
+static int handle_exception_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret = 0;
+	struct tcphdr *tcph = tcp_hdr(skb);
+
+	/* first check to see if this a FIN pkt */
+	if (tcph->fin) {
+		/* we need to ACK the FIN request */
+		send_ack(cm_node);
+
+		/* check which side we are (client/server) and set next state accordingly */
+		if (cm_node->tcp_cntxt.client)
+			cm_node->state = NES_CM_STATE_CLOSING;
+		else {
+			/* we are the server side */
+			cm_node->state = NES_CM_STATE_CLOSE_WAIT;
+			/* since this is a self contained CM we don't wait for */
+			/* an APP to close us, just send final FIN immediately */
+			ret = send_fin(cm_node, NULL);
+			cm_node->state = NES_CM_STATE_LAST_ACK;
+		}
+	} else {
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+
+/**
+ * form_cm_frame - get a free packet and build empty frame Use
+ * node info to build.
+ */
+struct sk_buff *form_cm_frame(struct sk_buff *skb, struct nes_cm_node *cm_node,
+		void *options, u32 optionsize, void *data, u32 datasize, u8 flags)
+{
+	struct tcphdr *tcph;
+	struct iphdr *iph;
+	struct ethhdr *ethh;
+	u8 *buf;
+	u16 packetsize = sizeof(*iph);
+
+	packetsize += sizeof(*tcph);
+	packetsize +=  optionsize + datasize;
+
+	memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph));
+
+	skb->len = 0;
+	buf = skb_put(skb, packetsize + ETH_HLEN);
+
+	ethh = (struct ethhdr *) buf;
+	buf += ETH_HLEN;
+
+	iph = (struct iphdr *)buf;
+	buf += sizeof(*iph);
+	tcph = (struct tcphdr *)buf;
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_set_transport_header(skb, ETH_HLEN+sizeof(*iph));
+	buf += sizeof(*tcph);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->protocol = htons(0x800);
+	skb->data_len = 0;
+	skb->mac_len = ETH_HLEN;
+
+	memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN);
+	memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN);
+	ethh->h_proto = htons(0x0800);
+
+	iph->version = IPVERSION;
+	iph->ihl = 5;		/* 5 * 4Byte words, IP headr len */
+	iph->tos = 0;
+	iph->tot_len = htons(packetsize);
+	iph->id = htons(++cm_node->tcp_cntxt.loc_id);
+
+	iph->frag_off = htons(0x4000);
+	iph->ttl = 0x40;
+	iph->protocol = 0x06;	/* IPPROTO_TCP */
+
+	iph->saddr = htonl(cm_node->loc_addr);
+	iph->daddr = htonl(cm_node->rem_addr);
+
+	tcph->source = htons(cm_node->loc_port);
+	tcph->dest = htons(cm_node->rem_port);
+	tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
+
+	if (flags & SET_ACK) {
+		cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
+		tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
+		tcph->ack = 1;
+	} else
+		tcph->ack_seq = 0;
+
+	if (flags & SET_SYN) {
+		cm_node->tcp_cntxt.loc_seq_num++;
+		tcph->syn = 1;
+	} else
+		cm_node->tcp_cntxt.loc_seq_num += datasize;	/* data (no headers) */
+
+	if (flags & SET_FIN)
+		tcph->fin = 1;
+
+	if (flags & SET_RST)
+		tcph->rst = 1;
+
+	tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2);
+	tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
+	tcph->urg_ptr = 0;
+	if (optionsize)
+		memcpy(buf, options, optionsize);
+	buf += optionsize;
+	if (datasize)
+		memcpy(buf, data, datasize);
+
+	skb_shinfo(skb)->nr_frags = 0;
+	cm_packets_created++;
+
+	return skb;
+}
+
+
+/**
+ * print_core - dump a cm core
+ */
+static void print_core(struct nes_cm_core *core)
+{
+	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+	nes_debug(NES_DBG_CM, "CM Core  -- (core = %p )\n", core);
+	if (!core)
+		return;
+	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+	nes_debug(NES_DBG_CM, "Session ID    : %u \n", atomic_read(&core->session_id));
+
+	nes_debug(NES_DBG_CM, "State         : %u \n",  core->state);
+
+	nes_debug(NES_DBG_CM, "Tx Free cnt   : %u \n", skb_queue_len(&core->tx_free_list));
+	nes_debug(NES_DBG_CM, "Listen Nodes  : %u \n", atomic_read(&core->listen_node_cnt));
+	nes_debug(NES_DBG_CM, "Active Nodes  : %u \n", atomic_read(&core->node_cnt));
+
+	nes_debug(NES_DBG_CM, "core          : %p \n", core);
+
+	nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");
+}
+
+
+/**
+ * schedule_nes_timer
+ * note - cm_node needs to be protected before calling this. Encase in:
+ *			rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node);
+ */
+int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
+		enum nes_timer_type type, int send_retrans,
+		int close_when_complete)
+{
+	unsigned long  flags;
+	struct nes_cm_core *cm_core;
+	struct nes_timer_entry *new_send;
+	int ret = 0;
+	u32 was_timer_set;
+
+	new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
+	if (!new_send)
+		return -1;
+	if (!cm_node)
+		return -EINVAL;
+
+	/* new_send->timetosend = currenttime */
+	new_send->retrycount = NES_DEFAULT_RETRYS;
+	new_send->retranscount = NES_DEFAULT_RETRANS;
+	new_send->skb = skb;
+	new_send->timetosend = jiffies;
+	new_send->type = type;
+	new_send->netdev = cm_node->netdev;
+	new_send->send_retrans = send_retrans;
+	new_send->close_when_complete = close_when_complete;
+
+	if (type == NES_TIMER_TYPE_CLOSE) {
+		new_send->timetosend += (HZ/2);	/* TODO: decide on the correct value here */
+		spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+		list_add_tail(&new_send->list, &cm_node->recv_list);
+		spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+	}
+
+	if (type == NES_TIMER_TYPE_SEND) {
+		new_send->seq_num = htonl(tcp_hdr(skb)->seq);
+		atomic_inc(&new_send->skb->users);
+
+		ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev);
+		if (ret != NETDEV_TX_OK) {
+			nes_debug(NES_DBG_CM, "Error sending packet %p (jiffies = %lu)\n",
+					new_send, jiffies);
+			atomic_dec(&new_send->skb->users);
+			new_send->timetosend = jiffies;
+		} else {
+			cm_packets_sent++;
+			if (!send_retrans) {
+				if (close_when_complete)
+					rem_ref_cm_node(cm_node->cm_core, cm_node);
+				dev_kfree_skb_any(new_send->skb);
+				kfree(new_send);
+				return ret;
+			}
+			new_send->timetosend = jiffies + NES_RETRY_TIMEOUT;
+		}
+		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+		list_add_tail(&new_send->list, &cm_node->retrans_list);
+		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+	}
+	if (type == NES_TIMER_TYPE_RECV) {
+		new_send->seq_num = htonl(tcp_hdr(skb)->seq);
+		new_send->timetosend = jiffies;
+		spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+		list_add_tail(&new_send->list, &cm_node->recv_list);
+		spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+	}
+	cm_core = cm_node->cm_core;
+
+	was_timer_set = timer_pending(&cm_core->tcp_timer);
+
+	if (!was_timer_set) {
+		cm_core->tcp_timer.expires = new_send->timetosend;
+		add_timer(&cm_core->tcp_timer);
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_cm_timer_tick
+ */
+void nes_cm_timer_tick(unsigned long pass)
+{
+	unsigned long flags, qplockflags;
+	unsigned long nexttimeout = jiffies + NES_LONG_TIME;
+	struct iw_cm_id *cm_id;
+	struct nes_cm_node *cm_node;
+	struct nes_timer_entry *send_entry, *recv_entry;
+	struct list_head *list_core, *list_core_temp;
+	struct list_head *list_node, *list_node_temp;
+	struct nes_cm_core *cm_core = g_cm_core;
+	struct nes_qp *nesqp;
+	struct sk_buff *skb;
+	u32 settimer = 0;
+	int ret = NETDEV_TX_OK;
+	int    node_done;
+
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+	list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) {
+		cm_node = container_of(list_node, struct nes_cm_node, list);
+		add_ref_cm_node(cm_node);
+		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+		spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+		list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) {
+			recv_entry = container_of(list_core, struct nes_timer_entry, list);
+			if ((time_after(recv_entry->timetosend, jiffies)) &&
+					(recv_entry->type == NES_TIMER_TYPE_CLOSE)) {
+				if (nexttimeout > recv_entry->timetosend || !settimer) {
+					nexttimeout = recv_entry->timetosend;
+					settimer = 1;
+				}
+				continue;
+			}
+			list_del(&recv_entry->list);
+			cm_id = cm_node->cm_id;
+			spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+			if (recv_entry->type == NES_TIMER_TYPE_CLOSE) {
+				nesqp = (struct nes_qp *)recv_entry->skb;
+				spin_lock_irqsave(&nesqp->lock, qplockflags);
+				if (nesqp->cm_id) {
+					nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, refcount = %d: "
+							"****** HIT A NES_TIMER_TYPE_CLOSE"
+							" with something to do!!! ******\n",
+							nesqp->hwqp.qp_id, cm_id,
+							atomic_read(&nesqp->refcount));
+					nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+					nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+					nesqp->ibqp_state = IB_QPS_ERR;
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_cm_disconn(nesqp);
+				} else {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, refcount = %d:"
+							" ****** HIT A NES_TIMER_TYPE_CLOSE"
+							" with nothing to do!!! ******\n",
+							nesqp->hwqp.qp_id, cm_id,
+							atomic_read(&nesqp->refcount));
+					nes_rem_ref(&nesqp->ibqp);
+				}
+				if (cm_id)
+					cm_id->rem_ref(cm_id);
+			}
+			kfree(recv_entry);
+			spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+		}
+		spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+
+		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+		node_done = 0;
+		list_for_each_safe(list_core, list_node_temp, &cm_node->retrans_list) {
+			if (node_done) {
+				break;
+			}
+			send_entry = container_of(list_core, struct nes_timer_entry, list);
+			if (time_after(send_entry->timetosend, jiffies)) {
+				if (cm_node->state != NES_CM_STATE_TSA) {
+					if ((nexttimeout > send_entry->timetosend) || !settimer) {
+						nexttimeout = send_entry->timetosend;
+						settimer = 1;
+					}
+					node_done = 1;
+					continue;
+				} else {
+					list_del(&send_entry->list);
+					skb = send_entry->skb;
+					spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+					dev_kfree_skb_any(skb);
+					kfree(send_entry);
+					spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+					continue;
+				}
+			}
+			if (send_entry->type == NES_TIMER_NODE_CLEANUP) {
+				list_del(&send_entry->list);
+				spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+				kfree(send_entry);
+				spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+				continue;
+			}
+			if ((send_entry->seq_num < cm_node->tcp_cntxt.rem_ack_num) ||
+					(cm_node->state == NES_CM_STATE_TSA) ||
+					(cm_node->state == NES_CM_STATE_CLOSED)) {
+				skb = send_entry->skb;
+				list_del(&send_entry->list);
+				spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+				kfree(send_entry);
+				dev_kfree_skb_any(skb);
+				spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+				continue;
+			}
+
+			if (!send_entry->retranscount || !send_entry->retrycount) {
+				cm_packets_dropped++;
+				skb = send_entry->skb;
+				list_del(&send_entry->list);
+				spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+				dev_kfree_skb_any(skb);
+				kfree(send_entry);
+				if (cm_node->state == NES_CM_STATE_SYN_RCVD) {
+					/* this node never even generated an indication up to the cm */
+					rem_ref_cm_node(cm_core, cm_node);
+				} else {
+					cm_node->state = NES_CM_STATE_CLOSED;
+					create_event(cm_node, NES_CM_EVENT_ABORTED);
+				}
+				spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+				continue;
+			}
+			/* this seems like the correct place, but leave send entry unprotected */
+			// spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+			atomic_inc(&send_entry->skb->users);
+			cm_packets_retrans++;
+			nes_debug(NES_DBG_CM, "Retransmitting send_entry %p for node %p,"
+					" jiffies = %lu, time to send =  %lu, retranscount = %u, "
+					"send_entry->seq_num = 0x%08X, cm_node->tcp_cntxt.rem_ack_num = 0x%08X\n",
+					send_entry, cm_node, jiffies, send_entry->timetosend, send_entry->retranscount,
+					send_entry->seq_num, cm_node->tcp_cntxt.rem_ack_num);
+
+			spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+			ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev);
+			if (ret != NETDEV_TX_OK) {
+				cm_packets_bounced++;
+				atomic_dec(&send_entry->skb->users);
+				send_entry->retrycount--;
+				nexttimeout = jiffies + NES_SHORT_TIME;
+				settimer = 1;
+				node_done = 1;
+				spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+				continue;
+			} else {
+				cm_packets_sent++;
+			}
+			spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+			list_del(&send_entry->list);
+			nes_debug(NES_DBG_CM, "Packet Sent: retrans count = %u, retry count = %u.\n",
+					send_entry->retranscount, send_entry->retrycount);
+			if (send_entry->send_retrans) {
+				send_entry->retranscount--;
+				send_entry->timetosend = jiffies + NES_RETRY_TIMEOUT;
+				if (nexttimeout > send_entry->timetosend || !settimer) {
+					nexttimeout = send_entry->timetosend;
+					settimer = 1;
+				}
+				list_add(&send_entry->list, &cm_node->retrans_list);
+				continue;
+			} else {
+				int close_when_complete;
+				skb = send_entry->skb;
+				close_when_complete = send_entry->close_when_complete;
+				spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+				if (close_when_complete) {
+					BUG_ON(atomic_read(&cm_node->ref_count) == 1);
+					rem_ref_cm_node(cm_core, cm_node);
+				}
+				dev_kfree_skb_any(skb);
+				kfree(send_entry);
+				spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+				continue;
+			}
+		}
+		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+
+		rem_ref_cm_node(cm_core, cm_node);
+
+		spin_lock_irqsave(&cm_core->ht_lock, flags);
+		if (ret != NETDEV_TX_OK)
+			break;
+	}
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	if (settimer) {
+		if (!timer_pending(&cm_core->tcp_timer)) {
+			cm_core->tcp_timer.expires  = nexttimeout;
+			add_timer(&cm_core->tcp_timer);
+		}
+	}
+}
+
+
+/**
+ * send_syn
+ */
+int send_syn(struct nes_cm_node *cm_node, u32 sendack)
+{
+	int ret;
+	int flags = SET_SYN;
+	struct sk_buff *skb;
+	char optionsbuffer[sizeof(struct option_mss) +
+			sizeof(struct option_windowscale) +
+			sizeof(struct option_base) + 1];
+
+	int optionssize = 0;
+	/* Sending MSS option */
+	union all_known_options *options;
+
+	if (!cm_node)
+		return -EINVAL;
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_mss.optionnum = OPTION_NUMBER_MSS;
+	options->as_mss.length = sizeof(struct option_mss);
+	options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
+	optionssize += sizeof(struct option_mss);
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
+	options->as_windowscale.length = sizeof(struct option_windowscale);
+	options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
+	optionssize += sizeof(struct option_windowscale);
+
+	if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)
+			) {
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_base.optionnum = OPTION_NUMBER_WRITE0;
+		options->as_base.length = sizeof(struct option_base);
+		optionssize += sizeof(struct option_base);
+		/* we need the size to be a multiple of 4 */
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_end = 1;
+		optionssize += 1;
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_end = 1;
+		optionssize += 1;
+	}
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_end = OPTION_NUMBER_END;
+	optionssize += 1;
+
+	skb = get_free_pkt(cm_node);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	if (sendack)
+		flags |= SET_ACK;
+
+	form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+	return ret;
+}
+
+
+/**
+ * send_reset
+ */
+int send_reset(struct nes_cm_node *cm_node)
+{
+	int ret;
+	struct sk_buff *skb = get_free_pkt(cm_node);
+	int flags = SET_RST | SET_ACK;
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	add_ref_cm_node(cm_node);
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);
+
+	return ret;
+}
+
+
+/**
+ * send_ack
+ */
+int send_ack(struct nes_cm_node *cm_node)
+{
+	int ret;
+	struct sk_buff *skb = get_free_pkt(cm_node);
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0);
+
+	return ret;
+}
+
+
+/**
+ * send_fin
+ */
+int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret;
+
+	/* if we didn't get a frame get one */
+	if (!skb)
+		skb = get_free_pkt(cm_node);
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+	return ret;
+}
+
+
+/**
+ * get_free_pkt
+ */
+struct sk_buff *get_free_pkt(struct nes_cm_node *cm_node)
+{
+	struct sk_buff *skb, *new_skb;
+
+	/* check to see if we need to repopulate the free tx pkt queue */
+	if (skb_queue_len(&cm_node->cm_core->tx_free_list) < NES_CM_FREE_PKT_LO_WATERMARK) {
+		while (skb_queue_len(&cm_node->cm_core->tx_free_list) <
+				cm_node->cm_core->free_tx_pkt_max) {
+			/* replace the frame we took, we won't get it back */
+			new_skb = dev_alloc_skb(cm_node->cm_core->mtu);
+			BUG_ON(!new_skb);
+			/* add a replacement frame to the free tx list head */
+			skb_queue_head(&cm_node->cm_core->tx_free_list, new_skb);
+		}
+	}
+
+	skb = skb_dequeue(&cm_node->cm_core->tx_free_list);
+
+	return skb;
+}
+
+
+/**
+ * make_hashkey - generate hash key from node tuple
+ */
+static inline int make_hashkey(u16 loc_port, nes_addr_t loc_addr, u16 rem_port,
+		nes_addr_t rem_addr)
+{
+	u32 hashkey = 0;
+
+	hashkey = loc_addr + rem_addr + loc_port + rem_port;
+	hashkey = (hashkey % NES_CM_HASHTABLE_SIZE);
+
+	return hashkey;
+}
+
+
+/**
+ * find_node - find a cm node that matches the reference cm node
+ */
+static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
+		u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr)
+{
+	unsigned long flags;
+	u32 hashkey;
+	struct list_head *list_pos;
+	struct list_head *hte;
+	struct nes_cm_node *cm_node;
+
+	/* make a hash index key for this packet */
+	hashkey = make_hashkey(loc_port, loc_addr, rem_port, rem_addr);
+
+	/* get a handle on the hte */
+	hte = &cm_core->connected_nodes;
+
+	nes_debug(NES_DBG_CM, "Searching for an owner node:%x:%x from core %p->%p\n",
+			loc_addr, loc_port, cm_core, hte);
+
+	/* walk list and find cm_node associated with this session ID */
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+	list_for_each(list_pos, hte) {
+		cm_node = container_of(list_pos, struct nes_cm_node, list);
+		/* compare quad, return node handle if a match */
+		nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n",
+				cm_node->loc_addr, cm_node->loc_port,
+				loc_addr, loc_port,
+				cm_node->rem_addr, cm_node->rem_port,
+				rem_addr, rem_port);
+		if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) &&
+				(cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) {
+			add_ref_cm_node(cm_node);
+			spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+			return cm_node;
+		}
+	}
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	/* no owner node */
+	return NULL;
+}
+
+
+/**
+ * find_listener - find a cm node listening on this addr-port pair
+ */
+static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,
+		nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state)
+{
+	unsigned long flags;
+	struct list_head *listen_list;
+	struct nes_cm_listener *listen_node;
+
+	/* walk list and find cm_node associated with this session ID */
+	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+	list_for_each(listen_list, &cm_core->listen_list.list) {
+		listen_node = container_of(listen_list, struct nes_cm_listener, list);
+		/* compare node pair, return node handle if a match */
+		if (((listen_node->loc_addr == dst_addr) ||
+				listen_node->loc_addr == 0x00000000) &&
+				(listen_node->loc_port == dst_port) &&
+				(listener_state & listen_node->listener_state)) {
+			atomic_inc(&listen_node->ref_count);
+			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+			return listen_node;
+		}
+	}
+	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+	nes_debug(NES_DBG_CM, "Unable to find listener- %x:%x\n",
+			dst_addr, dst_port);
+
+	/* no listener */
+	return NULL;
+}
+
+
+/**
+ * add_hte_node - add a cm node to the hash table
+ */
+static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	unsigned long flags;
+	u32 hashkey;
+	struct list_head *hte;
+
+	if (!cm_node || !cm_core)
+		return -EINVAL;
+
+	nes_debug(NES_DBG_CM, "Adding Node to Active Connection HT\n");
+
+	/* first, make an index into our hash table */
+	hashkey = make_hashkey(cm_node->loc_port, cm_node->loc_addr,
+			cm_node->rem_port, cm_node->rem_addr);
+	cm_node->hashkey = hashkey;
+
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+	/* get a handle on the hash table element (list head for this slot) */
+	hte = &cm_core->connected_nodes;
+	list_add_tail(&cm_node->list, hte);
+	atomic_inc(&cm_core->ht_node_cnt);
+
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	return 0;
+}
+
+
+/**
+ * mini_cm_dec_refcnt_listen
+ */
+static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
+		struct nes_cm_listener *listener, int free_hanging_nodes)
+{
+	int ret = 1;
+	unsigned long flags;
+	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+	if (!atomic_dec_return(&listener->ref_count)) {
+		list_del(&listener->list);
+
+		/* decrement our listen node count */
+		atomic_dec(&cm_core->listen_node_cnt);
+
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+		if (listener->nesvnic) {
+			nes_manage_apbvt(listener->nesvnic, listener->loc_port,
+					PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL);
+		}
+
+		nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener);
+
+		kfree(listener);
+		ret = 0;
+		cm_listens_destroyed++;
+	} else {
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+	}
+	if (listener) {
+		if (atomic_read(&listener->pend_accepts_cnt) > 0)
+			nes_debug(NES_DBG_CM, "destroying listener (%p)"
+					" with non-zero pending accepts=%u\n",
+					listener, atomic_read(&listener->pend_accepts_cnt));
+	}
+
+	return ret;
+}
+
+
+/**
+ * mini_cm_del_listen
+ */
+static int mini_cm_del_listen(struct nes_cm_core *cm_core,
+		struct nes_cm_listener *listener)
+{
+	listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE;
+	listener->cm_id = NULL; /* going to be destroyed pretty soon */
+	return mini_cm_dec_refcnt_listen(cm_core, listener, 1);
+}
+
+
+/**
+ * mini_cm_accelerated
+ */
+static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
+		struct nes_cm_node *cm_node)
+{
+	u32 was_timer_set;
+	cm_node->accelerated = 1;
+
+	if (cm_node->accept_pend) {
+		BUG_ON(!cm_node->listener);
+		atomic_dec(&cm_node->listener->pend_accepts_cnt);
+		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+	}
+
+	was_timer_set = timer_pending(&cm_core->tcp_timer);
+	if (!was_timer_set) {
+		cm_core->tcp_timer.expires = jiffies + NES_SHORT_TIME;
+		add_timer(&cm_core->tcp_timer);
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_addr_send_arp
+ */
+static void nes_addr_send_arp(u32 dst_ip)
+{
+	struct rtable *rt;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof fl);
+	fl.nl_u.ip4_u.daddr = htonl(dst_ip);
+	if (ip_route_output_key(&init_net, &rt, &fl)) {
+		printk("%s: ip_route_output_key failed for 0x%08X\n",
+				__FUNCTION__, dst_ip);
+		return;
+	}
+
+	neigh_event_send(rt->u.dst.neighbour, NULL);
+	ip_rt_put(rt);
+}
+
+
+/**
+ * make_cm_node - create a new instance of a cm node
+ */
+static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
+		struct nes_vnic *nesvnic, struct nes_cm_info *cm_info,
+		struct nes_cm_listener *listener)
+{
+	struct nes_cm_node *cm_node;
+	struct timespec ts;
+	int arpindex = 0;
+	struct nes_device *nesdev;
+	struct nes_adapter *nesadapter;
+
+	/* create an hte and cm_node for this instance */
+	cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
+	if (!cm_node)
+		return NULL;
+
+	/* set our node specific transport info */
+	cm_node->loc_addr = cm_info->loc_addr;
+	cm_node->rem_addr = cm_info->rem_addr;
+	cm_node->loc_port = cm_info->loc_port;
+	cm_node->rem_port = cm_info->rem_port;
+	cm_node->send_write0 = send_first;
+	nes_debug(NES_DBG_CM, "Make node addresses : loc = %x:%x, rem = %x:%x\n",
+			cm_node->loc_addr, cm_node->loc_port, cm_node->rem_addr, cm_node->rem_port);
+	cm_node->listener = listener;
+	cm_node->netdev = nesvnic->netdev;
+	cm_node->cm_id = cm_info->cm_id;
+	memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN);
+
+	nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n",
+			cm_node->listener, cm_node->cm_id);
+
+	INIT_LIST_HEAD(&cm_node->retrans_list);
+	spin_lock_init(&cm_node->retrans_list_lock);
+	INIT_LIST_HEAD(&cm_node->recv_list);
+	spin_lock_init(&cm_node->recv_list_lock);
+
+	cm_node->loopbackpartner = NULL;
+	atomic_set(&cm_node->ref_count, 1);
+	/* associate our parent CM core */
+	cm_node->cm_core = cm_core;
+	cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID;
+	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+	cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >>
+			NES_CM_DEFAULT_RCV_WND_SCALE;
+	ts = current_kernel_time();
+	cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec);
+	cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) -
+			sizeof(struct tcphdr) - ETH_HLEN;
+	cm_node->tcp_cntxt.rcv_nxt = 0;
+	/* get a unique session ID , add thread_id to an upcounter to handle race */
+	atomic_inc(&cm_core->node_cnt);
+	atomic_inc(&cm_core->session_id);
+	cm_node->session_id = (u32)(atomic_read(&cm_core->session_id) + current->tgid);
+	cm_node->conn_type = cm_info->conn_type;
+	cm_node->apbvt_set = 0;
+	cm_node->accept_pend = 0;
+
+	cm_node->nesvnic = nesvnic;
+	/* get some device handles, for arp lookup */
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+
+	cm_node->loopbackpartner = NULL;
+	/* get the mac addr for the remote node */
+	arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE);
+	if (arpindex < 0) {
+		kfree(cm_node);
+		nes_addr_send_arp(cm_info->rem_addr);
+		return NULL;
+	}
+
+	/* copy the mac addr to node context */
+	memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN);
+	nes_debug(NES_DBG_CM, "Remote mac addr from arp table:%02x,"
+			" %02x, %02x, %02x, %02x, %02x\n",
+			cm_node->rem_mac[0], cm_node->rem_mac[1],
+			cm_node->rem_mac[2], cm_node->rem_mac[3],
+			cm_node->rem_mac[4], cm_node->rem_mac[5]);
+
+	add_hte_node(cm_core, cm_node);
+	atomic_inc(&cm_nodes_created);
+
+	return cm_node;
+}
+
+
+/**
+ * add_ref_cm_node - destroy an instance of a cm node
+ */
+static int add_ref_cm_node(struct nes_cm_node *cm_node)
+{
+	atomic_inc(&cm_node->ref_count);
+	return 0;
+}
+
+
+/**
+ * rem_ref_cm_node - destroy an instance of a cm node
+ */
+static int rem_ref_cm_node(struct nes_cm_core *cm_core,
+		struct nes_cm_node *cm_node)
+{
+	unsigned long flags, qplockflags;
+	struct nes_timer_entry *send_entry;
+	struct nes_timer_entry *recv_entry;
+	struct iw_cm_id *cm_id;
+	struct list_head *list_core, *list_node_temp;
+	struct nes_qp *nesqp;
+
+	if (!cm_node)
+		return -EINVAL;
+
+	spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
+	if (atomic_dec_return(&cm_node->ref_count)) {
+		spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+		return 0;
+	}
+	list_del(&cm_node->list);
+	atomic_dec(&cm_core->ht_node_cnt);
+	spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+
+	/* if the node is destroyed before connection was accelerated */
+	if (!cm_node->accelerated && cm_node->accept_pend) {
+		BUG_ON(!cm_node->listener);
+		atomic_dec(&cm_node->listener->pend_accepts_cnt);
+		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+	}
+
+	spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+	list_for_each_safe(list_core, list_node_temp, &cm_node->retrans_list) {
+		send_entry = container_of(list_core, struct nes_timer_entry, list);
+		list_del(&send_entry->list);
+		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+		dev_kfree_skb_any(send_entry->skb);
+		kfree(send_entry);
+		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+		continue;
+	}
+	spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+
+	spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+	list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) {
+		recv_entry = container_of(list_core, struct nes_timer_entry, list);
+		list_del(&recv_entry->list);
+		cm_id = cm_node->cm_id;
+		spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+		if (recv_entry->type == NES_TIMER_TYPE_CLOSE) {
+			nesqp = (struct nes_qp *)recv_entry->skb;
+			spin_lock_irqsave(&nesqp->lock, qplockflags);
+			if (nesqp->cm_id) {
+				nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: ****** HIT A NES_TIMER_TYPE_CLOSE"
+						" with something to do!!! ******\n",
+						nesqp->hwqp.qp_id, cm_id);
+				nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+				nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+				nesqp->ibqp_state = IB_QPS_ERR;
+				spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				nes_cm_disconn(nesqp);
+			} else {
+				spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: ****** HIT A NES_TIMER_TYPE_CLOSE"
+						" with nothing to do!!! ******\n",
+						nesqp->hwqp.qp_id, cm_id);
+				nes_rem_ref(&nesqp->ibqp);
+			}
+			cm_id->rem_ref(cm_id);
+		} else if (recv_entry->type == NES_TIMER_TYPE_RECV) {
+			dev_kfree_skb_any(recv_entry->skb);
+		}
+		kfree(recv_entry);
+		spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+	}
+	spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+
+	if (cm_node->listener) {
+		mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
+	} else {
+		if (cm_node->apbvt_set && cm_node->nesvnic) {
+			nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port,
+					PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),
+					NES_MANAGE_APBVT_DEL);
+		}
+	}
+
+	kfree(cm_node);
+	atomic_dec(&cm_core->node_cnt);
+	atomic_inc(&cm_nodes_destroyed);
+
+	return 0;
+}
+
+
+/**
+ * process_options
+ */
+static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, u32 optionsize, u32 syn_packet)
+{
+	u32 tmp;
+	u32 offset = 0;
+	union all_known_options *all_options;
+	char got_mss_option = 0;
+
+	while (offset < optionsize) {
+		all_options = (union all_known_options *)(optionsloc + offset);
+		switch (all_options->as_base.optionnum) {
+			case OPTION_NUMBER_END:
+				offset = optionsize;
+				break;
+			case OPTION_NUMBER_NONE:
+				offset += 1;
+				continue;
+			case OPTION_NUMBER_MSS:
+				nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d Size: %d\n",
+						__FUNCTION__,
+						all_options->as_mss.length, offset, optionsize);
+				got_mss_option = 1;
+				if (all_options->as_mss.length != 4) {
+					return 1;
+				} else {
+					tmp = ntohs(all_options->as_mss.mss);
+					if (tmp > 0 && tmp < cm_node->tcp_cntxt.mss)
+						cm_node->tcp_cntxt.mss = tmp;
+				}
+				break;
+			case OPTION_NUMBER_WINDOW_SCALE:
+				cm_node->tcp_cntxt.snd_wscale = all_options->as_windowscale.shiftcount;
+				break;
+			case OPTION_NUMBER_WRITE0:
+				cm_node->send_write0 = 1;
+				break;
+			default:
+				nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n",
+						all_options->as_base.optionnum);
+				break;
+		}
+		offset += all_options->as_base.length;
+	}
+	if ((!got_mss_option) && (syn_packet))
+		cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
+	return 0;
+}
+
+
+/**
+ * process_packet
+ */
+int process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
+		struct nes_cm_core *cm_core)
+{
+	int optionsize;
+	int datasize;
+	int ret = 0;
+	struct tcphdr *tcph = tcp_hdr(skb);
+	u32 inc_sequence;
+	if (cm_node->state == NES_CM_STATE_SYN_SENT && tcph->syn) {
+		inc_sequence = ntohl(tcph->seq);
+		cm_node->tcp_cntxt.rcv_nxt = inc_sequence;
+	}
+
+	if ((!tcph) || (cm_node->state == NES_CM_STATE_TSA)) {
+		BUG_ON(!tcph);
+		atomic_inc(&cm_accel_dropped_pkts);
+		return -1;
+	}
+
+	if (tcph->rst) {
+		atomic_inc(&cm_resets_recvd);
+		nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u. refcnt=%d\n",
+				cm_node, cm_node->state, atomic_read(&cm_node->ref_count));
+		switch (cm_node->state) {
+			case NES_CM_STATE_LISTENING:
+				rem_ref_cm_node(cm_core, cm_node);
+				break;
+			case NES_CM_STATE_TSA:
+			case NES_CM_STATE_CLOSED:
+				break;
+			case NES_CM_STATE_SYN_RCVD:
+					nes_debug(NES_DBG_CM, "Received a reset for local 0x%08X:%04X,"
+							" remote 0x%08X:%04X, node state = %u\n",
+							cm_node->loc_addr, cm_node->loc_port,
+							cm_node->rem_addr, cm_node->rem_port,
+							cm_node->state);
+				rem_ref_cm_node(cm_core, cm_node);
+				break;
+			case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+			case NES_CM_STATE_ESTABLISHED:
+			case NES_CM_STATE_MPAREQ_SENT:
+			default:
+					nes_debug(NES_DBG_CM, "Received a reset for local 0x%08X:%04X,"
+							" remote 0x%08X:%04X, node state = %u refcnt=%d\n",
+							cm_node->loc_addr, cm_node->loc_port,
+							cm_node->rem_addr, cm_node->rem_port,
+							cm_node->state, atomic_read(&cm_node->ref_count));
+				// create event
+				cm_node->state = NES_CM_STATE_CLOSED;
+
+				create_event(cm_node, NES_CM_EVENT_ABORTED);
+				break;
+
+		}
+		return -1;
+	}
+
+	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+
+	skb_pull(skb, ip_hdr(skb)->ihl << 2);
+	skb_pull(skb, tcph->doff << 2);
+
+	datasize = skb->len;
+	inc_sequence = ntohl(tcph->seq);
+	nes_debug(NES_DBG_CM, "datasize = %u, sequence = 0x%08X, ack_seq = 0x%08X,"
+			" rcv_nxt = 0x%08X Flags: %s %s.\n",
+			datasize, inc_sequence, ntohl(tcph->ack_seq),
+			cm_node->tcp_cntxt.rcv_nxt, (tcph->syn ? "SYN":""),
+			(tcph->ack ? "ACK":""));
+
+	if (!tcph->syn && (inc_sequence != cm_node->tcp_cntxt.rcv_nxt)
+		) {
+		nes_debug(NES_DBG_CM, "dropping packet, datasize = %u, sequence = 0x%08X,"
+				" ack_seq = 0x%08X, rcv_nxt = 0x%08X Flags: %s.\n",
+				datasize, inc_sequence, ntohl(tcph->ack_seq),
+				cm_node->tcp_cntxt.rcv_nxt, (tcph->ack ? "ACK":""));
+		if (cm_node->state == NES_CM_STATE_LISTENING) {
+			rem_ref_cm_node(cm_core, cm_node);
+		}
+		return -1;
+	}
+
+		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+
+
+	if (optionsize) {
+		u8 *optionsloc = (u8 *)&tcph[1];
+		if (process_options(cm_node, optionsloc, optionsize, (u32)tcph->syn)) {
+			nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", __FUNCTION__, cm_node);
+			send_reset(cm_node);
+			if (cm_node->state != NES_CM_STATE_SYN_SENT)
+			rem_ref_cm_node(cm_core, cm_node);
+			return 0;
+		}
+	} else if (tcph->syn)
+		cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
+
+	cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
+			cm_node->tcp_cntxt.snd_wscale;
+
+	if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) {
+		cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
+	}
+
+	if (tcph->ack) {
+		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+		switch (cm_node->state) {
+			case NES_CM_STATE_SYN_RCVD:
+			case NES_CM_STATE_SYN_SENT:
+				/* read and stash current sequence number */
+				if (cm_node->tcp_cntxt.rem_ack_num != cm_node->tcp_cntxt.loc_seq_num) {
+					nes_debug(NES_DBG_CM, "ERROR - cm_node->tcp_cntxt.rem_ack_num !="
+							" cm_node->tcp_cntxt.loc_seq_num\n");
+					send_reset(cm_node);
+					return 0;
+				}
+				if (cm_node->state == NES_CM_STATE_SYN_SENT)
+					cm_node->state = NES_CM_STATE_ONE_SIDE_ESTABLISHED;
+				else {
+						cm_node->state = NES_CM_STATE_ESTABLISHED;
+				}
+				break;
+			case NES_CM_STATE_LAST_ACK:
+				cm_node->state = NES_CM_STATE_CLOSED;
+				break;
+			case NES_CM_STATE_FIN_WAIT1:
+				cm_node->state = NES_CM_STATE_FIN_WAIT2;
+				break;
+			case NES_CM_STATE_CLOSING:
+				cm_node->state = NES_CM_STATE_TIME_WAIT;
+				/* need to schedule this to happen in 2MSL timeouts */
+				cm_node->state = NES_CM_STATE_CLOSED;
+				break;
+			case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+			case NES_CM_STATE_ESTABLISHED:
+			case NES_CM_STATE_MPAREQ_SENT:
+			case NES_CM_STATE_CLOSE_WAIT:
+			case NES_CM_STATE_TIME_WAIT:
+			case NES_CM_STATE_CLOSED:
+				break;
+			case NES_CM_STATE_LISTENING:
+				nes_debug(NES_DBG_CM, "Received an ACK on a listening port (SYN %d)\n", tcph->syn);
+				cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+				send_reset(cm_node);
+				/* send_reset bumps refcount, this should have been a new node */
+				rem_ref_cm_node(cm_core, cm_node);
+				return -1;
+				break;
+			case NES_CM_STATE_TSA:
+				nes_debug(NES_DBG_CM, "Received a packet with the ack bit set while in TSA state\n");
+				break;
+			case NES_CM_STATE_UNKNOWN:
+			case NES_CM_STATE_INITED:
+			case NES_CM_STATE_ACCEPTING:
+			case NES_CM_STATE_FIN_WAIT2:
+			default:
+				nes_debug(NES_DBG_CM, "Received ack from unknown state: %x\n",
+						cm_node->state);
+				send_reset(cm_node);
+				break;
+		}
+	}
+
+	if (tcph->syn) {
+		if (cm_node->state == NES_CM_STATE_LISTENING) {
+			/* do not exceed backlog */
+			atomic_inc(&cm_node->listener->pend_accepts_cnt);
+			if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
+					cm_node->listener->backlog) {
+				nes_debug(NES_DBG_CM, "drop syn due to backlog pressure \n");
+				cm_backlog_drops++;
+				atomic_dec(&cm_node->listener->pend_accepts_cnt);
+				rem_ref_cm_node(cm_core, cm_node);
+				return 0;
+			}
+			cm_node->accept_pend = 1;
+
+		}
+		if (datasize == 0)
+			cm_node->tcp_cntxt.rcv_nxt ++;
+
+		if (cm_node->state == NES_CM_STATE_LISTENING) {
+			cm_node->state = NES_CM_STATE_SYN_RCVD;
+			send_syn(cm_node, 1);
+		}
+		if (cm_node->state == NES_CM_STATE_ONE_SIDE_ESTABLISHED) {
+			cm_node->state = NES_CM_STATE_ESTABLISHED;
+			/* send final handshake ACK */
+			ret = send_ack(cm_node);
+			if (ret < 0)
+				return ret;
+
+				cm_node->state = NES_CM_STATE_MPAREQ_SENT;
+				ret = send_mpa_request(cm_node);
+				if (ret < 0)
+					return ret;
+		}
+	}
+
+	if (tcph->fin) {
+		cm_node->tcp_cntxt.rcv_nxt++;
+		switch (cm_node->state) {
+			case NES_CM_STATE_SYN_RCVD:
+			case NES_CM_STATE_SYN_SENT:
+			case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+			case NES_CM_STATE_ESTABLISHED:
+			case NES_CM_STATE_ACCEPTING:
+			case NES_CM_STATE_MPAREQ_SENT:
+				cm_node->state = NES_CM_STATE_CLOSE_WAIT;
+				cm_node->state = NES_CM_STATE_LAST_ACK;
+				ret = send_fin(cm_node, NULL);
+				break;
+			case NES_CM_STATE_FIN_WAIT1:
+				cm_node->state = NES_CM_STATE_CLOSING;
+				ret = send_ack(cm_node);
+				break;
+			case NES_CM_STATE_FIN_WAIT2:
+				cm_node->state = NES_CM_STATE_TIME_WAIT;
+				cm_node->tcp_cntxt.loc_seq_num ++;
+				ret = send_ack(cm_node);
+				/* need to schedule this to happen in 2MSL timeouts */
+				cm_node->state = NES_CM_STATE_CLOSED;
+				break;
+			case NES_CM_STATE_CLOSE_WAIT:
+			case NES_CM_STATE_LAST_ACK:
+			case NES_CM_STATE_CLOSING:
+			case NES_CM_STATE_TSA:
+			default:
+				nes_debug(NES_DBG_CM, "Received a fin while in %x state\n",
+						cm_node->state);
+				ret = -EINVAL;
+				break;
+		}
+	}
+
+	if (datasize) {
+		u8 *dataloc = skb->data;
+		/* figure out what state we are in and handle transition to next state */
+		switch (cm_node->state) {
+			case NES_CM_STATE_LISTENING:
+			case NES_CM_STATE_SYN_RCVD:
+			case NES_CM_STATE_SYN_SENT:
+			case NES_CM_STATE_FIN_WAIT1:
+			case NES_CM_STATE_FIN_WAIT2:
+			case NES_CM_STATE_CLOSE_WAIT:
+			case NES_CM_STATE_LAST_ACK:
+			case NES_CM_STATE_CLOSING:
+				break;
+			case  NES_CM_STATE_MPAREQ_SENT:
+				/* recv the mpa res frame, ret=frame len (incl priv data) */
+				ret = parse_mpa(cm_node, dataloc, datasize);
+				if (ret < 0)
+					break;
+				/* set the req frame payload len in skb */
+				/* we are done handling this state, set node to a TSA state */
+				cm_node->state = NES_CM_STATE_TSA;
+				send_ack(cm_node);
+				create_event(cm_node, NES_CM_EVENT_CONNECTED);
+				break;
+
+			case  NES_CM_STATE_ESTABLISHED:
+				/* we are expecting an MPA req frame */
+				ret = parse_mpa(cm_node, dataloc, datasize);
+				if (ret < 0) {
+					break;
+				}
+				cm_node->state = NES_CM_STATE_TSA;
+				send_ack(cm_node);
+				/* we got a valid MPA request, create an event */
+				create_event(cm_node, NES_CM_EVENT_MPA_REQ);
+				break;
+			case  NES_CM_STATE_TSA:
+				handle_exception_pkt(cm_node, skb);
+				break;
+			case NES_CM_STATE_UNKNOWN:
+			case NES_CM_STATE_INITED:
+			default:
+				ret = -1;
+		}
+	}
+
+	return ret;
+}
+
+
+/**
+ * mini_cm_listen - create a listen node with params
+ */
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
+		struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
+{
+	struct nes_cm_listener *listener;
+	unsigned long flags;
+
+	nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
+		cm_info->loc_addr, cm_info->loc_port);
+
+	/* cannot have multiple matching listeners */
+	listener = find_listener(cm_core, htonl(cm_info->loc_addr),
+			htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE);
+	if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
+		/* find automatically incs ref count ??? */
+		atomic_dec(&listener->ref_count);
+		nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
+		return NULL;
+	}
+
+	if (!listener) {
+		/* create a CM listen node (1/2 node to compare incoming traffic to) */
+		listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
+		if (!listener) {
+			nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n");
+			return NULL;
+		}
+
+		memset(listener, 0, sizeof(struct nes_cm_listener));
+		listener->loc_addr = htonl(cm_info->loc_addr);
+		listener->loc_port = htons(cm_info->loc_port);
+		listener->reused_node = 0;
+
+		atomic_set(&listener->ref_count, 1);
+	}
+	/* pasive case */
+	/* find already inc'ed the ref count */
+	else {
+		listener->reused_node = 1;
+	}
+
+	listener->cm_id = cm_info->cm_id;
+	atomic_set(&listener->pend_accepts_cnt, 0);
+	listener->cm_core = cm_core;
+	listener->nesvnic = nesvnic;
+	atomic_inc(&cm_core->node_cnt);
+	atomic_inc(&cm_core->session_id);
+
+	listener->session_id = (u32)(atomic_read(&cm_core->session_id) + current->tgid);
+	listener->conn_type = cm_info->conn_type;
+	listener->backlog = cm_info->backlog;
+	listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;
+
+	if (!listener->reused_node) {
+		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+		list_add(&listener->list, &cm_core->listen_list.list);
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+		atomic_inc(&cm_core->listen_node_cnt);
+	}
+
+	nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
+			" listener = %p, backlog = %d, cm_id = %p.\n",
+			cm_info->loc_addr, cm_info->loc_port,
+			listener, listener->backlog, listener->cm_id);
+
+	return listener;
+}
+
+
+/**
+ * mini_cm_connect - make a connection node with params
+ */
+struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
+		struct nes_vnic *nesvnic, struct ietf_mpa_frame *mpa_frame,
+		struct nes_cm_info *cm_info)
+{
+	int ret = 0;
+	struct nes_cm_node *cm_node;
+	struct nes_cm_listener *loopbackremotelistener;
+	struct nes_cm_node *loopbackremotenode;
+	struct nes_cm_info loopback_cm_info;
+
+	u16 mpa_frame_size = sizeof(struct ietf_mpa_frame) +
+			ntohs(mpa_frame->priv_data_len);
+
+	cm_info->loc_addr = htonl(cm_info->loc_addr);
+	cm_info->rem_addr = htonl(cm_info->rem_addr);
+	cm_info->loc_port = htons(cm_info->loc_port);
+	cm_info->rem_port = htons(cm_info->rem_port);
+
+	/* create a CM connection node */
+	cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
+	if (!cm_node)
+		return NULL;
+
+	// set our node side to client (active) side
+	cm_node->tcp_cntxt.client = 1;
+	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+
+	if (cm_info->loc_addr == cm_info->rem_addr) {
+		loopbackremotelistener = find_listener(cm_core, cm_node->rem_addr,
+				cm_node->rem_port, NES_CM_LISTENER_ACTIVE_STATE);
+		if (loopbackremotelistener == NULL) {
+			create_event(cm_node, NES_CM_EVENT_ABORTED);
+		} else {
+			atomic_inc(&cm_loopbacks);
+			loopback_cm_info = *cm_info;
+			loopback_cm_info.loc_port = cm_info->rem_port;
+			loopback_cm_info.rem_port = cm_info->loc_port;
+			loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
+			loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info,
+					loopbackremotelistener);
+			loopbackremotenode->loopbackpartner = cm_node;
+			loopbackremotenode->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+			cm_node->loopbackpartner = loopbackremotenode;
+			memcpy(loopbackremotenode->mpa_frame_buf, &mpa_frame->priv_data,
+					mpa_frame_size);
+			loopbackremotenode->mpa_frame_size = mpa_frame_size -
+					sizeof(struct ietf_mpa_frame);
+
+			// we are done handling this state, set node to a TSA state
+			cm_node->state = NES_CM_STATE_TSA;
+			cm_node->tcp_cntxt.rcv_nxt = loopbackremotenode->tcp_cntxt.loc_seq_num;
+			loopbackremotenode->tcp_cntxt.rcv_nxt = cm_node->tcp_cntxt.loc_seq_num;
+			cm_node->tcp_cntxt.max_snd_wnd = loopbackremotenode->tcp_cntxt.rcv_wnd;
+			loopbackremotenode->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
+			cm_node->tcp_cntxt.snd_wnd = loopbackremotenode->tcp_cntxt.rcv_wnd;
+			loopbackremotenode->tcp_cntxt.snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
+			cm_node->tcp_cntxt.snd_wscale = loopbackremotenode->tcp_cntxt.rcv_wscale;
+			loopbackremotenode->tcp_cntxt.snd_wscale = cm_node->tcp_cntxt.rcv_wscale;
+
+			create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
+		}
+		return cm_node;
+	}
+
+	/* set our node side to client (active) side */
+	cm_node->tcp_cntxt.client = 1;
+	/* init our MPA frame ptr */
+	memcpy(&cm_node->mpa_frame, mpa_frame, mpa_frame_size);
+	cm_node->mpa_frame_size = mpa_frame_size;
+
+	/* send a syn and goto syn sent state */
+	cm_node->state = NES_CM_STATE_SYN_SENT;
+	ret = send_syn(cm_node, 0);
+
+	nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X, port=0x%04x,"
+			" cm_node=%p, cm_id = %p.\n",
+			cm_node->rem_addr, cm_node->rem_port, cm_node, cm_node->cm_id);
+
+	return cm_node;
+}
+
+
+/**
+ * mini_cm_accept - accept a connection
+ * This function is never called
+ */
+int mini_cm_accept(struct nes_cm_core *cm_core, struct ietf_mpa_frame *mpa_frame,
+		struct nes_cm_node *cm_node)
+{
+	return 0;
+}
+
+
+/**
+ * mini_cm_reject - reject and teardown a connection
+ */
+int mini_cm_reject(struct nes_cm_core *cm_core,
+		struct ietf_mpa_frame *mpa_frame,
+		struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+	struct sk_buff *skb;
+	u16 mpa_frame_size = sizeof(struct ietf_mpa_frame) +
+			ntohs(mpa_frame->priv_data_len);
+
+	skb = get_free_pkt(cm_node);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	/* send an MPA Request frame */
+	form_cm_frame(skb, cm_node, NULL, 0, mpa_frame, mpa_frame_size, SET_ACK | SET_FIN);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+	cm_node->state = NES_CM_STATE_CLOSED;
+	ret = send_fin(cm_node, NULL);
+
+	if (ret < 0) {
+		printk(KERN_INFO PFX "failed to send MPA Reply (reject)\n");
+		return ret;
+	}
+
+	return ret;
+}
+
+
+/**
+ * mini_cm_close
+ */
+int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+
+	if (!cm_core || !cm_node)
+		return -EINVAL;
+
+	switch (cm_node->state) {
+		/* if passed in node is null, create a reference key node for node search */
+		/* check if we found an owner node for this pkt */
+		case NES_CM_STATE_SYN_RCVD:
+		case NES_CM_STATE_SYN_SENT:
+		case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+		case NES_CM_STATE_ESTABLISHED:
+		case NES_CM_STATE_ACCEPTING:
+		case NES_CM_STATE_MPAREQ_SENT:
+			cm_node->state = NES_CM_STATE_FIN_WAIT1;
+			send_fin(cm_node, NULL);
+			break;
+		case NES_CM_STATE_CLOSE_WAIT:
+			cm_node->state = NES_CM_STATE_LAST_ACK;
+			send_fin(cm_node, NULL);
+			break;
+		case NES_CM_STATE_FIN_WAIT1:
+		case NES_CM_STATE_FIN_WAIT2:
+		case NES_CM_STATE_LAST_ACK:
+		case NES_CM_STATE_TIME_WAIT:
+		case NES_CM_STATE_CLOSING:
+			ret = -1;
+			break;
+		case NES_CM_STATE_LISTENING:
+		case NES_CM_STATE_UNKNOWN:
+		case NES_CM_STATE_INITED:
+		case NES_CM_STATE_CLOSED:
+		case NES_CM_STATE_TSA:
+			ret = rem_ref_cm_node(cm_core, cm_node);
+			break;
+	}
+	cm_node->cm_id = NULL;
+	return ret;
+}
+
+
+/**
+ * recv_pkt - recv an ETHERNET packet, and process it through CM
+ * node state machine
+ */
+int mini_cm_recv_pkt(struct nes_cm_core *cm_core, struct nes_vnic *nesvnic,
+		struct sk_buff *skb)
+{
+	struct nes_cm_node *cm_node = NULL;
+	struct nes_cm_listener *listener = NULL;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct nes_cm_info nfo;
+	int ret = 0;
+
+	if (!skb || skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	iph = (struct iphdr *)skb->data;
+	tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, sizeof(*tcph));
+	skb->len = ntohs(iph->tot_len);
+
+	nfo.loc_addr = ntohl(iph->daddr);
+	nfo.loc_port = ntohs(tcph->dest);
+	nfo.rem_addr = ntohl(iph->saddr);
+	nfo.rem_port = ntohs(tcph->source);
+
+	nes_debug(NES_DBG_CM, "Received packet: dest=0x%08X:0x%04X src=0x%08X:0x%04X\n",
+			iph->daddr, tcph->dest, iph->saddr, tcph->source);
+
+	/* note: this call is going to increment cm_node ref count */
+	cm_node = find_node(cm_core,
+			nfo.rem_port, nfo.rem_addr,
+			nfo.loc_port, nfo.loc_addr);
+
+	if (!cm_node) {
+		listener = find_listener(cm_core, nfo.loc_addr, nfo.loc_port,
+				NES_CM_LISTENER_ACTIVE_STATE);
+		if (listener) {
+			nfo.cm_id = listener->cm_id;
+			nfo.conn_type = listener->conn_type;
+		} else {
+			nfo.cm_id = NULL;
+			nfo.conn_type = 0;
+		}
+
+		cm_node = make_cm_node(cm_core, nesvnic, &nfo, listener);
+		if (!cm_node) {
+			nes_debug(NES_DBG_CM, "Unable to allocate node\n");
+			if (listener) {
+				nes_debug(NES_DBG_CM, "unable to allocate node and decrementing listener refcount\n");
+				atomic_dec(&listener->ref_count);
+			}
+			ret = -1;
+			goto out;
+		}
+		if (!listener) {
+			nes_debug(NES_DBG_CM, "Packet found for unknown port %x refcnt=%d\n",
+					nfo.loc_port, atomic_read(&cm_node->ref_count));
+			if (!tcph->rst) {
+				nes_debug(NES_DBG_CM, "Packet found for unknown port=%d"
+						" rem_port=%d refcnt=%d\n",
+						nfo.loc_port, nfo.rem_port, atomic_read(&cm_node->ref_count));
+
+				cm_node->tcp_cntxt.rcv_nxt = ntohl(tcph->seq);
+				cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+				send_reset(cm_node);
+			}
+			rem_ref_cm_node(cm_core, cm_node);
+			ret = -1;
+			goto out;
+		}
+		add_ref_cm_node(cm_node);
+		cm_node->state = NES_CM_STATE_LISTENING;
+	}
+
+	nes_debug(NES_DBG_CM, "Processing Packet for node %p, data = (%p):\n",
+			cm_node, skb->data);
+	process_packet(cm_node, skb, cm_core);
+
+	rem_ref_cm_node(cm_core, cm_node);
+	out:
+	if (skb)
+		dev_kfree_skb_any(skb);
+	return ret;
+}
+
+
+/**
+ * nes_cm_alloc_core - allocate a top level instance of a cm core
+ */
+struct nes_cm_core *nes_cm_alloc_core(void)
+{
+	int i;
+
+	struct nes_cm_core *cm_core;
+	struct sk_buff *skb = NULL;
+
+	/* setup the CM core */
+	/* alloc top level core control structure */
+	cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL);
+	if (!cm_core)
+		return NULL;
+
+	INIT_LIST_HEAD(&cm_core->connected_nodes);
+	init_timer(&cm_core->tcp_timer);
+	cm_core->tcp_timer.function = nes_cm_timer_tick;
+
+	cm_core->mtu   = NES_CM_DEFAULT_MTU;
+	cm_core->state = NES_CM_STATE_INITED;
+	cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS;
+
+	atomic_set(&cm_core->session_id, 0);
+	atomic_set(&cm_core->events_posted, 0);
+
+	/* init the packet lists */
+	skb_queue_head_init(&cm_core->tx_free_list);
+
+	for (i = 0; i < NES_CM_DEFAULT_FRAME_CNT; i++) {
+		skb = dev_alloc_skb(cm_core->mtu);
+		if (!skb) {
+			kfree(cm_core);
+			return NULL;
+		}
+		/* add 'raw' skb to free frame list */
+		skb_queue_head(&cm_core->tx_free_list, skb);
+	}
+
+	cm_core->api = &nes_cm_api;
+
+	spin_lock_init(&cm_core->ht_lock);
+	spin_lock_init(&cm_core->listen_list_lock);
+
+	INIT_LIST_HEAD(&cm_core->listen_list.list);
+
+	nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core);
+
+	nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n");
+	cm_core->event_wq = create_singlethread_workqueue("nesewq");
+	cm_core->post_event = nes_cm_post_event;
+	nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n");
+	cm_core->disconn_wq = create_singlethread_workqueue("nesdwq");
+
+	print_core(cm_core);
+	return cm_core;
+}
+
+
+/**
+ * mini_cm_dealloc_core - deallocate a top level instance of a cm core
+ */
+int mini_cm_dealloc_core(struct nes_cm_core *cm_core)
+{
+	nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core);
+
+	if (!cm_core)
+		return -EINVAL;
+
+	barrier();
+
+	if (timer_pending(&cm_core->tcp_timer)) {
+		del_timer(&cm_core->tcp_timer);
+	}
+
+	destroy_workqueue(cm_core->event_wq);
+	destroy_workqueue(cm_core->disconn_wq);
+	nes_debug(NES_DBG_CM, "\n");
+	kfree(cm_core);
+
+	return 0;
+}
+
+
+/**
+ * mini_cm_get
+ */
+int mini_cm_get(struct nes_cm_core *cm_core)
+{
+	return cm_core->state;
+}
+
+
+/**
+ * mini_cm_set
+ */
+int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
+{
+	int ret = 0;
+
+	switch (type) {
+		case NES_CM_SET_PKT_SIZE:
+			cm_core->mtu = value;
+			break;
+		case NES_CM_SET_FREE_PKT_Q_SIZE:
+			cm_core->free_tx_pkt_max = value;
+			break;
+		default:
+			/* unknown set option */
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_cm_init_tsa_conn setup HW; MPA frames must be
+ * successfully exchanged when this is called
+ */
+static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+
+	if (!nesqp)
+		return -EINVAL;
+
+	nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 |
+			NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG |
+			NES_QPCONTEXT_MISC_DROS);
+
+	if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale)
+		nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE);
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT);
+
+	nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16);
+
+	nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32(
+			(u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT);
+
+	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+			(cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) &
+			NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK);
+
+	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+			(cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) &
+			NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK);
+
+	nesqp->nesqp_context->keepalive = cpu_to_le32(0x80);
+	nesqp->nesqp_context->ts_recent = 0;
+	nesqp->nesqp_context->ts_age = 0;
+	nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
+	nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+	nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
+			cm_node->tcp_cntxt.rcv_wscale);
+	nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->srtt = 0;
+	nesqp->nesqp_context->rttvar = cpu_to_le32(0x6);
+	nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000);
+	nesqp->nesqp_context->cwnd = cpu_to_le32(2*cm_node->tcp_cntxt.mss);
+	nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+	nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
+
+	nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X,"
+			" Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n",
+			nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+			le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+			cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale),
+			le32_to_cpu(nesqp->nesqp_context->rcv_wnd),
+			le32_to_cpu(nesqp->nesqp_context->misc));
+	nes_debug(NES_DBG_CM, "  snd_wnd  = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd));
+	nes_debug(NES_DBG_CM, "  snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd));
+	nes_debug(NES_DBG_CM, "  max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd));
+
+	nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n");
+	cm_node->state = NES_CM_STATE_TSA;
+
+	return ret;
+}
+
+
+/**
+ * nes_cm_disconn
+ */
+int nes_cm_disconn(struct nes_qp *nesqp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+	if (nesqp->disconn_pending == 0) {
+		nesqp->disconn_pending++;
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+		/* nes_add_ref(&nesqp->ibqp); */
+		/* init our disconnect work element, to */
+		INIT_WORK(&nesqp->disconn_work, nes_disconnect_worker);
+
+		queue_work(g_cm_core->disconn_wq, &nesqp->disconn_work);
+	} else {
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+		nes_rem_ref(&nesqp->ibqp);
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_disconnect_worker
+ */
+void nes_disconnect_worker(struct work_struct *work)
+{
+	struct nes_qp *nesqp = container_of(work, struct nes_qp, disconn_work);
+
+	nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n",
+			nesqp->last_aeq, nesqp->hwqp.qp_id);
+	nes_cm_disconn_true(nesqp);
+}
+
+
+/**
+ * nes_cm_disconn_true
+ */
+int nes_cm_disconn_true(struct nes_qp *nesqp)
+{
+	unsigned long flags;
+	int ret = 0;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	struct nes_vnic *nesvnic;
+	u16 last_ae;
+	u8 original_hw_tcp_state;
+	u8 original_ibqp_state;
+	u8 issued_disconnect_reset = 0;
+
+	if (!nesqp) {
+		nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n");
+		return -1;
+	}
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+	cm_id = nesqp->cm_id;
+	/* make sure we havent already closed this connection */
+	if (!cm_id) {
+		nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n",
+				nesqp->hwqp.qp_id);
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+		nes_rem_ref(&nesqp->ibqp);
+		return -1;
+	}
+
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id);
+
+	original_hw_tcp_state = nesqp->hw_tcp_state;
+	original_ibqp_state   = nesqp->ibqp_state;
+	last_ae = nesqp->last_aeq;
+
+
+	nes_debug(NES_DBG_CM, "set ibqp_state=%u\n", nesqp->ibqp_state);
+
+	if ((nesqp->cm_id) && (cm_id->event_handler)) {
+		if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+				((original_ibqp_state == IB_QPS_RTS) &&
+				(last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+			atomic_inc(&cm_disconnects);
+			cm_event.event = IW_CM_EVENT_DISCONNECT;
+			if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) {
+				issued_disconnect_reset = 1;
+				cm_event.status = IW_CM_EVENT_STATUS_RESET;
+				nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event (status reset) for "
+						" QP%u, cm_id = %p. \n",
+						nesqp->hwqp.qp_id, cm_id);
+			} else {
+				cm_event.status = IW_CM_EVENT_STATUS_OK;
+			}
+
+			cm_event.local_addr = cm_id->local_addr;
+			cm_event.remote_addr = cm_id->remote_addr;
+			cm_event.private_data = NULL;
+			cm_event.private_data_len = 0;
+
+			nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event for "
+					" QP%u, SQ Head = %u, SQ Tail = %u. cm_id = %p, refcount = %u.\n",
+					nesqp->hwqp.qp_id,
+					nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail, cm_id,
+					atomic_read(&nesqp->refcount));
+
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			ret = cm_id->event_handler(cm_id, &cm_event);
+			if (ret)
+				nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+			spin_lock_irqsave(&nesqp->lock, flags);
+		}
+
+		nesqp->disconn_pending = 0;
+		/* There might have been another AE while the lock was released */
+		original_hw_tcp_state = nesqp->hw_tcp_state;
+		original_ibqp_state   = nesqp->ibqp_state;
+		last_ae = nesqp->last_aeq;
+
+		if ((issued_disconnect_reset == 0) && (nesqp->cm_id) &&
+				((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
+				 (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) ||
+				 (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) ||
+				 (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+			atomic_inc(&cm_closes);
+			nesqp->cm_id = NULL;
+			nesqp->in_disconnect = 0;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_disconnect(nesqp, 1);
+
+			cm_id->provider_data = nesqp;
+			/* Send up the close complete event */
+			cm_event.event = IW_CM_EVENT_CLOSE;
+			cm_event.status = IW_CM_EVENT_STATUS_OK;
+			cm_event.provider_data = cm_id->provider_data;
+			cm_event.local_addr = cm_id->local_addr;
+			cm_event.remote_addr = cm_id->remote_addr;
+			cm_event.private_data = NULL;
+			cm_event.private_data_len = 0;
+
+			ret = cm_id->event_handler(cm_id, &cm_event);
+			if (ret) {
+				nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+			}
+
+			cm_id->rem_ref(cm_id);
+
+			spin_lock_irqsave(&nesqp->lock, flags);
+			if (nesqp->flush_issued == 0) {
+				nesqp->flush_issued = 1;
+				spin_unlock_irqrestore(&nesqp->lock, flags);
+				flush_wqes(nesvnic->nesdev, nesqp, NES_CQP_FLUSH_RQ, 1);
+			} else {
+				spin_unlock_irqrestore(&nesqp->lock, flags);
+			}
+
+			/* This reference is from either ModifyQP or the AE processing,
+					there is still a race here with modifyqp */
+			nes_rem_ref(&nesqp->ibqp);
+
+		} else {
+			cm_id = nesqp->cm_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			/* check to see if the inbound reset beat the outbound reset */
+			if ((!cm_id) && (last_ae==NES_AEQE_AEID_RESET_SENT)) {
+				nes_debug(NES_DBG_CM, "QP%u: Decing refcount due to inbound reset"
+						" beating the outbound reset.\n",
+						nesqp->hwqp.qp_id);
+				nes_rem_ref(&nesqp->ibqp);
+			}
+		}
+	} else {
+		nesqp->disconn_pending = 0;
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+	}
+	nes_rem_ref(&nesqp->ibqp);
+
+	return 0;
+}
+
+
+/**
+ * nes_disconnect
+ */
+int nes_disconnect(struct nes_qp *nesqp, int abrupt)
+{
+	int ret = 0;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	if (!nesvnic)
+		return -EINVAL;
+
+	nesdev = nesvnic->nesdev;
+
+	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+			atomic_read(&nesvnic->netdev->refcnt));
+
+	if (nesqp->active_conn) {
+
+		/* indicate this connection is NOT active */
+		nesqp->active_conn = 0;
+	} else {
+		/* Need to free the Last Streaming Mode Message */
+		if (nesqp->ietf_frame) {
+			pci_free_consistent(nesdev->pcidev,
+					nesqp->private_data_len+sizeof(struct ietf_mpa_frame),
+					nesqp->ietf_frame, nesqp->ietf_frame_pbase);
+		}
+	}
+
+	/* close the CM node down if it is still active */
+	if (nesqp->cm_node) {
+		nes_debug(NES_DBG_CM, "Call close API\n");
+
+		g_cm_core->api->close(g_cm_core, nesqp->cm_node);
+		nesqp->cm_node = NULL;
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_accept
+ */
+int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	u64 u64temp;
+	struct ib_qp *ibqp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_adapter *adapter;
+	struct ib_qp_attr attr;
+	struct iw_cm_event cm_event;
+	struct nes_hw_qp_wqe *wqe;
+	struct nes_v4_quad nes_quad;
+	int ret;
+
+	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+
+	/* get all our handles */
+	nesqp = to_nesqp(ibqp);
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nesdev = nesvnic->nesdev;
+	adapter = nesdev->nesadapter;
+
+	nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
+			nesvnic, nesvnic->netdev, nesvnic->netdev->name);
+
+	/* since this is from a listen, we were able to put node handle into cm_id */
+	cm_node = (struct nes_cm_node *)cm_id->provider_data;
+
+	/* associate the node with the QP */
+	nesqp->cm_node = (void *)cm_node;
+
+	nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu\n",
+			nesqp->hwqp.qp_id, cm_node, jiffies);
+	atomic_inc(&cm_accepts);
+
+	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+			atomic_read(&nesvnic->netdev->refcnt));
+
+		/* allocate the ietf frame and space for private data */
+		nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
+				sizeof(struct ietf_mpa_frame) + conn_param->private_data_len,
+				&nesqp->ietf_frame_pbase);
+
+		if (!nesqp->ietf_frame) {
+			nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n");
+			return -ENOMEM;
+		}
+
+
+		/* setup the MPA frame */
+		nesqp->private_data_len = conn_param->private_data_len;
+		memcpy(nesqp->ietf_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
+
+		memcpy(nesqp->ietf_frame->priv_data, conn_param->private_data,
+				conn_param->private_data_len);
+
+		nesqp->ietf_frame->priv_data_len = cpu_to_be16(conn_param->private_data_len);
+		nesqp->ietf_frame->rev = mpa_version;
+		nesqp->ietf_frame->flags = IETF_MPA_FLAGS_CRC;
+
+		/* setup our first outgoing iWarp send WQE (the IETF frame response) */
+		wqe = &nesqp->hwqp.sq_vbase[0];
+
+		if (cm_id->remote_addr.sin_addr.s_addr != cm_id->local_addr.sin_addr.s_addr) {
+			u64temp = (unsigned long)nesqp;
+			u64temp |= NES_SW_CONTEXT_ALIGN>>1;
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
+					    u64temp);
+			wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+					cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | NES_IWARP_SQ_WQE_WRPDU);
+			wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] =
+					cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame));
+			wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] =
+					cpu_to_le32((u32)nesqp->ietf_frame_pbase);
+			wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] =
+					cpu_to_le32((u32)((u64)nesqp->ietf_frame_pbase >> 32));
+			wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] =
+					cpu_to_le32(conn_param->private_data_len + sizeof(struct ietf_mpa_frame));
+			wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
+
+			nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
+					NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | NES_QPCONTEXT_ORDIRD_WRPDU);
+		} else {
+			nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+					NES_QPCONTEXT_ORDIRD_WRPDU | NES_QPCONTEXT_ORDIRD_ALSMM));
+		}
+		nesqp->skip_lsmm = 1;
+
+
+	/* Cache the cm_id in the qp */
+	nesqp->cm_id = cm_id;
+	cm_node->cm_id = cm_id;
+
+	/*  nesqp->cm_node = (void *)cm_id->provider_data; */
+	cm_id->provider_data = nesqp;
+	nesqp->active_conn   = 0;
+
+	nes_cm_init_tsa_conn(nesqp, cm_node);
+
+	nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(cm_id->local_addr.sin_port));
+	nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(cm_id->remote_addr.sin_port));
+	nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr));
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(
+			(u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+
+	nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
+			nes_arp_table(nesdev, le32_to_cpu(nesqp->nesqp_context->ip0), NULL,
+			NES_ARP_RESOLVE) << 16);
+
+	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+			jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+
+	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+
+	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
+			((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));
+	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord);
+
+	memset(&nes_quad, 0, sizeof(nes_quad));
+	nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+	nes_quad.SrcIpadr      = cm_id->remote_addr.sin_addr.s_addr;
+	nes_quad.TcpPorts[0]   = cm_id->remote_addr.sin_port;
+	nes_quad.TcpPorts[1]   = cm_id->local_addr.sin_port;
+
+	/* Produce hash key */
+	nesqp->hte_index = cpu_to_be32(
+			crc32c(~0, (void *)&nes_quad, sizeof(nes_quad)) ^ 0xffffffff);
+	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n",
+			nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask);
+
+	nesqp->hte_index &= adapter->hte_index_mask;
+	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+	nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = 0x%08X:0x%04X,"
+			" rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + private data length=%zu.\n",
+			nesqp->hwqp.qp_id,
+			ntohl(cm_id->remote_addr.sin_addr.s_addr),
+			ntohs(cm_id->remote_addr.sin_port),
+			ntohl(cm_id->local_addr.sin_addr.s_addr),
+			ntohs(cm_id->local_addr.sin_port),
+			le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+			le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+			conn_param->private_data_len+sizeof(struct ietf_mpa_frame));
+
+	attr.qp_state = IB_QPS_RTS;
+	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+
+	/* notify OF layer that accept event was successfull */
+	cm_id->add_ref(cm_id);
+
+	cm_event.event = IW_CM_EVENT_ESTABLISHED;
+	cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED;
+	cm_event.provider_data = (void *)nesqp;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	if (cm_node->loopbackpartner) {
+		cm_node->loopbackpartner->mpa_frame_size = nesqp->private_data_len;
+		/* copy entire MPA frame to our cm_node's frame */
+		memcpy(cm_node->loopbackpartner->mpa_frame_buf, nesqp->ietf_frame->priv_data,
+			   nesqp->private_data_len);
+		create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED);
+	}
+	if (ret)
+		printk("%s[%u] OFA CM event_handler returned, ret=%d\n",
+				__FUNCTION__, __LINE__, ret);
+
+	return 0;
+}
+
+
+/**
+ * nes_reject
+ */
+int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	struct nes_cm_node *cm_node;
+	struct nes_cm_core *cm_core;
+
+	atomic_inc(&cm_rejects);
+	cm_node = (struct nes_cm_node *) cm_id->provider_data;
+	cm_core = cm_node->cm_core;
+	cm_node->mpa_frame_size = sizeof(struct ietf_mpa_frame) + pdata_len;
+
+	strcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP);
+	memcpy(&cm_node->mpa_frame.priv_data, pdata, pdata_len);
+
+	cm_node->mpa_frame.priv_data_len = cpu_to_be16(pdata_len);
+	cm_node->mpa_frame.rev = mpa_version;
+	cm_node->mpa_frame.flags = IETF_MPA_FLAGS_CRC | IETF_MPA_FLAGS_REJECT;
+
+	cm_core->api->reject(cm_core, &cm_node->mpa_frame, cm_node);
+
+	return 0;
+}
+
+
+/**
+ * nes_connect
+ * setup and launch cm connect node
+ */
+int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	struct ib_qp *ibqp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_cm_info cm_info;
+
+	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	nesqp = to_nesqp(ibqp);
+	if (!nesqp)
+		return -EINVAL;
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	if (!nesvnic)
+		return -EINVAL;
+	nesdev  = nesvnic->nesdev;
+	if (!nesdev)
+		return -EINVAL;
+
+	atomic_inc(&cm_connects);
+
+	nesqp->ietf_frame = kzalloc(sizeof(struct ietf_mpa_frame) +
+			conn_param->private_data_len, GFP_KERNEL);
+	if (!nesqp->ietf_frame)
+		return -ENOMEM;
+
+	/* set qp as having an active connection */
+	nesqp->active_conn = 1;
+
+	nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = 0x%08X:0x%04X.\n",
+			nesqp->hwqp.qp_id,
+			ntohl(cm_id->remote_addr.sin_addr.s_addr),
+			ntohs(cm_id->remote_addr.sin_port),
+			ntohl(cm_id->local_addr.sin_addr.s_addr),
+			ntohs(cm_id->local_addr.sin_port));
+
+	/* cache the cm_id in the qp */
+	nesqp->cm_id = cm_id;
+
+	cm_id->provider_data = nesqp;
+
+	/* copy the private data */
+	if (conn_param->private_data_len) {
+		memcpy(nesqp->ietf_frame->priv_data, conn_param->private_data,
+				conn_param->private_data_len);
+	}
+
+	nesqp->private_data_len = conn_param->private_data_len;
+	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord);
+	nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);
+	nes_debug(NES_DBG_CM, "mpa private data len =%u\n", conn_param->private_data_len);
+
+	strcpy(&nesqp->ietf_frame->key[0], IEFT_MPA_KEY_REQ);
+	nesqp->ietf_frame->flags = IETF_MPA_FLAGS_CRC;
+	nesqp->ietf_frame->rev = IETF_MPA_VERSION;
+	nesqp->ietf_frame->priv_data_len = htons(conn_param->private_data_len);
+
+	if (cm_id->local_addr.sin_addr.s_addr != cm_id->remote_addr.sin_addr.s_addr)
+		nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port),
+				PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);
+
+	/* set up the connection params for the node */
+	cm_info.loc_addr = (cm_id->local_addr.sin_addr.s_addr);
+	cm_info.loc_port = (cm_id->local_addr.sin_port);
+	cm_info.rem_addr = (cm_id->remote_addr.sin_addr.s_addr);
+	cm_info.rem_port = (cm_id->remote_addr.sin_port);
+	cm_info.cm_id = cm_id;
+	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+	cm_id->add_ref(cm_id);
+	nes_add_ref(&nesqp->ibqp);
+
+	/* create a connect CM node connection */
+	cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, nesqp->ietf_frame, &cm_info);
+	if (!cm_node) {
+		if (cm_id->local_addr.sin_addr.s_addr != cm_id->remote_addr.sin_addr.s_addr)
+			nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port),
+					PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL);
+		nes_rem_ref(&nesqp->ibqp);
+		kfree(nesqp->ietf_frame);
+		nesqp->ietf_frame = NULL;
+		cm_id->rem_ref(cm_id);
+		return -ENOMEM;
+	}
+
+	cm_node->apbvt_set = 1;
+	nesqp->cm_node = cm_node;
+
+	return 0;
+}
+
+
+/**
+ * nes_create_listen
+ */
+int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct nes_vnic *nesvnic;
+	struct nes_cm_listener *cm_node;
+	struct nes_cm_info cm_info;
+	struct nes_adapter *adapter;
+	int err;
+
+
+	nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n",
+			cm_id, ntohs(cm_id->local_addr.sin_port));
+
+	nesvnic = to_nesvnic(cm_id->device);
+	if (!nesvnic)
+		return -EINVAL;
+	adapter = nesvnic->nesdev->nesadapter;
+	nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
+			nesvnic, nesvnic->netdev, nesvnic->netdev->name);
+
+	nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n",
+			nesvnic->local_ipaddr, cm_id->local_addr.sin_addr.s_addr);
+
+	/* setup listen params in our api call struct */
+	cm_info.loc_addr = nesvnic->local_ipaddr;
+	cm_info.loc_port = cm_id->local_addr.sin_port;
+	cm_info.backlog = backlog;
+	cm_info.cm_id = cm_id;
+
+	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+
+	cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);
+	if (!cm_node) {
+		printk("%s[%u] Error returned from listen API call\n",
+				__FUNCTION__, __LINE__);
+		return -ENOMEM;
+	}
+
+	cm_id->provider_data = cm_node;
+
+	if (!cm_node->reused_node) {
+		err = nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port),
+				PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);
+		if (err) {
+			printk("nes_manage_apbvt call returned %d.\n", err);
+			g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
+			return err;
+		}
+		cm_listens_created++;
+	}
+
+	cm_id->add_ref(cm_id);
+	cm_id->provider_data = (void *)cm_node;
+
+
+	return 0;
+}
+
+
+/**
+ * nes_destroy_listen
+ */
+int nes_destroy_listen(struct iw_cm_id *cm_id)
+{
+	if (cm_id->provider_data)
+		g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data);
+	else
+		nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n");
+
+	cm_id->rem_ref(cm_id);
+
+	return 0;
+}
+
+
+/**
+ * nes_cm_recv
+ */
+int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice)
+{
+	cm_packets_received++;
+	if ((g_cm_core) && (g_cm_core->api)) {
+		g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb);
+	} else {
+		nes_debug(NES_DBG_CM, "Unable to process packet for CM,"
+				" cm is not setup properly.\n");
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_cm_start
+ * Start and init a cm core module
+ */
+int nes_cm_start(void)
+{
+	nes_debug(NES_DBG_CM, "\n");
+	/* create the primary CM core, pass this handle to subsequent core inits */
+	g_cm_core = nes_cm_alloc_core();
+	if (g_cm_core) {
+		return 0;
+	} else {
+		return -ENOMEM;
+	}
+}
+
+
+/**
+ * nes_cm_stop
+ * stop and dealloc all cm core instances
+ */
+int nes_cm_stop(void)
+{
+	g_cm_core->api->destroy_cm_core(g_cm_core);
+	return 0;
+}
+
+
+/**
+ * cm_event_connected
+ * handle a connected event, setup QPs and HW
+ */
+void cm_event_connected(struct nes_cm_event *event)
+{
+	u64 u64temp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_adapter *nesadapter;
+	struct ib_qp_attr attr;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	struct nes_hw_qp_wqe *wqe;
+	struct nes_v4_quad nes_quad;
+	int ret;
+
+	/* get all our handles */
+	cm_node = event->cm_node;
+	cm_id = cm_node->cm_id;
+	nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id);
+	nesqp = (struct nes_qp *)cm_id->provider_data;
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+
+	if (nesqp->destroyed) {
+		return;
+	}
+	atomic_inc(&cm_connecteds);
+	nes_debug(NES_DBG_CM, "QP%u attempting to connect to  0x%08X:0x%04X on"
+			" local port 0x%04X. jiffies = %lu.\n",
+			nesqp->hwqp.qp_id,
+			ntohl(cm_id->remote_addr.sin_addr.s_addr),
+			ntohs(cm_id->remote_addr.sin_port),
+			ntohs(cm_id->local_addr.sin_port),
+			jiffies);
+
+	nes_cm_init_tsa_conn(nesqp, cm_node);
+
+	/* set the QP tsa context */
+	nesqp->nesqp_context->tcpPorts[0] = cpu_to_le16(ntohs(cm_id->local_addr.sin_port));
+	nesqp->nesqp_context->tcpPorts[1] = cpu_to_le16(ntohs(cm_id->remote_addr.sin_port));
+	nesqp->nesqp_context->ip0 = cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr));
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(
+			(u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+	nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
+			nes_arp_table(nesdev, le32_to_cpu(nesqp->nesqp_context->ip0),
+			NULL, NES_ARP_RESOLVE) << 16);
+	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+			jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+	nesqp->nesqp_context->ird_ord_sizes |=
+			cpu_to_le32((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT);
+
+	/* Adjust tail for not having a LSMM */
+	nesqp->hwqp.sq_tail = 1;
+
+#if defined(NES_SEND_FIRST_WRITE)
+		if (cm_node->send_write0) {
+			nes_debug(NES_DBG_CM, "Sending first write.\n");
+			wqe = &nesqp->hwqp.sq_vbase[0];
+			u64temp = (unsigned long)nesqp;
+			u64temp |= NES_SW_CONTEXT_ALIGN>>1;
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
+					    u64temp);
+			wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(NES_IWARP_SQ_OP_RDMAW);
+			wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0;
+			wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0;
+			wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0;
+			wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0;
+			wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
+
+			/* use the reserved spot on the WQ for the extra first WQE */
+			nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+					NES_QPCONTEXT_ORDIRD_WRPDU | NES_QPCONTEXT_ORDIRD_ALSMM));
+			nesqp->skip_lsmm = 1;
+			nesqp->hwqp.sq_tail = 0;
+			nes_write32(nesdev->regs + NES_WQE_ALLOC,
+					(1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+		}
+#endif
+
+	memset(&nes_quad, 0, sizeof(nes_quad));
+
+	nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+	nes_quad.SrcIpadr = cm_id->remote_addr.sin_addr.s_addr;
+	nes_quad.TcpPorts[0] = cm_id->remote_addr.sin_port;
+	nes_quad.TcpPorts[1] = cm_id->local_addr.sin_port;
+
+	/* Produce hash key */
+	nesqp->hte_index = cpu_to_be32(
+			crc32c(~0, (void *)&nes_quad, sizeof(nes_quad)) ^ 0xffffffff);
+	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n",
+			nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
+
+	nesqp->hte_index &= nesadapter->hte_index_mask;
+	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+	nesqp->ietf_frame = &cm_node->mpa_frame;
+	nesqp->private_data_len = (u8) cm_node->mpa_frame_size;
+	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+	/* modify QP state to rts */
+	attr.qp_state = IB_QPS_RTS;
+	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+
+	/* notify OF layer we successfully created the requested connection */
+	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+	cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr.sin_family = AF_INET;
+	cm_event.local_addr.sin_port = cm_id->local_addr.sin_port;
+	cm_event.remote_addr = cm_id->remote_addr;
+
+		cm_event.private_data = (void *)event->cm_node->mpa_frame_buf;
+		cm_event.private_data_len = (u8) event->cm_node->mpa_frame_size;
+
+	cm_event.local_addr.sin_addr.s_addr = event->cm_info.rem_addr;
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+	if (ret)
+		printk("%s[%u] OFA CM event_handler returned, ret=%d\n",
+				__FUNCTION__, __LINE__, ret);
+	nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = %lu\n",
+			nesqp->hwqp.qp_id, jiffies );
+
+	nes_rem_ref(&nesqp->ibqp);
+
+	return;
+}
+
+
+/**
+ * cm_event_connect_error
+ */
+void cm_event_connect_error(struct nes_cm_event *event)
+{
+	struct nes_qp *nesqp;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	/* struct nes_cm_info cm_info; */
+	int ret;
+
+	if (!event->cm_node)
+		return;
+
+	cm_id = event->cm_node->cm_id;
+	if (!cm_id) {
+		return;
+	}
+
+	nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id);
+	nesqp = cm_id->provider_data;
+
+	if (!nesqp) {
+		return;
+	}
+
+	/* notify OF layer about this connection error event */
+	/* cm_id->rem_ref(cm_id); */
+	nesqp->cm_id = NULL;
+	cm_id->provider_data = NULL;
+	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+	cm_event.status = IW_CM_EVENT_STATUS_REJECTED;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+
+	nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remove_addr=%08x\n",
+			cm_event.local_addr.sin_addr.s_addr, cm_event.remote_addr.sin_addr.s_addr);
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+	if (ret)
+		printk("%s[%u] OFA CM event_handler returned, ret=%d\n",
+				__FUNCTION__, __LINE__, ret);
+	nes_rem_ref(&nesqp->ibqp);
+		cm_id->rem_ref(cm_id);
+
+	return;
+}
+
+
+/**
+ * cm_event_reset
+ */
+void cm_event_reset(struct nes_cm_event *event)
+{
+	struct nes_qp *nesqp;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	/* struct nes_cm_info cm_info; */
+	int ret;
+
+	if (!event->cm_node)
+		return;
+
+	if (!event->cm_node->cm_id)
+		return;
+
+	cm_id = event->cm_node->cm_id;
+
+	nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id);
+	nesqp = cm_id->provider_data;
+
+	nesqp->cm_id = NULL;
+	/* cm_id->provider_data = NULL; */
+	cm_event.event = IW_CM_EVENT_DISCONNECT;
+	cm_event.status = IW_CM_EVENT_STATUS_RESET;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+
+	/* notify OF layer about this connection error event */
+	cm_id->rem_ref(cm_id);
+
+	return;
+}
+
+
+/**
+ * cm_event_mpa_req
+ */
+void cm_event_mpa_req(struct nes_cm_event *event)
+{
+	struct iw_cm_id   *cm_id;
+	struct iw_cm_event cm_event;
+	int ret;
+	struct nes_cm_node *cm_node;
+
+	cm_node = event->cm_node;
+	if (!cm_node)
+		return;
+	cm_id = cm_node->cm_id;
+
+	atomic_inc(&cm_connect_reqs);
+	nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+			cm_node, cm_id, jiffies);
+
+	cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+	cm_event.status = IW_CM_EVENT_STATUS_OK;
+	cm_event.provider_data = (void *)cm_node;
+
+	cm_event.local_addr.sin_family = AF_INET;
+	cm_event.local_addr.sin_port = htons(event->cm_info.loc_port);
+	cm_event.local_addr.sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+	cm_event.remote_addr.sin_family = AF_INET;
+	cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port);
+	cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+
+		cm_event.private_data                = cm_node->mpa_frame_buf;
+		cm_event.private_data_len            = (u8) cm_node->mpa_frame_size;
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	if (ret)
+		printk("%s[%u] OFA CM event_handler returned, ret=%d\n",
+				__FUNCTION__, __LINE__, ret);
+
+	return;
+}
+
+
+static void nes_cm_event_handler(struct work_struct *);
+
+/**
+ * nes_cm_post_event
+ * post an event to the cm event handler
+ */
+int nes_cm_post_event(struct nes_cm_event *event)
+{
+	atomic_inc(&event->cm_node->cm_core->events_posted);
+	add_ref_cm_node(event->cm_node);
+	event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
+	INIT_WORK(&event->event_work, nes_cm_event_handler);
+	nes_debug(NES_DBG_CM, "queue_work, event=%p\n", event);
+
+	queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
+
+	nes_debug(NES_DBG_CM, "Exit\n");
+	return 0;
+}
+
+
+/**
+ * nes_cm_event_handler
+ * worker function to handle cm events
+ * will free instance of nes_cm_event
+ */
+static void nes_cm_event_handler(struct work_struct *work)
+{
+	struct nes_cm_event *event = container_of(work, struct nes_cm_event, event_work);
+	struct nes_cm_core *cm_core;
+
+	if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core)) {
+		return;
+	}
+	cm_core = event->cm_node->cm_core;
+	nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n",
+			event, event->type, atomic_read(&cm_core->events_posted));
+
+	switch (event->type) {
+		case NES_CM_EVENT_MPA_REQ:
+			cm_event_mpa_req(event);
+			nes_debug(NES_DBG_CM, "CM Event: MPA REQUEST\n");
+			break;
+		case NES_CM_EVENT_RESET:
+			nes_debug(NES_DBG_CM, "CM Event: RESET\n");
+			cm_event_reset(event);
+			break;
+		case NES_CM_EVENT_CONNECTED:
+			if ((!event->cm_node->cm_id) ||
+				(event->cm_node->state != NES_CM_STATE_TSA)) {
+				break;
+			}
+			cm_event_connected(event);
+			nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
+			break;
+		case NES_CM_EVENT_ABORTED:
+			if ((!event->cm_node->cm_id) || (event->cm_node->state == NES_CM_STATE_TSA)) {
+				break;
+			}
+			cm_event_connect_error(event);
+			nes_debug(NES_DBG_CM, "CM Event: ABORTED\n");
+			break;
+		case NES_CM_EVENT_DROPPED_PKT:
+			nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n");
+			break;
+		default:
+			nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n");
+			break;
+	}
+
+	atomic_dec(&cm_core->events_posted);
+	event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
+	rem_ref_cm_node(cm_core, event->cm_node);
+	kfree(event);
+
+	return;
+}
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
new file mode 100644
index 0000000..a59f0a7
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_CM_H
+#define NES_CM_H
+
+#define QUEUE_EVENTS
+
+#define NES_MANAGE_APBVT_DEL 0
+#define NES_MANAGE_APBVT_ADD 1
+
+/* IETF MPA -- defines, enums, structs */
+#define IEFT_MPA_KEY_REQ  "MPA ID Req Frame"
+#define IEFT_MPA_KEY_REP  "MPA ID Rep Frame"
+#define IETF_MPA_KEY_SIZE 16
+#define IETF_MPA_VERSION  1
+
+enum ietf_mpa_flags {
+	IETF_MPA_FLAGS_MARKERS = 0x80,	/* receive Markers */
+	IETF_MPA_FLAGS_CRC     = 0x40,	/* receive Markers */
+	IETF_MPA_FLAGS_REJECT  = 0x20,	/* Reject */
+};
+
+struct ietf_mpa_frame {
+	u8 key[IETF_MPA_KEY_SIZE];
+	u8 flags;
+	u8 rev;
+	__be16 priv_data_len;
+	u8 priv_data[0];
+};
+
+#define ietf_mpa_req_resp_frame ietf_mpa_frame
+
+struct nes_v4_quad {
+	u32 rsvd0;
+	__le32 DstIpAdrIndex;	/* Only most significant 5 bits are valid */
+	__be32 SrcIpadr;
+	__be16 TcpPorts[2];		/* src is low, dest is high */
+};
+
+struct nes_cm_node;
+enum nes_timer_type {
+	NES_TIMER_TYPE_SEND,
+	NES_TIMER_TYPE_RECV,
+	NES_TIMER_NODE_CLEANUP,
+	NES_TIMER_TYPE_CLOSE,
+};
+
+#define MAX_NES_IFS 4
+
+#define SET_ACK 1
+#define SET_SYN 2
+#define SET_FIN 4
+#define SET_RST 8
+
+struct option_base {
+	u8 optionnum;
+	u8 length;
+};
+
+enum option_numbers {
+	OPTION_NUMBER_END,
+	OPTION_NUMBER_NONE,
+	OPTION_NUMBER_MSS,
+	OPTION_NUMBER_WINDOW_SCALE,
+	OPTION_NUMBER_SACK_PERM,
+	OPTION_NUMBER_SACK,
+	OPTION_NUMBER_WRITE0 = 0xbc
+};
+
+struct option_mss {
+	u8 optionnum;
+	u8 length;
+	__be16 mss;
+};
+
+struct option_windowscale {
+	u8 optionnum;
+	u8 length;
+	u8 shiftcount;
+};
+
+union all_known_options {
+	char as_end;
+	struct option_base as_base;
+	struct option_mss as_mss;
+	struct option_windowscale as_windowscale;
+};
+
+struct nes_timer_entry {
+	struct list_head list;
+	unsigned long timetosend;	/* jiffies */
+	struct sk_buff *skb;
+	u32 type;
+	u32 retrycount;
+	u32 retranscount;
+	u32 context;
+	u32 seq_num;
+	u32 send_retrans;
+	int close_when_complete;
+	struct net_device *netdev;
+};
+
+#define NES_DEFAULT_RETRYS  64
+#define NES_DEFAULT_RETRANS 8
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define NES_RETRY_TIMEOUT   (1000*HZ/1000)
+#else
+#define NES_RETRY_TIMEOUT   (3000*HZ/1000)
+#endif
+#define NES_SHORT_TIME      (10)
+#define NES_LONG_TIME       (2000*HZ/1000)
+
+#define NES_CM_HASHTABLE_SIZE         1024
+#define NES_CM_TCP_TIMER_INTERVAL     3000
+#define NES_CM_DEFAULT_MTU            1540
+#define NES_CM_DEFAULT_FRAME_CNT      10
+#define NES_CM_THREAD_STACK_SIZE      256
+#define NES_CM_DEFAULT_RCV_WND        64240	// before we know that window scaling is allowed
+#define NES_CM_DEFAULT_RCV_WND_SCALED 256960  // after we know that window scaling is allowed
+#define NES_CM_DEFAULT_RCV_WND_SCALE  2
+#define NES_CM_DEFAULT_FREE_PKTS      0x000A
+#define NES_CM_FREE_PKT_LO_WATERMARK  2
+
+#define NES_CM_DEFAULT_MSS   536
+
+#define NES_CM_DEF_SEQ       0x159bf75f
+#define NES_CM_DEF_LOCAL_ID  0x3b47
+
+#define NES_CM_DEF_SEQ2      0x18ed5740
+#define NES_CM_DEF_LOCAL_ID2 0xb807
+
+typedef u32 nes_addr_t;
+
+#define nes_cm_tsa_context nes_qp_context
+
+struct nes_qp;
+
+/* cm node transition states */
+enum nes_cm_node_state {
+	NES_CM_STATE_UNKNOWN,
+	NES_CM_STATE_INITED,
+	NES_CM_STATE_LISTENING,
+	NES_CM_STATE_SYN_RCVD,
+	NES_CM_STATE_SYN_SENT,
+	NES_CM_STATE_ONE_SIDE_ESTABLISHED,
+	NES_CM_STATE_ESTABLISHED,
+	NES_CM_STATE_ACCEPTING,
+	NES_CM_STATE_MPAREQ_SENT,
+	NES_CM_STATE_TSA,
+	NES_CM_STATE_FIN_WAIT1,
+	NES_CM_STATE_FIN_WAIT2,
+	NES_CM_STATE_CLOSE_WAIT,
+	NES_CM_STATE_TIME_WAIT,
+	NES_CM_STATE_LAST_ACK,
+	NES_CM_STATE_CLOSING,
+	NES_CM_STATE_CLOSED
+};
+
+/* type of nes connection */
+enum nes_cm_conn_type {
+	NES_CM_IWARP_CONN_TYPE,
+};
+
+/* CM context params */
+struct nes_cm_tcp_context {
+	u8  client;
+
+	u32 loc_seq_num;
+	u32 loc_ack_num;
+	u32 rem_ack_num;
+	u32 rcv_nxt;
+
+	u32 loc_id;
+	u32 rem_id;
+
+	u32 snd_wnd;
+	u32 max_snd_wnd;
+
+	u32 rcv_wnd;
+	u32 mss;
+	u8  snd_wscale;
+	u8  rcv_wscale;
+
+	struct nes_cm_tsa_context tsa_cntxt;
+	struct timeval            sent_ts;
+};
+
+
+enum nes_cm_listener_state {
+	NES_CM_LISTENER_PASSIVE_STATE=1,
+	NES_CM_LISTENER_ACTIVE_STATE=2,
+	NES_CM_LISTENER_EITHER_STATE=3
+};
+
+struct nes_cm_listener {
+	struct list_head           list;
+	u64                        session_id;
+	struct nes_cm_core         *cm_core;
+	u8                         loc_mac[ETH_ALEN];
+	nes_addr_t                 loc_addr;
+	u16                        loc_port;
+	struct iw_cm_id            *cm_id;
+	enum nes_cm_conn_type      conn_type;
+	atomic_t                   ref_count;
+	struct nes_vnic            *nesvnic;
+	atomic_t                   pend_accepts_cnt;
+	int                        backlog;
+	enum nes_cm_listener_state listener_state;
+	u32                        reused_node;
+};
+
+/* per connection node and node state information */
+struct nes_cm_node {
+	u64                       session_id;
+	u32                       hashkey;
+
+	nes_addr_t                loc_addr, rem_addr;
+	u16                       loc_port, rem_port;
+
+	u8                        loc_mac[ETH_ALEN];
+	u8                        rem_mac[ETH_ALEN];
+
+	enum nes_cm_node_state    state;
+	struct nes_cm_tcp_context tcp_cntxt;
+	struct nes_cm_core        *cm_core;
+	struct sk_buff_head       resend_list;
+	atomic_t                  ref_count;
+	struct net_device         *netdev;
+
+	struct nes_cm_node        *loopbackpartner;
+	struct list_head          retrans_list;
+	spinlock_t                retrans_list_lock;
+	struct list_head          recv_list;
+	spinlock_t                recv_list_lock;
+
+	int                       send_write0;
+	union {
+		struct ietf_mpa_frame mpa_frame;
+		u8                    mpa_frame_buf[NES_CM_DEFAULT_MTU];
+	};
+	u16                       mpa_frame_size;
+	struct iw_cm_id           *cm_id;
+	struct list_head          list;
+	int                       accelerated;
+	struct nes_cm_listener    *listener;
+	enum nes_cm_conn_type     conn_type;
+	struct nes_vnic           *nesvnic;
+	int                       apbvt_set;
+	int                       accept_pend;
+};
+
+/* structure for client or CM to fill when making CM api calls. */
+/*	- only need to set relevant data, based on op. */
+struct nes_cm_info {
+	union {
+		struct iw_cm_id   *cm_id;
+		struct net_device *netdev;
+	};
+
+	u16 loc_port;
+	u16 rem_port;
+	nes_addr_t loc_addr;
+	nes_addr_t rem_addr;
+
+	enum nes_cm_conn_type  conn_type;
+	int backlog;
+};
+
+/* CM event codes */
+enum  nes_cm_event_type {
+	NES_CM_EVENT_UNKNOWN,
+	NES_CM_EVENT_ESTABLISHED,
+	NES_CM_EVENT_MPA_REQ,
+	NES_CM_EVENT_MPA_CONNECT,
+	NES_CM_EVENT_MPA_ACCEPT,
+	NES_CM_EVENT_MPA_ESTABLISHED,
+	NES_CM_EVENT_CONNECTED,
+	NES_CM_EVENT_CLOSED,
+	NES_CM_EVENT_RESET,
+	NES_CM_EVENT_DROPPED_PKT,
+	NES_CM_EVENT_CLOSE_IMMED,
+	NES_CM_EVENT_CLOSE_HARD,
+	NES_CM_EVENT_CLOSE_CLEAN,
+	NES_CM_EVENT_ABORTED,
+	NES_CM_EVENT_SEND_FIRST
+};
+
+/* event to post to CM event handler */
+struct nes_cm_event {
+	enum nes_cm_event_type type;
+
+	struct nes_cm_info cm_info;
+	struct work_struct event_work;
+	struct nes_cm_node *cm_node;
+};
+
+struct nes_cm_core {
+	enum nes_cm_node_state  state;
+	atomic_t                session_id;
+
+	atomic_t                listen_node_cnt;
+	struct nes_cm_node      listen_list;
+	spinlock_t              listen_list_lock;
+
+	u32                     mtu;
+	u32                     free_tx_pkt_max;
+	u32                     rx_pkt_posted;
+	struct sk_buff_head     tx_free_list;
+	atomic_t                ht_node_cnt;
+	struct list_head        connected_nodes;
+	/* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
+	spinlock_t              ht_lock;
+
+	struct timer_list       tcp_timer;
+
+	struct nes_cm_ops       *api;
+
+	int (*post_event)(struct nes_cm_event *event);
+	atomic_t                events_posted;
+	struct workqueue_struct *event_wq;
+	struct workqueue_struct *disconn_wq;
+
+	atomic_t                node_cnt;
+	u64                     aborted_connects;
+	u32                     options;
+
+	struct nes_cm_node      *current_listen_node;
+};
+
+
+#define NES_CM_SET_PKT_SIZE        (1 << 1)
+#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
+
+/* CM ops/API for client interface */
+struct nes_cm_ops {
+	int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
+	struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
+			struct nes_cm_info *);
+	int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
+	struct nes_cm_node * (*connect)(struct nes_cm_core *,
+			struct nes_vnic *, struct ietf_mpa_frame *,
+			struct nes_cm_info *);
+	int (*close)(struct nes_cm_core *, struct nes_cm_node *);
+	int (*accept)(struct nes_cm_core *, struct ietf_mpa_frame *,
+			struct nes_cm_node *);
+	int (*reject)(struct nes_cm_core *, struct ietf_mpa_frame *,
+			struct nes_cm_node *);
+	int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
+			struct sk_buff *);
+	int (*destroy_cm_core)(struct nes_cm_core *);
+	int (*get)(struct nes_cm_core *);
+	int (*set)(struct nes_cm_core *, u32, u32);
+};
+
+
+int send_mpa_request(struct nes_cm_node *);
+struct sk_buff *form_cm_frame(struct sk_buff *, struct nes_cm_node *,
+		void *, u32, void *, u32, u8);
+int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
+		enum nes_timer_type, int, int);
+void nes_cm_timer_tick(unsigned long);
+int send_syn(struct nes_cm_node *, u32);
+int send_reset(struct nes_cm_node *);
+int send_ack(struct nes_cm_node *);
+int send_fin(struct nes_cm_node *, struct sk_buff *);
+struct sk_buff *get_free_pkt(struct nes_cm_node *);
+int process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
+
+struct nes_cm_node * mini_cm_connect(struct nes_cm_core *,
+		struct nes_vnic *, struct ietf_mpa_frame *, struct nes_cm_info *);
+int mini_cm_accept(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *);
+int mini_cm_reject(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *);
+int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
+int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
+struct nes_cm_core *mini_cm_alloc_core(struct nes_cm_info *);
+int mini_cm_dealloc_core(struct nes_cm_core *);
+int mini_cm_get(struct nes_cm_core *);
+int mini_cm_set(struct nes_cm_core *, u32, u32);
+
+int nes_cm_disconn(struct nes_qp *);
+void nes_disconnect_worker(struct work_struct *);
+int nes_cm_disconn_true(struct nes_qp *);
+int nes_disconnect(struct nes_qp *, int);
+
+int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_reject(struct iw_cm_id *, const void *, u8);
+int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_create_listen(struct iw_cm_id *, int);
+int nes_destroy_listen(struct iw_cm_id *);
+
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+int nes_cm_start(void);
+int nes_cm_stop(void);
+
+/* CM event handler functions */
+void cm_event_connected(struct nes_cm_event *);
+void cm_event_connect_error(struct nes_cm_event *);
+void cm_event_reset(struct nes_cm_event *);
+void cm_event_mpa_req(struct nes_cm_event *);
+int nes_cm_post_event(struct nes_cm_event *);
+
+#endif			/* NES_CM_H */
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
new file mode 100644
index 0000000..da9daba
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_context.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef NES_CONTEXT_H
+#define NES_CONTEXT_H
+
+struct nes_qp_context {
+	__le32   misc;
+	__le32   cqs;
+	__le32   sq_addr_low;
+	__le32   sq_addr_high;
+	__le32   rq_addr_low;
+	__le32   rq_addr_high;
+	__le32   misc2;
+	__le16   tcpPorts[2];
+	__le32   ip0;
+	__le32   ip1;
+	__le32   ip2;
+	__le32   ip3;
+	__le32   mss;
+	__le32   arp_index_vlan;
+	__le32   tcp_state_flow_label;
+	__le32   pd_index_wscale;
+	__le32   keepalive;
+	u32   ts_recent;
+	u32   ts_age;
+	__le32   snd_nxt;
+	__le32   snd_wnd;
+	__le32   rcv_nxt;
+	__le32   rcv_wnd;
+	__le32   snd_max;
+	__le32   snd_una;
+	u32   srtt;
+	__le32   rttvar;
+	__le32   ssthresh;
+	__le32   cwnd;
+	__le32   snd_wl1;
+	__le32   snd_wl2;
+	__le32   max_snd_wnd;
+	__le32   ts_val_delta;
+	u32   retransmit;
+	u32   probe_cnt;
+	u32   hte_index;
+	__le32   q2_addr_low;
+	__le32   q2_addr_high;
+	__le32   ird_index;
+	u32   Rsvd3;
+	__le32   ird_ord_sizes;
+	u32   mrkr_offset;
+	__le32   aeq_token_low;
+	__le32   aeq_token_high;
+};
+
+/* QP Context Misc Field */
+
+#define NES_QPCONTEXT_MISC_IWARP_VER_MASK    0x00000003
+#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT   0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK     0x000000C0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT    6
+#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK      0x00000300
+#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT     8
+#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK      0x00000c00
+#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT     10
+#define NES_QPCONTEXT_MISC_PCI_FCN_MASK      0x00007000
+#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT     12
+#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK     0x00070000
+#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT    16
+
+enum nes_qp_context_misc_bits {
+	NES_QPCONTEXT_MISC_RX_WQE_SIZE         = 0x00000004,
+	NES_QPCONTEXT_MISC_IPV4                = 0x00000008,
+	NES_QPCONTEXT_MISC_DO_NOT_FRAG         = 0x00000010,
+	NES_QPCONTEXT_MISC_INSERT_VLAN         = 0x00000020,
+	NES_QPCONTEXT_MISC_DROS                = 0x00008000,
+	NES_QPCONTEXT_MISC_WSCALE              = 0x00080000,
+	NES_QPCONTEXT_MISC_KEEPALIVE           = 0x00100000,
+	NES_QPCONTEXT_MISC_TIMESTAMP           = 0x00200000,
+	NES_QPCONTEXT_MISC_SACK                = 0x00400000,
+	NES_QPCONTEXT_MISC_RDMA_WRITE_EN       = 0x00800000,
+	NES_QPCONTEXT_MISC_RDMA_READ_EN        = 0x01000000,
+	NES_QPCONTEXT_MISC_WBIND_EN            = 0x10000000,
+	NES_QPCONTEXT_MISC_FAST_REGISTER_EN    = 0x20000000,
+	NES_QPCONTEXT_MISC_PRIV_EN             = 0x40000000,
+	NES_QPCONTEXT_MISC_NO_NAGLE            = 0x80000000
+};
+
+enum nes_qp_acc_wq_sizes {
+	HCONTEXT_TSA_WQ_SIZE_4 = 0,
+	HCONTEXT_TSA_WQ_SIZE_32 = 1,
+	HCONTEXT_TSA_WQ_SIZE_128 = 2,
+	HCONTEXT_TSA_WQ_SIZE_512 = 3
+};
+
+/* QP Context Misc2 Fields */
+#define NES_QPCONTEXT_MISC2_TTL_MASK            0x000000ff
+#define NES_QPCONTEXT_MISC2_TTL_SHIFT           0
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK      0x000000ff
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT     0
+#define NES_QPCONTEXT_MISC2_LIMIT_MASK          0x00000300
+#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT         8
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK      0x0000fc00
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT     10
+#define NES_QPCONTEXT_MISC2_SRC_IP_MASK         0x001f0000
+#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT        16
+#define NES_QPCONTEXT_MISC2_TOS_MASK            0xff000000
+#define NES_QPCONTEXT_MISC2_TOS_SHIFT           24
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK  0xff000000
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
+
+/* QP Context Tcp State/Flow Label Fields */
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK   0x000fffff
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT  0
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK    0xf0000000
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT   28
+
+enum nes_qp_tcp_state {
+	NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
+	NES_QPCONTEXT_TCPSTATE_EST = 5,
+	NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
+};
+
+/* QP Context PD Index/wscale Fields */
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK  0x0000000f
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK  0x00000f00
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK     0xffff0000
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT    16
+
+/* QP Context Keepalive Fields */
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK      0x0000ffff
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT     0
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK  0x00ff0000
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
+#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK       0xff000000
+#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT      24
+
+/* QP Context ORD/IRD Fields */
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK       0x0000007f
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT      0
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK       0x00030000
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT      16
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK    0x30000000
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT   28
+
+enum nes_ord_ird_bits {
+	NES_QPCONTEXT_ORDIRD_WRPDU                   = 0x02000000,
+	NES_QPCONTEXT_ORDIRD_LSMM_PRESENT            = 0x04000000,
+	NES_QPCONTEXT_ORDIRD_ALSMM                   = 0x08000000,
+	NES_QPCONTEXT_ORDIRD_AAH                     = 0x40000000,
+	NES_QPCONTEXT_ORDIRD_RNMC                    = 0x80000000
+};
+
+enum nes_iwarp_qp_state {
+	NES_QPCONTEXT_IWARP_STATE_NONEXIST  = 0,
+	NES_QPCONTEXT_IWARP_STATE_IDLE      = 1,
+	NES_QPCONTEXT_IWARP_STATE_RTS       = 2,
+	NES_QPCONTEXT_IWARP_STATE_CLOSING   = 3,
+	NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
+	NES_QPCONTEXT_IWARP_STATE_ERROR     = 6
+};
+
+
+#endif		/* NES_CONTEXT_H */
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
new file mode 100644
index 0000000..7c4c0fb
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -0,0 +1,3080 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+
+#include "nes.h"
+
+u32 crit_err_count = 0;
+u32 int_mod_timer_init;
+u32 int_mod_cq_depth_256;
+u32 int_mod_cq_depth_128;
+u32 int_mod_cq_depth_32;
+u32 int_mod_cq_depth_24;
+u32 int_mod_cq_depth_16;
+u32 int_mod_cq_depth_4;
+u32 int_mod_cq_depth_1;
+
+#include "nes_cm.h"
+
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+static unsigned char *nes_iwarp_state_str[] = {
+	"Non-Existant",
+	"Idle",
+	"RTS",
+	"Closing",
+	"RSVD1",
+	"Terminate",
+	"Error",
+	"RSVD2",
+};
+
+static unsigned char *nes_tcp_state_str[] = {
+	"Non-Existant",
+	"Closed",
+	"Listen",
+	"SYN Sent",
+	"SYN Rcvd",
+	"Established",
+	"Close Wait",
+	"FIN Wait 1",
+	"Closing",
+	"Last Ack",
+	"FIN Wait 2",
+	"Time Wait",
+	"RSVD1",
+	"RSVD2",
+	"RSVD3",
+	"RSVD4",
+};
+#endif
+
+
+/**
+ * nes_nic_init_timer_defaults
+ */
+void  nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
+	shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
+	if (jumbomode) {
+		shared_timer->threshold_low    = DEFAULT_JUMBO_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
+		shared_timer->threshold_high   = DEFAULT_JUMBO_NES_QL_HIGH;
+	} else {
+		shared_timer->threshold_low    = DEFAULT_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
+		shared_timer->threshold_high   = DEFAULT_NES_QL_HIGH;
+	}
+
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_init_timer
+ */
+static void  nes_nic_init_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	if (shared_timer->timer_in_use_old == 0) {
+		nesdev->deepcq_count = 0;
+		shared_timer->timer_direction_upward = 0;
+		shared_timer->timer_direction_downward = 0;
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
+		shared_timer->timer_in_use_old = 0;
+
+	}
+	if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
+		shared_timer->timer_in_use_old = shared_timer->timer_in_use;
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+			0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
+	}
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_tune_timer
+ */
+static void nes_nic_tune_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	u16 cq_count = nesdev->currcq_count;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	if (shared_timer->cq_count_old < cq_count) {
+		if (cq_count > shared_timer->threshold_low)
+			shared_timer->cq_direction_downward=0;
+	}
+	if (shared_timer->cq_count_old >= cq_count)
+		shared_timer->cq_direction_downward++;
+	shared_timer->cq_count_old = cq_count;
+	if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
+		if (cq_count <= shared_timer->threshold_low) {
+			shared_timer->threshold_low = shared_timer->threshold_low/2;
+			shared_timer->cq_direction_downward=0;
+			nesdev->currcq_count = 0;
+			spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+			return;
+		}
+	}
+
+	if (cq_count > 1) {
+		nesdev->deepcq_count += cq_count;
+		if (cq_count <= shared_timer->threshold_low) {       /* increase timer gently */
+			shared_timer->timer_direction_upward++;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_target) { /* balanced */
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_high) {  /* decrease timer gently */
+			shared_timer->timer_direction_downward++;
+			shared_timer->timer_direction_upward = 0;
+		} else if (cq_count <= (shared_timer->threshold_high) * 2) {
+			shared_timer->timer_in_use -= 2;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		} else {
+			shared_timer->timer_in_use -= 4;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		}
+
+		if (shared_timer->timer_direction_upward > 3 ) {  /* using history */
+			shared_timer->timer_in_use += 3;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		}
+		if (shared_timer->timer_direction_downward > 5) { /* using history */
+			shared_timer->timer_in_use -= 4 ;
+			shared_timer->timer_direction_downward = 0;
+			shared_timer->timer_direction_upward = 0;
+		}
+	}
+
+	/* boundary checking */
+	if (shared_timer->timer_in_use > NES_NIC_FAST_TIMER_HIGH)
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER_HIGH;
+	else if (shared_timer->timer_in_use < NES_NIC_FAST_TIMER_LOW) {
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER_LOW;
+	}
+
+	nesdev->currcq_count = 0;
+
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_init_adapter - initialize adapter
+ */
+struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
+	struct nes_adapter *nesadapter = NULL;
+	unsigned long num_pds;
+	u32 u32temp;
+	u32 port_count;
+	u16 max_rq_wrs;
+	u16 max_sq_wrs;
+	u32 max_mr;
+	u32 max_256pbl;
+	u32 max_4kpbl;
+	u32 max_qp;
+	u32 max_irrq;
+	u32 max_cq;
+	u32 hte_index_mask;
+	u32 adapter_size;
+	u32 arp_table_size;
+	u16 vendor_id;
+	u8  OneG_Mode;
+	u8  func_index;
+
+	/* search the list of existing adapters */
+	list_for_each_entry(nesadapter, &nes_adapter_list, list) {
+		nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X,"
+				" adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n",
+				nesdev->pcidev->devfn,
+				PCI_SLOT(nesadapter->devfn),
+				nesadapter->bus_number,
+				PCI_SLOT(nesdev->pcidev->devfn),
+				nesdev->pcidev->bus->number );
+		if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) &&
+				(nesadapter->bus_number == nesdev->pcidev->bus->number)) {
+			nesadapter->ref_count++;
+			return nesadapter;
+		}
+	}
+
+	/* no adapter found */
+	num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;
+	if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) {
+		nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n",
+				hw_rev);
+		return NULL;
+	}
+
+	nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n",
+			nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8));
+
+	nes_debug(NES_DBG_INIT, "Reset and init NE020\n");
+
+
+	if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0)
+		return NULL;
+	if (nes_init_serdes(nesdev, hw_rev, port_count, OneG_Mode))
+		return NULL;
+	nes_init_csr_ne020(nesdev, hw_rev, port_count);
+
+	max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE);
+	nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE);
+	if (max_qp > ((u32)1 << (u32temp & 0x001f))) {
+		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n",
+				max_qp, u32temp);
+		max_qp = (u32)1 << (u32temp & 0x001f);
+	}
+
+	hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1;
+	nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n",
+			max_qp, hte_index_mask);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT);
+
+	max_irrq = 1 << (u32temp & 0x001f);
+
+	if (max_qp > max_irrq) {
+		max_qp = max_irrq;
+		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n",
+				max_qp);
+	}
+
+	/* there should be no reason to allocate more pds than qps */
+	if (num_pds > max_qp)
+		num_pds = max_qp;
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE);
+	max_mr = (u32)8192 << (u32temp & 0x7);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE);
+	max_256pbl = (u32)1 << (u32temp & 0x0000001f);
+	max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f);
+	max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE);
+	arp_table_size = 1 << u32temp;
+
+	adapter_size = (sizeof(struct nes_adapter) +
+			(sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1));
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
+	adapter_size += sizeof(struct nes_qp **) * max_qp;
+
+	/* allocate a new adapter struct */
+	nesadapter = kzalloc(adapter_size, GFP_KERNEL);
+	if (nesadapter == NULL) {
+		return NULL;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n",
+			nesadapter, (u32)sizeof(struct nes_adapter), adapter_size);
+
+	/* populate the new nesadapter */
+	nesadapter->devfn = nesdev->pcidev->devfn;
+	nesadapter->bus_number = nesdev->pcidev->bus->number;
+	nesadapter->ref_count = 1;
+	nesadapter->timer_int_req = 0xffff0000;
+	nesadapter->OneG_Mode = OneG_Mode;
+	nesadapter->doorbell_start = nesdev->doorbell_region;
+
+	/* nesadapter->tick_delta = clk_divisor; */
+	nesadapter->hw_rev = hw_rev;
+	nesadapter->port_count = port_count;
+
+	nesadapter->max_qp = max_qp;
+	nesadapter->hte_index_mask = hte_index_mask;
+	nesadapter->max_irrq = max_irrq;
+	nesadapter->max_mr = max_mr;
+	nesadapter->max_256pbl = max_256pbl - 1;
+	nesadapter->max_4kpbl = max_4kpbl - 1;
+	nesadapter->max_cq = max_cq;
+	nesadapter->free_256pbl = max_256pbl - 1;
+	nesadapter->free_4kpbl = max_4kpbl - 1;
+	nesadapter->max_pd = num_pds;
+	nesadapter->arp_table_size = arp_table_size;
+
+	nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT;
+	if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) {
+		nesadapter->et_use_adaptive_rx_coalesce = 0;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
+		nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+	} else {
+		nesadapter->et_use_adaptive_rx_coalesce = 1;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
+		nesadapter->et_rx_coalesce_usecs_irq = 0;
+		printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __FUNCTION__);
+	}
+	/* Setup and enable the periodic timer */
+	if (nesadapter->et_rx_coalesce_usecs_irq)
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 |
+				((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8)));
+	else
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000);
+
+	nesadapter->base_pd = 1;
+
+	nesadapter->device_cap_flags =
+			IB_DEVICE_ZERO_STAG | IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW;
+
+	nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
+			[(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
+	nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)];
+	nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)];
+	nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)];
+	nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)];
+	nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
+
+
+	/* mark the usual suspect QPs and CQs as in use */
+	for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) {
+		set_bit(u32temp, nesadapter->allocated_qps);
+		set_bit(u32temp, nesadapter->allocated_cqs);
+	}
+
+	for (u32temp = 0; u32temp < 20; u32temp++)
+		set_bit(u32temp, nesadapter->allocated_pds);
+	u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES);
+
+	max_rq_wrs = ((u32temp >> 8) & 3);
+	switch (max_rq_wrs) {
+		case 0:
+			max_rq_wrs = 4;
+			break;
+		case 1:
+			max_rq_wrs = 16;
+			break;
+		case 2:
+			max_rq_wrs = 32;
+			break;
+		case 3:
+			max_rq_wrs = 512;
+			break;
+	}
+
+	max_sq_wrs = (u32temp & 3);
+	switch (max_sq_wrs) {
+		case 0:
+			max_sq_wrs = 4;
+			break;
+		case 1:
+			max_sq_wrs = 16;
+			break;
+		case 2:
+			max_sq_wrs = 32;
+			break;
+		case 3:
+			max_sq_wrs = 512;
+			break;
+	}
+	nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs);
+	nesadapter->max_irrq_wr = (u32temp >> 16) & 3;
+
+	nesadapter->max_sge = 4;
+	nesadapter->max_cqe = 32767;
+
+	if (nes_read_eeprom_values(nesdev, nesadapter)) {
+		printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
+		kfree(nesadapter);
+		return NULL;
+	}
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG);
+	nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG,
+			(u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff));
+
+	/* setup port configuration */
+	if (nesadapter->port_count == 1) {
+		u32temp = 0x00000000;
+		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT)
+			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002);
+		else
+			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+	} else {
+		if (nesadapter->port_count == 2)
+			u32temp = 0x00000044;
+		else
+			u32temp = 0x000000e4;
+		nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+	}
+
+	nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, u32temp);
+	nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n",
+			nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT));
+
+	spin_lock_init(&nesadapter->resource_lock);
+	spin_lock_init(&nesadapter->phy_lock);
+	spin_lock_init(&nesadapter->pbl_lock);
+	spin_lock_init(&nesadapter->periodic_timer_lock);
+
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]);
+
+	if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+		u32 pcs_control_status0, pcs_control_status1;
+		u32 reset_value;
+		u32 i = 0;
+		u32 int_cnt = 0;
+		u32 ext_cnt = 0;
+		unsigned long flags;
+		u32 j = 0;
+
+		pcs_control_status0 = nes_read_indexed(nesdev,
+			NES_IDX_PHY_PCS_CONTROL_STATUS0);
+		pcs_control_status1 = nes_read_indexed(nesdev,
+			NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+		for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+			pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+			if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+			    || (0x0F000100 == (pcs_control_status1 & 0x0F000100)))
+				int_cnt++;
+			msleep(1);
+		}
+		if (int_cnt > 1) {
+			spin_lock_irqsave(&nesadapter->phy_lock, flags);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+			mh_detected++;
+			reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+			reset_value |= 0x0000003d;
+			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+			while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+				& 0x00000040) != 0x00000040) && (j++ < 5000));
+			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+			pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+			for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+				pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+				pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+				if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+					|| (0x0F000100 == (pcs_control_status1 & 0x0F000100))) {
+					if (++ext_cnt > int_cnt) {
+						spin_lock_irqsave(&nesadapter->phy_lock, flags);
+						nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1,
+								0x0000F0C8);
+						mh_detected++;
+						reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+						reset_value |= 0x0000003d;
+						nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+						while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+							& 0x00000040) != 0x00000040) && (j++ < 5000));
+						spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+						break;
+					}
+				}
+				msleep(1);
+			}
+		}
+	}
+
+	if (nesadapter->hw_rev == NE020_REV) {
+		init_timer(&nesadapter->mh_timer);
+		nesadapter->mh_timer.function = nes_mh_fix;
+		nesadapter->mh_timer.expires = jiffies + (HZ/5);  /* 1 second */
+		nesadapter->mh_timer.data = (unsigned long)nesdev;
+		add_timer(&nesadapter->mh_timer);
+	} else {
+		nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000);
+	}
+
+	init_timer(&nesadapter->lc_timer);
+	nesadapter->lc_timer.function = nes_clc;
+	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
+	nesadapter->lc_timer.data = (unsigned long)nesdev;
+	add_timer(&nesadapter->lc_timer);
+
+	list_add_tail(&nesadapter->list, &nes_adapter_list);
+
+	for (func_index = 0; func_index < 8; func_index++) {
+		pci_bus_read_config_word(nesdev->pcidev->bus,
+					PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn),
+					func_index), 0, &vendor_id);
+		if (vendor_id == 0xffff)
+			break;
+	}
+	nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __FUNCTION__,
+		func_index, pci_name(nesdev->pcidev));
+	nesadapter->adapter_fcn_count = func_index;
+
+	return nesadapter;
+}
+
+
+/**
+ * nes_reset_adapter_ne020
+ */
+unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode)
+{
+	u32 port_count;
+	u32 u32temp;
+	u32 i;
+
+	u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+	port_count = ((u32temp & 0x00000300) >> 8) + 1;
+	/* TODO: assuming that both SERDES are set the same for now */
+	*OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1;
+	nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n",
+			u32temp, port_count);
+	if (*OneG_Mode)
+		nes_debug(NES_DBG_INIT, "Running in 1G mode.\n");
+	u32temp &= 0xff00ffc0;
+	switch (port_count) {
+		case 1:
+			u32temp |= 0x00ee0000;
+			break;
+		case 2:
+			u32temp |= 0x00cc0000;
+			break;
+		case 4:
+			u32temp |= 0x00000000;
+			break;
+		default:
+			return 0;
+			break;
+	}
+
+	/* check and do full reset if needed */
+	if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) {
+		nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd);
+		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+		i = 0;
+		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+			mdelay(1);
+		if (i >= 10000) {
+			nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n");
+			return 0;
+		}
+	}
+
+	/* port reset */
+	switch (port_count) {
+		case 1:
+			u32temp |= 0x00ee0010;
+			break;
+		case 2:
+			u32temp |= 0x00cc0030;
+			break;
+		case 4:
+			u32temp |= 0x00000030;
+			break;
+	}
+
+	nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd);
+	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+	i = 0;
+	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+		mdelay(1);
+	if (i >= 10000) {
+		nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n");
+		return 0;
+	}
+
+	/* serdes 0 */
+	i = 0;
+	while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+			& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+		mdelay(1);
+	if (i >= 5000) {
+		nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp);
+		return 0;
+	}
+
+	/* serdes 1 */
+	if (port_count > 1) {
+		i = 0;
+		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+			mdelay(1);
+		if (i >= 5000) {
+			nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp);
+			return 0;
+		}
+	}
+
+
+
+	i = 0;
+	while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000)
+		mdelay(1);
+	if (i >= 10000) {
+		printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n",
+				nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS));
+		return 0;
+	}
+
+	return port_count;
+}
+
+
+/**
+ * nes_init_serdes
+ */
+int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, u8  OneG_Mode)
+{
+	int i;
+	u32 u32temp;
+
+	if (hw_rev != NE020_REV) {
+		/* init serdes 0 */
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+		if (!OneG_Mode)
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
+		if (port_count > 1) {
+			/* init serdes 1 */
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF);
+			if (!OneG_Mode)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000);
+			}
+	} else {
+		/* init serdes 0 */
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+		i = 0;
+		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+			mdelay(1);
+		if (i >= 5000) {
+			nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp);
+			return 1;
+		}
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+		if (OneG_Mode)
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+		else
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+		if (port_count > 1) {
+			/* init serdes 1 */
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048);
+			i = 0;
+			while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+				& 0x0000000f)) != 0x0000000f) && (i++ < 5000))
+				mdelay(1);
+			if (i >= 5000) {
+				printk("%s: Init: serdes 1 not ready, status=%x\n", __FUNCTION__, u32temp);
+				/* return 1; */
+			}
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff);
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * nes_init_csr_ne020
+ * Initialize registers for ne020 hardware
+ */
+void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count)
+{
+	u32 u32temp;
+
+	nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count);
+
+	nes_write_indexed(nesdev, 0x000001E4, 0x00000007);
+	/* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */
+	nes_write_indexed(nesdev, 0x000001E8, 0x00020874);
+	nes_write_indexed(nesdev, 0x000001D8, 0x00048002);
+	/* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */
+	nes_write_indexed(nesdev, 0x000001FC, 0x00050005);
+	nes_write_indexed(nesdev, 0x00000600, 0x55555555);
+	nes_write_indexed(nesdev, 0x00000604, 0x55555555);
+
+	/* TODO: move these MAC register settings to NIC bringup */
+	nes_write_indexed(nesdev, 0x00002000, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002004, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF);
+	nes_write_indexed(nesdev, 0x0000200C, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002010, 0x000003c1);
+	nes_write_indexed(nesdev, 0x0000201C, 0x75345678);
+	if (port_count > 1) {
+		nes_write_indexed(nesdev, 0x00002200, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002204, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000220C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002210, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000221C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000908, 0x20000001);
+	}
+	if (port_count > 2) {
+		nes_write_indexed(nesdev, 0x00002400, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002404, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000240C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002410, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000241C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000910, 0x20000001);
+
+		nes_write_indexed(nesdev, 0x00002600, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002604, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000260C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002610, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000261C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000918, 0x20000001);
+	}
+
+	nes_write_indexed(nesdev, 0x00005000, 0x00018000);
+	/* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */
+	nes_write_indexed(nesdev, 0x00005004, 0x00020001);
+	nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF);
+
+	/* TODO: move this to code, get from EEPROM */
+	nes_write_indexed(nesdev, 0x00000900, 0x20000001);
+	nes_write_indexed(nesdev, 0x000060C0, 0x0000028e);
+	nes_write_indexed(nesdev, 0x000060C8, 0x00000020);
+														//
+	nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0);
+	/* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */
+
+	if (hw_rev != NE020_REV) {
+		u32temp = nes_read_indexed(nesdev, 0x000008e8);
+		u32temp |= 0x80000000;
+		nes_write_indexed(nesdev, 0x000008e8, u32temp);
+		u32temp = nes_read_indexed(nesdev, 0x000021f8);
+		u32temp &= 0x7fffffff;
+		u32temp |= 0x7fff0010;
+		nes_write_indexed(nesdev, 0x000021f8, u32temp);
+	}
+}
+
+
+/**
+ * nes_destroy_adapter - destroy the adapter structure
+ */
+void nes_destroy_adapter(struct nes_adapter *nesadapter)
+{
+	struct nes_adapter *tmp_adapter;
+
+	list_for_each_entry(tmp_adapter, &nes_adapter_list, list) {
+		nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n",
+				tmp_adapter);
+	}
+
+	nesadapter->ref_count--;
+	if (!nesadapter->ref_count) {
+		if (nesadapter->hw_rev == NE020_REV) {
+			del_timer(&nesadapter->mh_timer);
+		}
+		del_timer(&nesadapter->lc_timer);
+
+		list_del(&nesadapter->list);
+		kfree(nesadapter);
+	}
+}
+
+
+/**
+ * nes_init_cqp
+ */
+int nes_init_cqp(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_qp_context *cqp_qp_context;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_ceq *ceq;
+	struct nes_hw_ceq *nic_ceq;
+	struct nes_hw_aeq *aeq;
+	void *vmem;
+	dma_addr_t pmem;
+	u32 count=0;
+	u32 cqp_head;
+	u64 u64temp;
+	u32 u32temp;
+
+	/* allocate CQP memory */
+	/* Need to add max_cq to the aeq size once cq overflow checking is added back */
+	/* SQ is 512 byte aligned, others are 256 byte aligned */
+	nesdev->cqp_mem_size = 512 +
+			(sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) +
+			(sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) +
+			max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) +
+			max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) +
+			(sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) +
+			sizeof(struct nes_hw_cqp_qp_context);
+
+	nesdev->cqp_vbase = pci_alloc_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+			&nesdev->cqp_pbase);
+	if (!nesdev->cqp_vbase) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n");
+		return -ENOMEM;
+	}
+	memset(nesdev->cqp_vbase, 0, nesdev->cqp_mem_size);
+
+	/* Allocate a twice the number of CQP requests as the SQ size */
+	nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) *
+			2 * NES_CQP_SQ_SIZE, GFP_KERNEL);
+	if (nesdev->nes_cqp_requests == NULL) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n");
+		pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+				nesdev->cqp.sq_pbase);
+		return -ENOMEM;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n",
+			nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size);
+
+	spin_lock_init(&nesdev->cqp.lock);
+	init_waitqueue_head(&nesdev->cqp.waitq);
+
+	/* Setup Various Structures */
+	vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) &
+			~(unsigned long)(512 - 1));
+	pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) &
+			~(unsigned long long)(512 - 1));
+
+	nesdev->cqp.sq_vbase = vmem;
+	nesdev->cqp.sq_pbase = pmem;
+	nesdev->cqp.sq_size = NES_CQP_SQ_SIZE;
+	nesdev->cqp.sq_head = 0;
+	nesdev->cqp.sq_tail = 0;
+	nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn);
+
+	vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+	pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+
+	nesdev->ccq.cq_vbase = vmem;
+	nesdev->ccq.cq_pbase = pmem;
+	nesdev->ccq.cq_size = NES_CCQ_SIZE;
+	nesdev->ccq.cq_head = 0;
+	nesdev->ccq.ce_handler = nes_cqp_ce_handler;
+	nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn);
+
+	vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+	pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+
+	nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn);
+	ceq = &nesadapter->ceq[nesdev->ceq_index];
+	ceq->ceq_vbase = vmem;
+	ceq->ceq_pbase = pmem;
+	ceq->ceq_size = NES_CCEQ_SIZE;
+	ceq->ceq_head = 0;
+
+	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+
+	nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8;
+	nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index];
+	nic_ceq->ceq_vbase = vmem;
+	nic_ceq->ceq_pbase = pmem;
+	nic_ceq->ceq_size = NES_NIC_CEQ_SIZE;
+	nic_ceq->ceq_head = 0;
+
+	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+
+	aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)];
+	aeq->aeq_vbase = vmem;
+	aeq->aeq_pbase = pmem;
+	aeq->aeq_size = nesadapter->max_qp;
+	aeq->aeq_head = 0;
+
+	/* Setup QP Context */
+	vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+	pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+
+	cqp_qp_context = vmem;
+	cqp_qp_context->context_words[0] =
+			cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10));
+	cqp_qp_context->context_words[1] = 0;
+	cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase);
+	cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32);
+
+
+	/* Write the address to Create CQP */
+	if ((sizeof(dma_addr_t) > 4)) {
+		nes_write_indexed(nesdev,
+				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+				((u64)pmem) >> 32);
+	} else {
+		nes_write_indexed(nesdev,
+				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0);
+	}
+	nes_write_indexed(nesdev,
+			NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+			(u32)pmem);
+
+	INIT_LIST_HEAD(&nesdev->cqp_avail_reqs);
+	INIT_LIST_HEAD(&nesdev->cqp_pending_reqs);
+
+	for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) {
+		init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq);
+		list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs);
+	}
+
+	/* Write Create CCQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			    (nesdev->ccq.cq_number |
+			     ((u32)nesdev->ceq_index << 16)));
+	u64temp = (u64)nesdev->ccq.cq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+	u64temp = (unsigned long)&nesdev->ccq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+			cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+
+	/* Write Create CEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			    (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size);
+	u64temp = (u64)ceq->ceq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Write Create AEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size);
+	u64temp = (u64)aeq->aeq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Write Create NIC CEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size);
+	u64temp = (u64)nic_ceq->ceq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Poll until CCQP done */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Error creating CQP\n");
+			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+					nesdev->cqp_vbase, nesdev->cqp_pbase);
+			return -1;
+		}
+		udelay(10);
+	} while (!(nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8)));
+
+	nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	u32temp = 0x04800000;
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id);
+
+	/* wait for the CCQ, CEQ, and AEQ to get created */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n");
+			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+					nesdev->cqp_vbase, nesdev->cqp_pbase);
+			return -1;
+		}
+		udelay(10);
+	} while (((nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8)));
+
+	/* dump the QP status value */
+	nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	nesdev->cqp.sq_tail++;
+
+	return 0;
+}
+
+
+/**
+ * nes_destroy_cqp
+ */
+int nes_destroy_cqp(struct nes_device *nesdev)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 count = 0;
+	u32 cqp_head;
+	unsigned long flags;
+
+	do {
+		if (count++ > 1000)
+			break;
+		udelay(10);
+	} while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail));
+
+	/* Reset CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET |
+			nesdev->ccq.cq_number);
+
+	/* Disable device interrupts */
+	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	/* Destroy the AEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ |
+			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8));
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
+
+	/* Destroy the NIC CEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+			((u32)nesdev->nic_ceq_index << 8));
+
+	/* Destroy the CEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+			(nesdev->ceq_index << 8));
+
+	/* Destroy the CCQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number |
+			((u32)nesdev->ceq_index << 16));
+
+	/* Destroy CQP */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP |
+			NES_CQP_QP_TYPE_CQP);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id);
+
+	barrier();
+	/* Ring doorbell (5 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	/* wait for the CCQ, CEQ, and AEQ to get destroyed */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n",
+					PCI_FUNC(nesdev->pcidev->devfn));
+			break;
+		}
+		udelay(10);
+	} while (((nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0));
+
+	/* dump the QP status value */
+	nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n",
+			PCI_FUNC(nesdev->pcidev->devfn),
+			nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	kfree(nesdev->nes_cqp_requests);
+
+	/* Free the control structures */
+	pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+			nesdev->cqp.sq_pbase);
+
+	return 0;
+}
+
+
+/**
+ * nes_init_phy
+ */
+int nes_init_phy(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 counter = 0;
+	u32 mac_index = nesdev->mac_index;
+	u32 tx_config;
+	u16 phy_data;
+
+	if (nesadapter->OneG_Mode) {
+		nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index);
+		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_1G) {
+			printk(PFX "%s: Programming mdc config for 1G\n", __FUNCTION__);
+			tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+			tx_config |= 0x04;
+			nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+		}
+
+		nes_read_1G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 1 phy address %u = 0x%X.\n",
+				nesadapter->phy_index[mac_index], phy_data);
+		nes_write_1G_phy_reg(nesdev, 23, nesadapter->phy_index[mac_index],  0xb000);
+
+		/* Reset the PHY */
+		nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], 0x8000);
+		udelay(100);
+		counter = 0;
+		do {
+			nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data);
+			nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data);
+			if (counter++ > 100) break;
+		} while (phy_data & 0x8000);
+
+		/* Setting no phy loopback */
+		phy_data &= 0xbfff;
+		phy_data |= 0x1140;
+		nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index],  phy_data);
+		nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0 = 0x%X.\n", phy_data);
+
+		nes_read_1G_phy_reg(nesdev, 0x17, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x17 = 0x%X.\n", phy_data);
+
+		nes_read_1G_phy_reg(nesdev, 0x1e, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x1e = 0x%X.\n", phy_data);
+
+		/* Setting the interrupt mask */
+		nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data);
+		nes_write_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], 0xffee);
+
+		nes_read_1G_phy_reg(nesdev, 0x19, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x19 = 0x%X.\n", phy_data);
+
+		/* turning on flow control */
+		nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data);
+		nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index],
+				(phy_data & ~(0x03E0)) | 0xc00);
+		/* nes_write_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index],
+				phy_data | 0xc00); */
+		nes_read_1G_phy_reg(nesdev, 4, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x4 = 0x%X.\n", phy_data);
+
+		nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data);
+		/* Clear Half duplex */
+		nes_write_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index],
+				phy_data & ~(0x0100));
+		nes_read_1G_phy_reg(nesdev, 9, nesadapter->phy_index[mac_index], &phy_data);
+		nes_debug(NES_DBG_PHY, "Phy data from register 0x9 = 0x%X.\n", phy_data);
+
+		nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data);
+		nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], phy_data | 0x0300);
+	} else {
+		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) {
+			/* setup 10G MDIO operation */
+			tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+			tx_config |= 0x14;
+			nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * nes_replenish_nic_rq
+ */
+static void nes_replenish_nic_rq(struct nes_vnic *nesvnic)
+{
+	unsigned long flags;
+	dma_addr_t bus_address;
+	struct sk_buff *skb;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_hw_nic *nesnic;
+	struct nes_device *nesdev;
+	u32 rx_wqes_posted = 0;
+
+	nesnic = &nesvnic->nic;
+	nesdev = nesvnic->nesdev;
+	spin_lock_irqsave(&nesnic->rq_lock, flags);
+	if (nesnic->replenishing_rq !=0) {
+		if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+				(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+			atomic_set(&nesvnic->rx_skb_timer_running, 1);
+			spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+			nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
+			add_timer(&nesvnic->rq_wqes_timer);
+		} else
+		spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+		return;
+	}
+	nesnic->replenishing_rq = 1;
+	spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+	do {
+		skb = dev_alloc_skb(nesvnic->max_frame_size);
+		if (skb) {
+			skb->dev = nesvnic->netdev;
+
+			bus_address = pci_map_single(nesdev->pcidev,
+					skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+
+			nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head];
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
+					cpu_to_le32(nesvnic->max_frame_size);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
+					cpu_to_le32((u32)bus_address);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
+					cpu_to_le32((u32)((u64)bus_address >> 32));
+			nesnic->rx_skb[nesnic->rq_head] = skb;
+			nesnic->rq_head++;
+			nesnic->rq_head &= nesnic->rq_size - 1;
+			atomic_dec(&nesvnic->rx_skbs_needed);
+			barrier();
+			if (++rx_wqes_posted == 255) {
+				nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+				rx_wqes_posted = 0;
+			}
+		} else {
+			spin_lock_irqsave(&nesnic->rq_lock, flags);
+			if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+					(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+				atomic_set(&nesvnic->rx_skb_timer_running, 1);
+				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+				nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
+				add_timer(&nesvnic->rq_wqes_timer);
+			} else
+				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+			break;
+		}
+	} while (atomic_read(&nesvnic->rx_skbs_needed));
+	barrier();
+	if (rx_wqes_posted)
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+	nesnic->replenishing_rq = 0;
+}
+
+
+/**
+ * nes_rq_wqes_timeout
+ */
+static void nes_rq_wqes_timeout(unsigned long parm)
+{
+	struct nes_vnic *nesvnic = (struct nes_vnic *)parm;
+	printk("%s: Timer fired.\n", __FUNCTION__);
+	atomic_set(&nesvnic->rx_skb_timer_running, 0);
+	if (atomic_read(&nesvnic->rx_skbs_needed))
+		nes_replenish_nic_rq(nesvnic);
+}
+
+
+/**
+ * nes_init_nic_qp
+ */
+int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct nes_hw_nic_qp_context *nic_context;
+	struct sk_buff *skb;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	unsigned long flags;
+	void *vmem;
+	dma_addr_t pmem;
+	u64 u64temp;
+	int ret;
+	u32 cqp_head;
+	u32 counter;
+	u32 wqe_count;
+	u8 jumbomode=0;
+
+	/* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */
+	nesvnic->nic_mem_size = 256 +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) +
+			(NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) +
+			sizeof(struct nes_hw_nic_qp_context);
+
+	nesvnic->nic_vbase = pci_alloc_consistent(nesdev->pcidev, nesvnic->nic_mem_size,
+			&nesvnic->nic_pbase);
+	if (!nesvnic->nic_vbase) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n");
+		return -ENOMEM;
+	}
+	memset(nesvnic->nic_vbase, 0, nesvnic->nic_mem_size);
+	nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n",
+			nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size);
+
+	vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) &
+			~(unsigned long)(256 - 1));
+	pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) &
+			~(unsigned long long)(256 - 1));
+
+	/* Setup the first Fragment buffers */
+	nesvnic->nic.first_frag_vbase = vmem;
+
+	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+		nesvnic->nic.frag_paddr[counter] = pmem;
+		pmem += sizeof(struct nes_first_frag);
+	}
+
+	/* setup the SQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag));
+
+	nesvnic->nic.sq_vbase = (void *)vmem;
+	nesvnic->nic.sq_pbase = pmem;
+	nesvnic->nic.sq_head = 0;
+	nesvnic->nic.sq_tail = 0;
+	nesvnic->nic.sq_size = NES_NIC_WQ_SIZE;
+	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+		nic_sqe = &nesvnic->nic.sq_vbase[counter];
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] =
+				cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM |
+				NES_NIC_SQ_WQE_COMPLETION);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] =
+				cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] =
+				cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] =
+				cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32));
+	}
+
+	nesvnic->get_cqp_request = nes_get_cqp_request;
+	nesvnic->post_cqp_request = nes_post_cqp_request;
+	nesvnic->mcrq_mcast_filter = NULL;
+
+	spin_lock_init(&nesvnic->nic.sq_lock);
+	spin_lock_init(&nesvnic->nic.rq_lock);
+
+	/* setup the RQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+
+
+	nesvnic->nic.rq_vbase = vmem;
+	nesvnic->nic.rq_pbase = pmem;
+	nesvnic->nic.rq_head = 0;
+	nesvnic->nic.rq_tail = 0;
+	nesvnic->nic.rq_size = NES_NIC_WQ_SIZE;
+
+	/* setup the CQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+
+	if (nesdev->nesadapter->netdev_count > 2)
+		nesvnic->mcrq_qp_id = nesvnic->nic_index + 32;
+	else
+		nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4;
+
+	nesvnic->nic_cq.cq_vbase = vmem;
+	nesvnic->nic_cq.cq_pbase = pmem;
+	nesvnic->nic_cq.cq_head = 0;
+	nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2;
+
+	nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler;
+
+	/* Send CreateCQ request to CQP */
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+	cqp_head = nesdev->cqp.sq_head;
+
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+			NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			((u32)nesvnic->nic_cq.cq_size << 16));
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
+			nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16));
+	u64temp = (u64)nesvnic->nic_cq.cq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =  0;
+	u64temp = (unsigned long)&nesvnic->nic_cq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =  cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	/* Send CreateQP request to CQP */
+	nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]);
+	nic_context->context_words[NES_NIC_CTX_MISC_IDX] =
+			cpu_to_le32((u32)NES_NIC_CTX_SIZE |
+			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
+	nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
+			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
+			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
+	if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) {
+		nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
+	}
+
+	u64temp = (u64)nesvnic->nic.sq_pbase;
+	nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+	nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+	u64temp = (u64)nesvnic->nic.rq_pbase;
+	nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+	nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
+			NES_CQP_QP_TYPE_NIC);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id);
+	u64temp = (u64)nesvnic->nic_cq.cq_pbase +
+			(nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+	nesdev->cqp.sq_head = cqp_head;
+
+	barrier();
+
+	/* Ring doorbell (2 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n",
+			nesvnic->nic.qp_id);
+
+	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n",
+			nesvnic->nic.qp_id, ret);
+	if (!ret) {
+		nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id);
+		pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+				nesvnic->nic_pbase);
+		return -EIO;
+	}
+
+	/* Populate the RQ */
+	for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) {
+		skb = dev_alloc_skb(nesvnic->max_frame_size);
+		if (!skb) {
+			nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
+
+			nes_destroy_nic_qp(nesvnic);
+			return -ENOMEM;
+		}
+
+		skb->dev = netdev;
+
+		pmem = pci_map_single(nesdev->pcidev, skb->data,
+				nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+
+		nic_rqe = &nesvnic->nic.rq_vbase[counter];
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size);
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
+		nesvnic->nic.rx_skb[counter] = skb;
+	}
+
+	wqe_count = NES_NIC_WQ_SIZE - 1;
+	nesvnic->nic.rq_head = wqe_count;
+	barrier();
+	do {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id);
+	} while (wqe_count);
+	init_timer(&nesvnic->rq_wqes_timer);
+	nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout;
+	nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic;
+	nes_debug(NES_DBG_INIT, "NAPI support Enabled\n");
+
+	if (nesdev->nesadapter->et_use_adaptive_rx_coalesce)
+	{
+		nes_nic_init_timer(nesdev);
+		if (netdev->mtu > 1500)
+			jumbomode = 1;
+                nes_nic_init_timer_defaults(nesdev, jumbomode);
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_destroy_nic_qp
+ */
+void nes_destroy_nic_qp(struct nes_vnic *nesvnic)
+{
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	u64 wqe_frag;
+	u32 cqp_head;
+	unsigned long flags;
+	int ret;
+
+	/* Free remaining NIC receive buffers */
+	while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) {
+		nic_rqe = &nesvnic->nic.rq_vbase[nesvnic->nic.rq_tail];
+		wqe_frag = (u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
+		wqe_frag |= ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
+		pci_unmap_single(nesdev->pcidev, (dma_addr_t)wqe_frag,
+				nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+		dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]);
+		nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1);
+	}
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	/* Destroy NIC QP */
+	cqp_head = nesdev->cqp.sq_head;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+		(NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		nesvnic->nic.qp_id);
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+
+	/* Destroy NIC CQ */
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+		(NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		(nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)));
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+
+	nesdev->cqp.sq_head = cqp_head;
+	barrier();
+
+	/* Ring doorbell (2 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
+			" cqp.sq_tail=%u, cqp.sq_size=%u\n",
+			cqp_head, nesdev->cqp.sq_head,
+			nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
+
+	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+			NES_EVENT_TIMEOUT);
+
+	nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
+			" cqp.sq_head=%u, cqp.sq_tail=%u\n",
+			ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+	if (!ret) {
+		nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n",
+				nesvnic->nic.qp_id);
+	}
+
+	pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+			nesvnic->nic_pbase);
+}
+
+/**
+ * nes_napi_isr
+ */
+int nes_napi_isr(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 int_stat;
+
+	if (nesdev->napi_isr_ran) {
+		/* interrupt status has already been read in ISR */
+		int_stat = nesdev->int_stat;
+	} else {
+		int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+		nesdev->int_stat = int_stat;
+		nesdev->napi_isr_ran = 1;
+	}
+
+	int_stat &= nesdev->int_req;
+	/* iff NIC, process here, else wait for DPC */
+	if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) {
+		nesdev->napi_isr_ran = 0;
+		nes_write32(nesdev->regs+NES_INT_STAT,
+				(int_stat &
+				~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3)));
+
+		/* Process the CEQs */
+		nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]);
+
+		if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) &&
+					   (!nesadapter->et_use_adaptive_rx_coalesce)) ||
+					  ((nesadapter->et_use_adaptive_rx_coalesce) &&
+					   (nesdev->deepcq_count > nesadapter->et_pkt_rate_low)))) ) {
+			if ((nesdev->int_req & NES_INT_TIMER) == 0) {
+				/* Enable Periodic timer interrupts */
+				nesdev->int_req |= NES_INT_TIMER;
+				/* ack any pending periodic timer interrupts so we don't get an immediate interrupt */
+				/* TODO: need to also ack other unused periodic timer values, get from nesadapter */
+				nes_write32(nesdev->regs+NES_TIMER_STAT,
+						nesdev->timer_int_req  | ~(nesdev->nesadapter->timer_int_req));
+				nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+						~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+			}
+
+			if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+			{
+				nes_nic_init_timer(nesdev);
+			}
+			/* Enable interrupts, except CEQs */
+			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+		} else {
+			/* Enable interrupts, make sure timer is off */
+			nesdev->int_req &= ~NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+			nesadapter->tune_timer.timer_in_use_old = 0;
+		}
+		nesdev->deepcq_count = 0;
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+
+/**
+ * nes_dpc
+ */
+void nes_dpc(unsigned long param)
+{
+	struct nes_device *nesdev = (struct nes_device *)param;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 counter;
+	u32 loop_counter = 0;
+	u32 int_status_bit;
+	u32 int_stat;
+	u32 timer_stat;
+	u32 temp_int_stat;
+	u32 intf_int_stat;
+	u32 debug_error;
+	u32 processed_intf_int = 0;
+	u16 processed_timer_int = 0;
+	u16 completion_ints = 0;
+	u16 timer_ints = 0;
+
+	/* nes_debug(NES_DBG_ISR, "\n"); */
+
+	do {
+		timer_stat = 0;
+		if (nesdev->napi_isr_ran) {
+			nesdev->napi_isr_ran = 0;
+			int_stat = nesdev->int_stat;
+		} else
+			int_stat = nes_read32(nesdev->regs+NES_INT_STAT);
+		if (processed_intf_int != 0)
+			int_stat &= nesdev->int_req & ~NES_INT_INTF;
+		else
+			int_stat &= nesdev->int_req;
+		if (processed_timer_int == 0) {
+			processed_timer_int = 1;
+			if (int_stat & NES_INT_TIMER) {
+				timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+				if ((timer_stat & nesdev->timer_int_req) == 0) {
+					int_stat &= ~NES_INT_TIMER;
+				}
+			}
+		} else {
+			int_stat &= ~NES_INT_TIMER;
+		}
+
+		if (int_stat) {
+			if (int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|
+					NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3)) {
+				/* Ack the interrupts */
+				nes_write32(nesdev->regs+NES_INT_STAT,
+						(int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|
+						NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3)));
+			}
+
+			temp_int_stat = int_stat;
+			for (counter = 0, int_status_bit = 1; counter < 16; counter++) {
+				if (int_stat & int_status_bit) {
+					nes_process_ceq(nesdev, &nesadapter->ceq[counter]);
+					temp_int_stat &= ~int_status_bit;
+					completion_ints = 1;
+				}
+				if (!(temp_int_stat & 0x0000ffff))
+					break;
+				int_status_bit <<= 1;
+			}
+
+			/* Process the AEQ for this pci function */
+			int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn));
+			if (int_stat & int_status_bit) {
+				nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]);
+			}
+
+			/* Process the MAC interrupt for this pci function */
+			int_status_bit = 1 << (24 + nesdev->mac_index);
+			if (int_stat & int_status_bit) {
+				nes_process_mac_intr(nesdev, nesdev->mac_index);
+			}
+
+			if (int_stat & NES_INT_TIMER) {
+				if (timer_stat & nesdev->timer_int_req) {
+					nes_write32(nesdev->regs + NES_TIMER_STAT,
+							(timer_stat & nesdev->timer_int_req) |
+							~(nesdev->nesadapter->timer_int_req));
+					timer_ints = 1;
+				}
+			}
+
+			if (int_stat & NES_INT_INTF) {
+				processed_intf_int = 1;
+				intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+				intf_int_stat &= nesdev->intf_int_req;
+				if (NES_INTF_INT_CRITERR & intf_int_stat) {
+					debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS);
+					printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n",
+							(u16)debug_error);
+					nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS,
+							0x01010000 | (debug_error & 0x0000ffff));
+					/* BUG(); */
+					if (crit_err_count++ > 10)
+						nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17);
+				}
+				if (NES_INTF_INT_PCIERR & intf_int_stat) {
+					printk(KERN_ERR PFX "PCI Error reported by device!!!\n");
+					BUG();
+				}
+				if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) {
+					printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n");
+					BUG();
+				}
+				nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat);
+			}
+
+			if (int_stat & NES_INT_TSW) {
+			}
+		}
+		/* Don't use the interface interrupt bit stay in loop */
+		int_stat &= ~NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|
+				NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3;
+	} while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS));
+
+	if (timer_ints == 1) {
+		if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) {
+			if (completion_ints == 0) {
+				nesdev->timer_only_int_count++;
+				if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) {
+					nesdev->timer_only_int_count = 0;
+					nesdev->int_req &= ~NES_INT_TIMER;
+					nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+					nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+					nesdev->nesadapter->tune_timer.timer_in_use_old = 0;
+				} else {
+					nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req));
+				}
+			} else {
+				if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+				{
+					nes_nic_init_timer(nesdev);
+				}
+				nesdev->timer_only_int_count = 0;
+				nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req));
+			}
+		} else {
+			nesdev->timer_only_int_count = 0;
+			nesdev->int_req &= ~NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+			nes_write32(nesdev->regs+NES_TIMER_STAT,
+					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+		}
+	} else {
+		if ( (completion_ints == 1) &&
+			 (((nesadapter->et_rx_coalesce_usecs_irq) &&
+			   (!nesadapter->et_use_adaptive_rx_coalesce)) ||
+			  ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) &&
+			   (nesadapter->et_use_adaptive_rx_coalesce) )) ) {
+			/* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */
+			nesdev->timer_only_int_count = 0;
+			nesdev->int_req |= NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_TIMER_STAT,
+					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+					~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+		} else {
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+		}
+	}
+	nesdev->deepcq_count = 0;
+}
+
+
+/**
+ * nes_process_ceq
+ */
+void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq)
+{
+	u64 u64temp;
+	struct nes_hw_cq *cq;
+	u32 head;
+	u32 ceq_size;
+
+	/* nes_debug(NES_DBG_CQ, "\n"); */
+	head = ceq->ceq_head;
+	ceq_size = ceq->ceq_size;
+
+	do {
+		if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) &
+				NES_CEQE_VALID) {
+			u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX])))<<32) |
+						((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX])));
+			u64temp <<= 1;
+			cq = *((struct nes_hw_cq **)&u64temp);
+			/* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */
+			barrier();
+			ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0;
+
+			/* call the event handler */
+			cq->ce_handler(nesdev, cq);
+
+			if (++head >= ceq_size)
+				head = 0;
+		} else {
+			break;
+		}
+
+	} while (1);
+
+	ceq->ceq_head = head;
+}
+
+
+/**
+ * nes_process_aeq
+ */
+void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
+{
+//	u64 u64temp;
+	u32 head;
+	u32 aeq_size;
+	u32 aeqe_misc;
+	u32 aeqe_cq_id;
+	struct nes_hw_aeqe volatile *aeqe;
+
+	head = aeq->aeq_head;
+	aeq_size = aeq->aeq_size;
+
+	do {
+		aeqe = &aeq->aeq_vbase[head];
+		if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0)
+			break;
+		aeqe_misc  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+		aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
+		if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) {
+			if (aeqe_cq_id >= NES_FIRST_QPN) {
+				/* dealing with an accelerated QP related AE */
+//				u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])))<<32) |
+//					((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX])));
+				nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe);
+			} else {
+				/* TODO: dealing with a CQP related AE */
+				nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n",
+						(u16)(aeqe_misc >> 16));
+			}
+		}
+
+		aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0;
+
+		if (++head >= aeq_size)
+			head = 0;
+	}
+	while (1);
+	aeq->aeq_head = head;
+}
+
+static void nes_reset_link(struct nes_device *nesdev, u32 mac_index)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 reset_value;
+	u32 i=0;
+	u32 u32temp;
+
+	if (nesadapter->hw_rev == NE020_REV) {
+		return;
+	}
+	mh_detected++;
+
+	reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+	if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode)))
+		reset_value |= 0x0000001d;
+	else
+		reset_value |= 0x0000002d;
+
+	if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) {
+		if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+			nesadapter->link_interrupt_count[0] = 0;
+			nesadapter->link_interrupt_count[1] = 0;
+			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			if (0x00000040 & u32temp)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+			reset_value |= 0x0000003d;
+		}
+		nesadapter->link_interrupt_count[mac_index] = 0;
+	}
+
+	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+			& 0x00000040) != 0x00000040) && (i++ < 5000));
+
+	if (0x0000003d == (reset_value & 0x0000003d)) {
+		u32 pcs_control_status0, pcs_control_status1;
+
+		for (i = 0; i < 10; i++) {
+			pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+			if (((0x0F000000 == (pcs_control_status0 & 0x0F000000))
+			     && (pcs_control_status0 & 0x00100000))
+			    || ((0x0F000000 == (pcs_control_status1 & 0x0F000000))
+				&& (pcs_control_status1 & 0x00100000)))
+				continue;
+			else
+				break;
+		}
+		if (10 == i) {
+			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			if (0x00000040 & u32temp)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+			while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET)
+				 & 0x00000040) != 0x00000040) && (i++ < 5000));
+		}
+	}
+}
+
+/**
+ * nes_process_mac_intr
+ */
+void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
+{
+	unsigned long flags;
+	u32 pcs_control_status;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_vnic *nesvnic;
+	u32 mac_status;
+	u32 mac_index = nesdev->mac_index;
+	u32 u32temp;
+	u16 phy_data;
+	u16 temp_phy_data;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+	if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) {
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+		return;
+	}
+	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+	/* ack the MAC interrupt */
+	mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
+	/* Clear the interrupt */
+	nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status);
+
+	nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status);
+
+	if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
+		nesdev->link_status_interrupts++;
+		if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) {
+			spin_lock_irqsave(&nesadapter->phy_lock, flags);
+			nes_reset_link(nesdev, mac_index);
+			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+		}
+		/* read the PHY interrupt status register */
+		if (nesadapter->OneG_Mode) {
+			do {
+				nes_read_1G_phy_reg(nesdev, 0x1a,
+						nesadapter->phy_index[mac_index], &phy_data);
+				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n",
+						nesadapter->phy_index[mac_index], phy_data);
+			} while (phy_data&0x8000);
+
+			temp_phy_data = 0;
+			do {
+				nes_read_1G_phy_reg(nesdev, 0x11,
+						nesadapter->phy_index[mac_index], &phy_data);
+				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n",
+						nesadapter->phy_index[mac_index], phy_data);
+				if (temp_phy_data == phy_data)
+					break;
+				temp_phy_data = phy_data;
+			} while (1);
+
+			nes_read_1G_phy_reg(nesdev, 0x1e,
+					nesadapter->phy_index[mac_index], &phy_data);
+			nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n",
+					nesadapter->phy_index[mac_index], phy_data);
+
+			nes_read_1G_phy_reg(nesdev, 1,
+					nesadapter->phy_index[mac_index], &phy_data);
+			nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n",
+					nesadapter->phy_index[mac_index], phy_data);
+
+			if (temp_phy_data & 0x1000) {
+				nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n");
+				phy_data = 4;
+			} else {
+				nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n");
+			}
+		}
+		nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n",
+				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0),
+				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200));
+		pcs_control_status = nes_read_indexed(nesdev,
+				NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index&1)*0x200));
+		pcs_control_status = nes_read_indexed(nesdev,
+				NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index&1)*0x200));
+		nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n",
+				mac_index, pcs_control_status);
+		if (nesadapter->OneG_Mode) {
+			u32temp = 0x01010000;
+			if (nesadapter->port_count > 2) {
+				u32temp |= 0x02020000;
+			}
+			if ((pcs_control_status & u32temp)!= u32temp) {
+				phy_data = 0;
+				nes_debug(NES_DBG_PHY, "PCS says the link is down\n");
+			}
+		} else if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) {
+			nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]);
+			temp_phy_data = (u16)nes_read_indexed(nesdev,
+								NES_IDX_MAC_MDIO_CONTROL);
+			u32temp = 20;
+			do {
+				nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]);
+				phy_data = (u16)nes_read_indexed(nesdev,
+								NES_IDX_MAC_MDIO_CONTROL);
+				if ((phy_data == temp_phy_data) || (!(--u32temp)))
+					break;
+				temp_phy_data = phy_data;
+			} while (1);
+			nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
+				__FUNCTION__, phy_data, nesadapter->mac_link_down ? "DOWN" : "UP");
+
+		} else {
+			phy_data = (0x0f0f0000 == (pcs_control_status & 0x0f1f0000)) ? 4 : 0;
+		}
+
+		if (phy_data & 0x0004) {
+			nesadapter->mac_link_down[mac_index] = 0;
+			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+				nes_debug(NES_DBG_PHY, "The Link is UP!!.  linkup was %d\n",
+						nesvnic->linkup);
+				if (nesvnic->linkup == 0) {
+					printk(PFX "The Link is now up for port %u, netdev %p.\n",
+							mac_index, nesvnic->netdev);
+					if (netif_queue_stopped(nesvnic->netdev))
+						netif_start_queue(nesvnic->netdev);
+					nesvnic->linkup = 1;
+					netif_carrier_on(nesvnic->netdev);
+				}
+			}
+		} else {
+			nesadapter->mac_link_down[mac_index] = 1;
+			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+				nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n",
+						nesvnic->linkup);
+				if (nesvnic->linkup == 1) {
+					printk(PFX "The Link is now down for port %u, netdev %p.\n",
+							mac_index, nesvnic->netdev);
+					if (!(netif_queue_stopped(nesvnic->netdev)))
+						netif_stop_queue(nesvnic->netdev);
+					nesvnic->linkup = 0;
+					netif_carrier_off(nesvnic->netdev);
+				}
+			}
+		}
+	}
+
+	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE;
+}
+
+
+
+void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+
+	netif_rx_schedule(nesdev->netdev[nesvnic->netdev_index], &nesvnic->napi);
+}
+
+
+/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before
+* getting out of nic_ce_handler
+*/
+#define	MAX_RQES_TO_PROCESS	384
+
+/**
+ * nes_nic_ce_handler
+ */
+void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+	u64 u64temp;
+	dma_addr_t bus_address;
+	struct nes_hw_nic *nesnic;
+	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct sk_buff *skb;
+	struct sk_buff *rx_skb;
+	__le16 *wqe_fragment_length;
+	u32 head;
+	u32 cq_size;
+	u32 rx_pkt_size;
+	u32 cqe_count=0;
+	u32 cqe_errv;
+	u32 cqe_misc;
+	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
+	u16 vlan_tag;
+	u16 pkt_type;
+	u16 rqes_processed = 0;
+	u8 sq_cqes = 0;
+
+	head = cq->cq_head;
+	cq_size = cq->cq_size;
+	cq->cqes_pending = 1;
+	do {
+		if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) &
+				NES_NIC_CQE_VALID) {
+			nesnic = &nesvnic->nic;
+			cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
+			if (cqe_misc & NES_NIC_CQE_SQ) {
+				sq_cqes++;
+				wqe_fragment_index = 1;
+				nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail];
+				skb = nesnic->tx_skb[nesnic->sq_tail];
+				wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+				/* bump past the vlan tag */
+				wqe_fragment_length++;
+				if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
+					u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]);
+					u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32;
+					bus_address = (dma_addr_t)u64temp;
+					if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) {
+						pci_unmap_single(nesdev->pcidev,
+								bus_address,
+								le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]),
+								PCI_DMA_TODEVICE);
+					}
+					for (; wqe_fragment_index < 5; wqe_fragment_index++) {
+						if (wqe_fragment_length[wqe_fragment_index]) {
+							u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]);
+							u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32;
+							bus_address = (dma_addr_t)u64temp;
+							pci_unmap_page(nesdev->pcidev,
+									bus_address,
+									le16_to_cpu(wqe_fragment_length[wqe_fragment_index]),
+									PCI_DMA_TODEVICE);
+						} else
+							break;
+					}
+					if (skb)
+						dev_kfree_skb_any(skb);
+				}
+				nesnic->sq_tail++;
+				nesnic->sq_tail &= nesnic->sq_size-1;
+				if (sq_cqes > 128) {
+					barrier();
+				/* restart the queue if it had been stopped */
+				if (netif_queue_stopped(nesvnic->netdev))
+					netif_wake_queue(nesvnic->netdev);
+					sq_cqes = 0;
+				}
+			} else {
+				rqes_processed ++;
+
+				cq->rx_cqes_completed++;
+				cq->rx_pkts_indicated++;
+				rx_pkt_size = cqe_misc & 0x0000ffff;
+				nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail];
+				/* Get the skb */
+				rx_skb = nesnic->rx_skb[nesnic->rq_tail];
+				nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail];
+				bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
+				bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
+				pci_unmap_single(nesdev->pcidev, bus_address,
+						nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+				/* rx_skb->tail = rx_skb->data + rx_pkt_size; */
+				/* rx_skb->len = rx_pkt_size; */
+				rx_skb->len = 0;  /* TODO: see if this is necessary */
+				skb_put(rx_skb, rx_pkt_size);
+				rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev);
+				nesnic->rq_tail++;
+				nesnic->rq_tail &= nesnic->rq_size - 1;
+
+				atomic_inc(&nesvnic->rx_skbs_needed);
+				if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) {
+					nes_write32(nesdev->regs+NES_CQE_ALLOC,
+							cq->cq_number | (cqe_count << 16));
+//					nesadapter->tune_timer.cq_count += cqe_count;
+					nesdev->currcq_count += cqe_count;
+					cqe_count = 0;
+					nes_replenish_nic_rq(nesvnic);
+				}
+				pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]));
+				cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT;
+				rx_skb->ip_summed = CHECKSUM_NONE;
+
+				if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) ||
+						(NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) {
+					if ((cqe_errv &
+							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR |
+							NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+						if (nesvnic->rx_checksum_disabled == 0) {
+							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+						}
+					} else
+						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+								" errv = 0x%X, pkt_type = 0x%X.\n",
+								nesvnic->netdev->name, cqe_errv, pkt_type);
+
+				} else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) {
+					if ((cqe_errv &
+							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR |
+							NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+						if (nesvnic->rx_checksum_disabled == 0) {
+							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+							/* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n",
+								  nesvnic->netdev->name); */
+						}
+					} else
+						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+								" errv = 0x%X, pkt_type = 0x%X.\n",
+								nesvnic->netdev->name, cqe_errv, pkt_type);
+					}
+				/* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n",
+							pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */
+
+				if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) {
+					nes_cm_recv(rx_skb, nesvnic->netdev);
+				} else {
+					if ((cqe_misc & NES_NIC_CQE_TAG_VALID) && (nesvnic->vlan_grp != NULL)) {
+						vlan_tag = (u16)(le32_to_cpu(
+								cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])
+								>> 16);
+						nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n",
+								nesvnic->netdev->name, vlan_tag);
+						nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag);
+					} else {
+						nes_netif_rx(rx_skb);
+					}
+				}
+
+				nesvnic->netdev->last_rx = jiffies;
+				/* nesvnic->netstats.rx_packets++; */
+				/* nesvnic->netstats.rx_bytes += rx_pkt_size; */
+			}
+
+			cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
+			/* Accounting... */
+			cqe_count++;
+			if (++head >= cq_size)
+				head = 0;
+			if (cqe_count == 255) {
+				/* Replenish Nic CQ */
+				nes_write32(nesdev->regs+NES_CQE_ALLOC,
+						cq->cq_number | (cqe_count << 16));
+//				nesdev->nesadapter->tune_timer.cq_count += cqe_count;
+				nesdev->currcq_count += cqe_count;
+				cqe_count = 0;
+			}
+
+			if (cq->rx_cqes_completed >= nesvnic->budget)
+				break;
+		} else {
+			cq->cqes_pending = 0;
+			break;
+		}
+
+	} while (1);
+
+	if (sq_cqes) {
+		barrier();
+		/* restart the queue if it had been stopped */
+		if (netif_queue_stopped(nesvnic->netdev))
+			netif_wake_queue(nesvnic->netdev);
+	}
+
+	cq->cq_head = head;
+	/* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n",
+			cq->cq_number, cqe_count, cq->cq_head); */
+	cq->cqe_allocs_pending = cqe_count;
+	if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+	{
+//		nesdev->nesadapter->tune_timer.cq_count += cqe_count;
+		nesdev->currcq_count += cqe_count;
+		nes_nic_tune_timer(nesdev);
+	}
+	if (atomic_read(&nesvnic->rx_skbs_needed))
+		nes_replenish_nic_rq(nesvnic);
+	}
+
+
+/**
+ * nes_cqp_ce_handler
+ */
+void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
+{
+	u64 u64temp;
+	unsigned long flags;
+	struct nes_hw_cqp *cqp = NULL;
+	struct nes_cqp_request *cqp_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 head;
+	u32 cq_size;
+	u32 cqe_count=0;
+	u32 error_code;
+	/* u32 counter; */
+
+	head = cq->cq_head;
+	cq_size = cq->cq_size;
+
+	do {
+		/* process the CQE */
+		/* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head,
+			  le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */
+
+		if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) {
+			u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head].
+					cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
+					((u64)(le32_to_cpu(cq->cq_vbase[head].
+					cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX])));
+			cqp = *((struct nes_hw_cqp **)&u64temp);
+
+			error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]);
+			if (error_code) {
+				nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP,"
+						" Major/Minor codes = 0x%04X:%04X.\n",
+						le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f,
+						(u16)(error_code >> 16),
+						(u16)error_code);
+				nes_debug(NES_DBG_CQP, "cqp: qp_id=%u, sq_head=%u, sq_tail=%u\n",
+						cqp->qp_id, cqp->sq_head, cqp->sq_tail);
+			}
+
+			u64temp = (((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail].
+					wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX])))<<32) |
+					((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail].
+					wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX])));
+			cqp_request = *((struct nes_cqp_request **)&u64temp);
+			if (cqp_request) {
+				if (cqp_request->waiting) {
+					/* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */
+					cqp_request->major_code = (u16)(error_code >> 16);
+					cqp_request->minor_code = (u16)error_code;
+					barrier();
+					cqp_request->request_done = 1;
+					wake_up(&cqp_request->waitq);
+					if (atomic_dec_and_test(&cqp_request->refcount)) {
+						nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+								cqp_request,
+								le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f);
+						if (cqp_request->dynamic) {
+							kfree(cqp_request);
+						} else {
+							spin_lock_irqsave(&nesdev->cqp.lock, flags);
+							list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+							spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+						}
+					}
+				} else if (cqp_request->callback) {
+					/* Envoke the callback routine */
+					cqp_request->cqp_callback(nesdev, cqp_request);
+					if (cqp_request->dynamic) {
+						kfree(cqp_request);
+					} else {
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+				} else {
+					nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+							cqp_request,
+							le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f);
+					if (cqp_request->dynamic) {
+						kfree(cqp_request);
+					} else {
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+				}
+			} else {
+				wake_up(&nesdev->cqp.waitq);
+			}
+
+			cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+			nes_write32(nesdev->regs+NES_CQE_ALLOC, cq->cq_number | (1 << 16));
+			if (++cqp->sq_tail >= cqp->sq_size)
+				cqp->sq_tail = 0;
+
+			/* Accounting... */
+			cqe_count++;
+			if (++head >= cq_size)
+				head = 0;
+		} else {
+			break;
+		}
+	} while (1);
+	cq->cq_head = head;
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+	while ((!list_empty(&nesdev->cqp_pending_reqs)) &&
+			((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) &
+			(nesdev->cqp.sq_size - 1)) != 1)) {
+		cqp_request = list_entry(nesdev->cqp_pending_reqs.next,
+				struct nes_cqp_request, list);
+		list_del_init(&cqp_request->list);
+		head = nesdev->cqp.sq_head++;
+		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+		cqp_wqe = &nesdev->cqp.sq_vbase[head];
+		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+		barrier();
+		cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] =
+			cpu_to_le32((u32)((unsigned long)cqp_request));
+		cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] =
+			cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request)));
+		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n",
+				cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head);
+		/* Ring doorbell (1 WQEs) */
+		barrier();
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+	}
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	/* Arm the CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			cq->cq_number);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+}
+
+
+/**
+ * nes_process_iwarp_aeqe
+ */
+void nes_process_iwarp_aeqe(struct nes_device *nesdev, struct nes_hw_aeqe *aeqe)
+{
+	u64 context;
+	u64 aeqe_context = 0;
+	unsigned long flags;
+	struct nes_qp *nesqp;
+	int resource_allocated;
+	/* struct iw_cm_id *cm_id; */
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct ib_event ibevent;
+	/* struct iw_cm_event cm_event; */
+	u32 aeq_info;
+	u32 next_iwarp_state = 0;
+	u16 async_event_id;
+	u8 tcp_state;
+	u8 iwarp_state;
+
+	nes_debug(NES_DBG_AEQ, "\n");
+	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+	if ((NES_AEQE_INBOUND_RDMA&aeq_info) || (!(NES_AEQE_QP&aeq_info))) {
+		context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
+		context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
+	} else {
+		aeqe_context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
+		aeqe_context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
+		context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
+						aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN];
+		BUG_ON(!context);
+	}
+
+	async_event_id = (u16)aeq_info;
+	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+	nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p,"
+			" Tcp state = %s, iWARP state = %s\n",
+			async_event_id,
+			le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe,
+			nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]);
+
+
+	switch (async_event_id) {
+		case NES_AEQE_AEID_LLP_FIN_RECEIVED:
+			nesqp = *((struct nes_qp **)&context);
+			if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+				nesqp->cm_id->add_ref(nesqp->cm_id);
+				nes_add_ref(&nesqp->ibqp);
+				schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp,
+						NES_TIMER_TYPE_CLOSE, 1, 0);
+				nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d),"
+						" need ae to finish up, original_last_aeq = 0x%04X."
+						" last_aeq = 0x%04X, scheduling timer. TCP state = %d\n",
+						nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+						async_event_id, nesqp->last_aeq, tcp_state);
+			}
+			if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+					(nesqp->ibqp_state != IB_QPS_RTS)) {
+				/* FIN Received but tcp state or IB state moved on,
+						should expect a	close complete */
+				return;
+			}
+		case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
+		case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+		case NES_AEQE_AEID_TERMINATE_SENT:
+		case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE:
+		case NES_AEQE_AEID_RESET_SENT:
+			nesqp = *((struct nes_qp **)&context);
+			if (async_event_id == NES_AEQE_AEID_RESET_SENT) {
+				tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+			}
+			nes_add_ref(&nesqp->ibqp);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+
+			if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
+					(tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) {
+				nesqp->hte_added = 0;
+				spin_unlock_irqrestore(&nesqp->lock, flags);
+				nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u to remove hte\n",
+						nesqp->hwqp.qp_id);
+				nes_hw_modify_qp(nesdev, nesqp,
+						NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE, 0);
+				spin_lock_irqsave(&nesqp->lock, flags);
+			}
+
+			if ((nesqp->ibqp_state == IB_QPS_RTS) &&
+					((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+					(async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+				switch (nesqp->hw_iwarp_state) {
+					case NES_AEQE_IWARP_STATE_RTS:
+						next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+						nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+						break;
+					case NES_AEQE_IWARP_STATE_TERMINATE:
+						next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+						nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
+						if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
+							next_iwarp_state |= 0x02000000;
+							nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+						}
+						break;
+					default:
+						next_iwarp_state = 0;
+				}
+				spin_unlock_irqrestore(&nesqp->lock, flags);
+				if (next_iwarp_state) {
+					nes_add_ref(&nesqp->ibqp);
+					nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X,"
+							" also added another reference\n",
+							nesqp->hwqp.qp_id, next_iwarp_state);
+					nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0);
+				}
+				nes_cm_disconn(nesqp);
+			} else {
+				if (async_event_id ==  NES_AEQE_AEID_LLP_FIN_RECEIVED) {
+					/* FIN Received but ib state not RTS,
+							close complete will be on its way */
+					spin_unlock_irqrestore(&nesqp->lock, flags);
+					nes_rem_ref(&nesqp->ibqp);
+					return;
+				}
+				spin_unlock_irqrestore(&nesqp->lock, flags);
+				if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
+					next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000;
+					nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+					nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X,"
+							" also added another reference\n",
+							nesqp->hwqp.qp_id, next_iwarp_state);
+					nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0);
+				}
+				nes_cm_disconn(nesqp);
+			}
+			break;
+		case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
+			nesqp = *((struct nes_qp **)&context);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TERMINATE_RECEIVED"
+					" event on QP%u \n  Q2 Data:\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_FATAL;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+					((nesqp->ibqp_state == IB_QPS_RTS)&&
+					(async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+				nes_add_ref(&nesqp->ibqp);
+				nes_cm_disconn(nesqp);
+			} else {
+				nesqp->in_disconnect = 0;
+				wake_up(&nesqp->kick_waitq);
+			}
+			break;
+		case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES:
+			nesqp = *((struct nes_qp **)&context);
+			nes_add_ref(&nesqp->ibqp);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
+			nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+			nesqp->last_aeq = async_event_id;
+			if (nesqp->cm_id) {
+				nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES"
+						" event on QP%u, remote IP = 0x%08X \n",
+						nesqp->hwqp.qp_id,
+						ntohl(nesqp->cm_id->remote_addr.sin_addr.s_addr));
+			} else {
+				nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES"
+						" event on QP%u \n",
+						nesqp->hwqp.qp_id);
+			}
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_RESET;
+			nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_FATAL;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			break;
+		case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
+			if (NES_AEQE_INBOUND_RDMA&aeq_info) {
+				nesqp = nesadapter->qp_table[le32_to_cpu(
+						aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN];
+			} else {
+				/* TODO: get the actual WQE and mask off wqe index */
+				context &= ~((u64)511);
+				nesqp = *((struct nes_qp **)&context);
+			}
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_BAD_STAG_INDEX event on QP%u\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_ACCESS_ERR;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			break;
+		case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
+			nesqp = *((struct nes_qp **)&context);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_UNALLOCATED_STAG event on QP%u\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_ACCESS_ERR;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			break;
+		case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
+			nesqp = nesadapter->qp_table[le32_to_cpu(aeqe->aeqe_words
+					[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN];
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_PRIV_OPERATION_DENIED event on QP%u,"
+					" nesqp = %p, AE reported %p\n",
+					nesqp->hwqp.qp_id, nesqp, *((struct nes_qp **)&context));
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_ACCESS_ERR;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			break;
+		case NES_AEQE_AEID_CQ_OPERATION_ERROR:
+			context <<= 1;
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n",
+					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context);
+			resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs,
+					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+			if (resource_allocated) {
+				printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n",
+						__FUNCTION__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+			}
+			break;
+		case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+			nesqp = nesadapter->qp_table[le32_to_cpu(
+					aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN];
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG"
+					"_FOR_AVAILABLE_BUFFER event on QP%u\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_ACCESS_ERR;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			/* tell cm to disconnect, cm will queue work to thread */
+			nes_add_ref(&nesqp->ibqp);
+			nes_cm_disconn(nesqp);
+			break;
+		case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
+			nesqp = *((struct nes_qp **)&context);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_INVALID_MSN"
+					"_NO_BUFFER_AVAILABLE event on QP%u\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_FATAL;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			/* tell cm to disconnect, cm will queue work to thread */
+			nes_add_ref(&nesqp->ibqp);
+			nes_cm_disconn(nesqp);
+			break;
+		case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
+			nesqp = *((struct nes_qp **)&context);
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR"
+					" event on QP%u \n  Q2 Data:\n",
+					nesqp->hwqp.qp_id);
+			if (nesqp->ibqp.event_handler) {
+				ibevent.device = nesqp->ibqp.device;
+				ibevent.element.qp = &nesqp->ibqp;
+				ibevent.event = IB_EVENT_QP_FATAL;
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+			}
+			/* tell cm to disconnect, cm will queue work to thread */
+			nes_add_ref(&nesqp->ibqp);
+			nes_cm_disconn(nesqp);
+			break;
+			/* TODO: additional AEs need to be here */
+		default:
+			nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n",
+					async_event_id);
+			break;
+	}
+
+}
+
+
+/**
+ * nes_iwarp_ce_handler
+ */
+void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq)
+{
+	struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq);
+
+	/* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n",
+			nescq->hw_cq.cq_number); */
+	nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number);
+
+	if (nescq->ibcq.comp_handler)
+		nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context);
+
+	return;
+}
+
+
+/**
+ * nes_manage_apbvt()
+ */
+int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
+		u32 nic_index, u32 add_port)
+{
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	unsigned long flags;
+	struct nes_cqp_request *cqp_request;
+	int ret = 0;
+	u16 major_code;
+
+	/* Send manage APBVT request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n",
+			(add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL",
+			accel_local_port, accel_local_port, nic_index);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT |
+			((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port));
+
+	nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	if (add_port == NES_MANAGE_APBVT_ADD)
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_QP, "Completed, ret=%u,  CQP Major:Minor codes = 0x%04X:0x%04X\n",
+			ret, cqp_request->major_code, cqp_request->minor_code);
+	major_code = cqp_request->major_code;
+	if (atomic_dec_and_test(&cqp_request->refcount)) {
+		if (cqp_request->dynamic) {
+			kfree(cqp_request);
+		} else {
+			spin_lock_irqsave(&nesdev->cqp.lock, flags);
+			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+		}
+	}
+	if (!ret)
+		return -ETIME;
+	else if (major_code)
+		return -EIO;
+	else
+		return 0;
+}
+
+
+/**
+ * nes_manage_arp_cache
+ */
+void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
+		u32 ip_addr, u32 action)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev;
+	struct nes_cqp_request *cqp_request;
+	int arp_index;
+
+	nesdev = nesvnic->nesdev;
+	arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action);
+	if (arp_index == -1) {
+		return;
+	}
+
+	/* update the ARP entry */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n");
+		return;
+	}
+	cqp_request->waiting = 0;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+			NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM);
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(
+			(u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index);
+
+	if (action == NES_ARP_ADD) {
+		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID);
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32(
+				(((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) |
+				(((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]);
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32(
+				(((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]);
+	} else {
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0;
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0;
+	}
+
+	nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n",
+			nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+
+	atomic_set(&cqp_request->refcount, 1);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+}
+
+
+/**
+ * flush_wqes
+ */
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
+		u32 which_wq, u32 wait_completion)
+{
+	unsigned long flags;
+	struct nes_cqp_request *cqp_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	int ret;
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+		return;
+	}
+	if (wait_completion) {
+		cqp_request->waiting = 1;
+		atomic_set(&cqp_request->refcount, 2);
+	} else {
+		cqp_request->waiting = 0;
+	}
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+			cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
+
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	if (wait_completion) {
+		/* Wait for CQP */
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+		nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
+				" CQP Major:Minor codes = 0x%04X:0x%04X\n",
+				ret, cqp_request->major_code, cqp_request->minor_code);
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
new file mode 100644
index 0000000..1e10df5
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -0,0 +1,1206 @@
+/*
+* Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenIB.org BSD license below:
+*
+*     Redistribution and use in source and binary forms, with or
+*     without modification, are permitted provided that the following
+*     conditions are met:
+*
+*      - Redistributions of source code must retain the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer.
+*
+*      - Redistributions in binary form must reproduce the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer in the documentation and/or other materials
+*        provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#ifndef __NES_HW_H
+#define __NES_HW_H
+
+#define NES_PHY_TYPE_1G   2
+#define NES_PHY_TYPE_IRIS 3
+#define NES_PHY_TYPE_PUMA_10G  6
+
+#define NES_MULTICAST_PF_MAX 8
+
+enum pci_regs {
+	NES_INT_STAT = 0x0000,
+	NES_INT_MASK = 0x0004,
+	NES_INT_PENDING = 0x0008,
+	NES_INTF_INT_STAT = 0x000C,
+	NES_INTF_INT_MASK = 0x0010,
+	NES_TIMER_STAT = 0x0014,
+	NES_PERIODIC_CONTROL = 0x0018,
+	NES_ONE_SHOT_CONTROL = 0x001C,
+	NES_EEPROM_COMMAND = 0x0020,
+	NES_EEPROM_DATA = 0x0024,
+	NES_FLASH_COMMAND = 0x0028,
+	NES_FLASH_DATA  = 0x002C,
+	NES_SOFTWARE_RESET = 0x0030,
+	NES_CQ_ACK = 0x0034,
+	NES_WQE_ALLOC = 0x0040,
+	NES_CQE_ALLOC = 0x0044,
+};
+
+enum indexed_regs {
+	NES_IDX_CREATE_CQP_LOW = 0x0000,
+	NES_IDX_CREATE_CQP_HIGH = 0x0004,
+	NES_IDX_QP_CONTROL = 0x0040,
+	NES_IDX_FLM_CONTROL = 0x0080,
+	NES_IDX_INT_CPU_STATUS = 0x00a0,
+	NES_IDX_GPIO_CONTROL = 0x00f0,
+	NES_IDX_GPIO_DATA = 0x00f4,
+	NES_IDX_TCP_CONFIG0 = 0x01e4,
+	NES_IDX_TCP_TIMER_CONFIG = 0x01ec,
+	NES_IDX_TCP_NOW = 0x01f0,
+	NES_IDX_QP_MAX_CFG_SIZES = 0x0200,
+	NES_IDX_QP_CTX_SIZE = 0x0218,
+	NES_IDX_TCP_TIMER_SIZE0 = 0x0238,
+	NES_IDX_TCP_TIMER_SIZE1 = 0x0240,
+	NES_IDX_ARP_CACHE_SIZE = 0x0258,
+	NES_IDX_CQ_CTX_SIZE = 0x0260,
+	NES_IDX_MRT_SIZE = 0x0278,
+	NES_IDX_PBL_REGION_SIZE = 0x0280,
+	NES_IDX_IRRQ_COUNT = 0x02b0,
+	NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0,
+	NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300,
+	NES_IDX_DST_IP_ADDR = 0x0400,
+	NES_IDX_PCIX_DIAG = 0x08e8,
+	NES_IDX_MPP_DEBUG = 0x0a00,
+	NES_IDX_PORT_RX_DISCARDS = 0x0a30,
+	NES_IDX_PORT_TX_DISCARDS = 0x0a34,
+	NES_IDX_MPP_LB_DEBUG = 0x0b00,
+	NES_IDX_DENALI_CTL_22 = 0x1058,
+	NES_IDX_MAC_TX_CONTROL = 0x2000,
+	NES_IDX_MAC_TX_CONFIG = 0x2004,
+	NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008,
+	NES_IDX_MAC_RX_CONTROL = 0x200c,
+	NES_IDX_MAC_RX_CONFIG = 0x2010,
+	NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c,
+	NES_IDX_MAC_MDIO_CONTROL = 0x2084,
+	NES_IDX_MAC_TX_OCTETS_LOW = 0x2100,
+	NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104,
+	NES_IDX_MAC_TX_FRAMES_LOW = 0x2108,
+	NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c,
+	NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118,
+	NES_IDX_MAC_TX_ERRORS = 0x2138,
+	NES_IDX_MAC_RX_OCTETS_LOW = 0x213c,
+	NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140,
+	NES_IDX_MAC_RX_FRAMES_LOW = 0x2144,
+	NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148,
+	NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c,
+	NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150,
+	NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154,
+	NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174,
+	NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178,
+	NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c,
+	NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180,
+	NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184,
+	NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188,
+	NES_IDX_MAC_INT_STATUS = 0x21f0,
+	NES_IDX_MAC_INT_MASK = 0x21f4,
+	NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800,
+	NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00,
+	NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808,
+	NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08,
+	NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c,
+	NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c,
+	NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810,
+	NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10,
+	NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814,
+	NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14,
+	NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818,
+	NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18,
+	NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c,
+	NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c,
+	NES_IDX_ETH_SERDES_BYPASS0 = 0x2820,
+	NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20,
+	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824,
+	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24,
+	NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828,
+	NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28,
+	NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c,
+	NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c,
+	NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830,
+	NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30,
+	NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834,
+	NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34,
+	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838,
+	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38,
+	NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080,
+	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000,
+	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004,
+	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008,
+	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c,
+	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000,
+	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004,
+	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008,
+	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c,
+	NES_IDX_CM_CONFIG = 0x5100,
+	NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000,
+	NES_IDX_NIC_PHYPORT_TO_USW = 0x6008,
+	NES_IDX_NIC_ACTIVE = 0x6010,
+	NES_IDX_NIC_UNICAST_ALL = 0x6018,
+	NES_IDX_NIC_MULTICAST_ALL = 0x6020,
+	NES_IDX_NIC_MULTICAST_ENABLE = 0x6028,
+	NES_IDX_NIC_BROADCAST_ON = 0x6030,
+	NES_IDX_USED_CHUNKS_TX = 0x60b0,
+	NES_IDX_TX_POOL_SIZE = 0x60b8,
+	NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148,
+	NES_IDX_PERFECT_FILTER_LOW = 0x6200,
+	NES_IDX_PERFECT_FILTER_HIGH = 0x6204,
+	NES_IDX_IPV4_TCP_REXMITS = 0x7080,
+	NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c,
+	NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140,
+	NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144,
+	NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148,
+	NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c,
+	NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150,
+	NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154,
+};
+
+#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE   1
+#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17)
+
+enum nes_cqp_opcodes {
+	NES_CQP_CREATE_QP = 0x00,
+	NES_CQP_MODIFY_QP = 0x01,
+	NES_CQP_DESTROY_QP = 0x02,
+	NES_CQP_CREATE_CQ = 0x03,
+	NES_CQP_MODIFY_CQ = 0x04,
+	NES_CQP_DESTROY_CQ = 0x05,
+	NES_CQP_ALLOCATE_STAG = 0x09,
+	NES_CQP_REGISTER_STAG = 0x0a,
+	NES_CQP_QUERY_STAG = 0x0b,
+	NES_CQP_REGISTER_SHARED_STAG = 0x0c,
+	NES_CQP_DEALLOCATE_STAG = 0x0d,
+	NES_CQP_MANAGE_ARP_CACHE = 0x0f,
+	NES_CQP_SUSPEND_QPS = 0x11,
+	NES_CQP_UPLOAD_CONTEXT = 0x13,
+	NES_CQP_CREATE_CEQ = 0x16,
+	NES_CQP_DESTROY_CEQ = 0x18,
+	NES_CQP_CREATE_AEQ = 0x19,
+	NES_CQP_DESTROY_AEQ = 0x1b,
+	NES_CQP_LMI_ACCESS = 0x20,
+	NES_CQP_FLUSH_WQES = 0x22,
+	NES_CQP_MANAGE_APBVT = 0x23
+};
+
+enum nes_cqp_wqe_word_idx {
+	NES_CQP_WQE_OPCODE_IDX = 0,
+	NES_CQP_WQE_ID_IDX = 1,
+	NES_CQP_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+};
+
+enum nes_cqp_cq_wqeword_idx {
+	NES_CQP_CQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7,
+	NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8,
+	NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9,
+	NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10,
+};
+
+enum nes_cqp_stag_wqeword_idx {
+	NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1,
+	NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6,
+	NES_CQP_STAG_WQE_LEN_LOW_IDX = 7,
+	NES_CQP_STAG_WQE_STAG_IDX = 8,
+	NES_CQP_STAG_WQE_VA_LOW_IDX = 10,
+	NES_CQP_STAG_WQE_VA_HIGH_IDX = 11,
+	NES_CQP_STAG_WQE_PA_LOW_IDX = 12,
+	NES_CQP_STAG_WQE_PA_HIGH_IDX = 13,
+	NES_CQP_STAG_WQE_PBL_LEN_IDX = 14
+};
+
+#define NES_CQP_OP_IWARP_STATE_SHIFT 28
+
+enum nes_cqp_qp_bits {
+	NES_CQP_QP_ARP_VALID = (1<<8),
+	NES_CQP_QP_WINBUF_VALID = (1<<9),
+	NES_CQP_QP_CONTEXT_VALID = (1<<10),
+	NES_CQP_QP_ORD_VALID = (1<<11),
+	NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12),
+	NES_CQP_QP_VIRT_WQS = (1<<13),
+	NES_CQP_QP_DEL_HTE = (1<<14),
+	NES_CQP_QP_CQS_VALID = (1<<15),
+	NES_CQP_QP_TYPE_TSA = 0,
+	NES_CQP_QP_TYPE_IWARP = (1<<16),
+	NES_CQP_QP_TYPE_CQP = (4<<16),
+	NES_CQP_QP_TYPE_NIC = (5<<16),
+	NES_CQP_QP_MSS_CHG = (1<<20),
+	NES_CQP_QP_STATIC_RESOURCES = (1<<21),
+	NES_CQP_QP_IGNORE_MW_BOUND = (1<<22),
+	NES_CQP_QP_VWQ_USE_LMI = (1<<23),
+	NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_RESET = (1<<31),
+};
+
+enum nes_cqp_qp_wqe_word_idx {
+	NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6,
+	NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7,
+	NES_CQP_QP_WQE_NEW_MSS_IDX = 15,
+};
+
+enum nes_nic_ctx_bits {
+	NES_NIC_CTX_RQ_SIZE_32 = (3<<8),
+	NES_NIC_CTX_RQ_SIZE_512 = (3<<8),
+	NES_NIC_CTX_SQ_SIZE_32 = (1<<10),
+	NES_NIC_CTX_SQ_SIZE_512 = (3<<10),
+};
+
+enum nes_nic_qp_ctx_word_idx {
+	NES_NIC_CTX_MISC_IDX = 0,
+	NES_NIC_CTX_SQ_LOW_IDX = 2,
+	NES_NIC_CTX_SQ_HIGH_IDX = 3,
+	NES_NIC_CTX_RQ_LOW_IDX = 4,
+	NES_NIC_CTX_RQ_HIGH_IDX = 5,
+};
+
+enum nes_cqp_cq_bits {
+	NES_CQP_CQ_CEQE_MASK = (1<<9),
+	NES_CQP_CQ_CEQ_VALID = (1<<10),
+	NES_CQP_CQ_RESIZE = (1<<11),
+	NES_CQP_CQ_CHK_OVERFLOW = (1<<12),
+	NES_CQP_CQ_4KB_CHUNK = (1<<14),
+	NES_CQP_CQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_stag_bits {
+	NES_CQP_STAG_VA_TO = (1<<9),
+	NES_CQP_STAG_DEALLOC_PBLS = (1<<10),
+	NES_CQP_STAG_PBL_BLK_SIZE = (1<<11),
+	NES_CQP_STAG_MR = (1<<13),
+	NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16),
+	NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17),
+	NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18),
+	NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19),
+	NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20),
+	NES_CQP_STAG_REM_ACC_EN = (1<<21),
+	NES_CQP_STAG_LEAVE_PENDING = (1<<31),
+};
+
+enum nes_cqp_ceq_wqeword_idx {
+	NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1,
+	NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_ceq_bits {
+	NES_CQP_CEQ_4KB_CHUNK = (1<<14),
+	NES_CQP_CEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_aeq_wqeword_idx {
+	NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1,
+	NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_aeq_bits {
+	NES_CQP_AEQ_4KB_CHUNK = (1<<14),
+	NES_CQP_AEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_lmi_wqeword_idx {
+	NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1,
+	NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8,
+	NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9,
+	NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10,
+};
+
+enum nes_cqp_arp_wqeword_idx {
+	NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6,
+	NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7,
+	NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1,
+};
+
+enum nes_cqp_upload_wqeword_idx {
+	NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6,
+	NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7,
+	NES_CQP_UPLOAD_WQE_HTE_IDX = 8,
+};
+
+enum nes_cqp_arp_bits {
+	NES_CQP_ARP_VALID = (1<<8),
+	NES_CQP_ARP_PERM = (1<<9),
+};
+
+enum nes_cqp_flush_bits {
+	NES_CQP_FLUSH_SQ = (1<<30),
+	NES_CQP_FLUSH_RQ = (1<<31),
+};
+
+enum nes_cqe_opcode_bits {
+	NES_CQE_STAG_VALID = (1<<6),
+	NES_CQE_ERROR = (1<<7),
+	NES_CQE_SQ = (1<<8),
+	NES_CQE_SE = (1<<9),
+	NES_CQE_PSH = (1<<29),
+	NES_CQE_FIN = (1<<30),
+	NES_CQE_VALID = (1<<31),
+};
+
+
+enum nes_cqe_word_idx {
+	NES_CQE_PAYLOAD_LENGTH_IDX = 0,
+	NES_CQE_COMP_COMP_CTX_LOW_IDX = 2,
+	NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3,
+	NES_CQE_INV_STAG_IDX = 4,
+	NES_CQE_QP_ID_IDX = 5,
+	NES_CQE_ERROR_CODE_IDX = 6,
+	NES_CQE_OPCODE_IDX = 7,
+};
+
+enum nes_ceqe_word_idx {
+	NES_CEQE_CQ_CTX_LOW_IDX = 0,
+	NES_CEQE_CQ_CTX_HIGH_IDX = 1,
+};
+
+enum nes_ceqe_status_bit {
+	NES_CEQE_VALID = (1<<31),
+};
+
+enum nes_int_bits {
+	NES_INT_CEQ0 = (1<<0),
+	NES_INT_CEQ1 = (1<<1),
+	NES_INT_CEQ2 = (1<<2),
+	NES_INT_CEQ3 = (1<<3),
+	NES_INT_CEQ4 = (1<<4),
+	NES_INT_CEQ5 = (1<<5),
+	NES_INT_CEQ6 = (1<<6),
+	NES_INT_CEQ7 = (1<<7),
+	NES_INT_CEQ8 = (1<<8),
+	NES_INT_CEQ9 = (1<<9),
+	NES_INT_CEQ10 = (1<<10),
+	NES_INT_CEQ11 = (1<<11),
+	NES_INT_CEQ12 = (1<<12),
+	NES_INT_CEQ13 = (1<<13),
+	NES_INT_CEQ14 = (1<<14),
+	NES_INT_CEQ15 = (1<<15),
+	NES_INT_AEQ0 = (1<<16),
+	NES_INT_AEQ1 = (1<<17),
+	NES_INT_AEQ2 = (1<<18),
+	NES_INT_AEQ3 = (1<<19),
+	NES_INT_AEQ4 = (1<<20),
+	NES_INT_AEQ5 = (1<<21),
+	NES_INT_AEQ6 = (1<<22),
+	NES_INT_AEQ7 = (1<<23),
+	NES_INT_MAC0 = (1<<24),
+	NES_INT_MAC1 = (1<<25),
+	NES_INT_MAC2 = (1<<26),
+	NES_INT_MAC3 = (1<<27),
+	NES_INT_TSW = (1<<28),
+	NES_INT_TIMER = (1<<29),
+	NES_INT_INTF = (1<<30),
+};
+
+enum nes_intf_int_bits {
+	NES_INTF_INT_PCIERR = (1<<0),
+	NES_INTF_PERIODIC_TIMER = (1<<2),
+	NES_INTF_ONE_SHOT_TIMER = (1<<3),
+	NES_INTF_INT_CRITERR = (1<<14),
+	NES_INTF_INT_AEQ0_OFLOW = (1<<16),
+	NES_INTF_INT_AEQ1_OFLOW = (1<<17),
+	NES_INTF_INT_AEQ2_OFLOW = (1<<18),
+	NES_INTF_INT_AEQ3_OFLOW = (1<<19),
+	NES_INTF_INT_AEQ4_OFLOW = (1<<20),
+	NES_INTF_INT_AEQ5_OFLOW = (1<<21),
+	NES_INTF_INT_AEQ6_OFLOW = (1<<22),
+	NES_INTF_INT_AEQ7_OFLOW = (1<<23),
+	NES_INTF_INT_AEQ_OFLOW = (0xff<<16),
+};
+
+enum nes_mac_int_bits {
+	NES_MAC_INT_LINK_STAT_CHG = (1<<1),
+	NES_MAC_INT_XGMII_EXT = (1<<2),
+	NES_MAC_INT_TX_UNDERFLOW = (1<<6),
+	NES_MAC_INT_TX_ERROR = (1<<7),
+};
+
+enum nes_cqe_allocate_bits {
+	NES_CQE_ALLOC_INC_SELECT = (1<<28),
+	NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29),
+	NES_CQE_ALLOC_NOTIFY_SE = (1<<30),
+	NES_CQE_ALLOC_RESET = (1<<31),
+};
+
+enum nes_nic_rq_wqe_word_idx {
+	NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0,
+	NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1,
+	NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2,
+	NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3,
+	NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4,
+	NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5,
+	NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6,
+	NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7,
+	NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8,
+	NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9,
+};
+
+enum nes_nic_sq_wqe_word_idx {
+	NES_NIC_SQ_WQE_MISC_IDX = 0,
+	NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1,
+	NES_NIC_SQ_WQE_LSO_INFO_IDX = 2,
+	NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3,
+	NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4,
+	NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5,
+	NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6,
+	NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7,
+	NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8,
+	NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9,
+	NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10,
+	NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11,
+	NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12,
+	NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13,
+	NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14,
+	NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15,
+};
+
+enum nes_iwarp_sq_wqe_word_idx {
+	NES_IWARP_SQ_WQE_MISC_IDX = 0,
+	NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+	NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+	NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7,
+	NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8,
+	NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9,
+	NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10,
+	NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11,
+	NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12,
+	NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16,
+	NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17,
+	NES_IWARP_SQ_WQE_LENGTH0_IDX = 18,
+	NES_IWARP_SQ_WQE_STAG0_IDX = 19,
+	NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20,
+	NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21,
+	NES_IWARP_SQ_WQE_LENGTH1_IDX = 22,
+	NES_IWARP_SQ_WQE_STAG1_IDX = 23,
+	NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24,
+	NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25,
+	NES_IWARP_SQ_WQE_LENGTH2_IDX = 26,
+	NES_IWARP_SQ_WQE_STAG2_IDX = 27,
+	NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28,
+	NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29,
+	NES_IWARP_SQ_WQE_LENGTH3_IDX = 30,
+	NES_IWARP_SQ_WQE_STAG3_IDX = 31,
+};
+
+enum nes_iwarp_sq_bind_wqe_word_idx {
+	NES_IWARP_SQ_BIND_WQE_MR_IDX = 6,
+	NES_IWARP_SQ_BIND_WQE_MW_IDX = 7,
+	NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8,
+	NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9,
+	NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10,
+	NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11,
+};
+
+enum nes_iwarp_sq_fmr_wqe_word_idx {
+	NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7,
+	NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8,
+	NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9,
+	NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10,
+	NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11,
+	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12,
+	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13,
+	NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14,
+};
+
+enum nes_iwarp_sq_locinv_wqe_word_idx {
+	NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6,
+};
+
+
+enum nes_iwarp_rq_wqe_word_idx {
+	NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+	NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+	NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8,
+	NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9,
+	NES_IWARP_RQ_WQE_LENGTH0_IDX = 10,
+	NES_IWARP_RQ_WQE_STAG0_IDX = 11,
+	NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12,
+	NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13,
+	NES_IWARP_RQ_WQE_LENGTH1_IDX = 14,
+	NES_IWARP_RQ_WQE_STAG1_IDX = 15,
+	NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16,
+	NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17,
+	NES_IWARP_RQ_WQE_LENGTH2_IDX = 18,
+	NES_IWARP_RQ_WQE_STAG2_IDX = 19,
+	NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20,
+	NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21,
+	NES_IWARP_RQ_WQE_LENGTH3_IDX = 22,
+	NES_IWARP_RQ_WQE_STAG3_IDX = 23,
+};
+
+enum nes_nic_sq_wqe_bits {
+	NES_NIC_SQ_WQE_PHDR_CS_READY =  (1<<21),
+	NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22),
+	NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23),
+	NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30),
+	NES_NIC_SQ_WQE_COMPLETION = (1<<31),
+};
+
+enum nes_nic_cqe_word_idx {
+	NES_NIC_CQE_ACCQP_ID_IDX = 0,
+	NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2,
+	NES_NIC_CQE_MISC_IDX = 3,
+};
+
+#define NES_PKT_TYPE_APBVT_BITS 0xC112
+#define NES_PKT_TYPE_APBVT_MASK 0xff3e
+
+#define NES_PKT_TYPE_PVALID_BITS 0x10000000
+#define NES_PKT_TYPE_PVALID_MASK 0x30000000
+
+#define NES_PKT_TYPE_TCPV4_BITS 0x0110
+#define NES_PKT_TYPE_TCPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_UDPV4_BITS 0x0210
+#define NES_PKT_TYPE_UDPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_IPV4_BITS  0x0010
+#define NES_PKT_TYPE_IPV4_MASK  0x3f30
+
+#define NES_PKT_TYPE_OTHER_BITS 0x0000
+#define NES_PKT_TYPE_OTHER_MASK 0x0030
+
+#define NES_NIC_CQE_ERRV_SHIFT 16
+enum nes_nic_ev_bits {
+	NES_NIC_ERRV_BITS_MODE = (1<<0),
+	NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1),
+	NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2),
+	NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3),
+	NES_NIC_ERRV_BITS_IPH_ERR = (1<<4),
+};
+
+enum nes_nic_cqe_bits {
+	NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT),
+	NES_NIC_CQE_SQ = (1<<24),
+	NES_NIC_CQE_ACCQP_PORT = (1<<28),
+	NES_NIC_CQE_ACCQP_VALID = (1<<29),
+	NES_NIC_CQE_TAG_VALID = (1<<30),
+	NES_NIC_CQE_VALID = (1<<31),
+};
+
+enum nes_aeqe_word_idx {
+	NES_AEQE_COMP_CTXT_LOW_IDX = 0,
+	NES_AEQE_COMP_CTXT_HIGH_IDX = 1,
+	NES_AEQE_COMP_QP_CQ_ID_IDX = 2,
+	NES_AEQE_MISC_IDX = 3,
+};
+
+enum nes_aeqe_bits {
+	NES_AEQE_QP = (1<<16),
+	NES_AEQE_CQ = (1<<17),
+	NES_AEQE_SQ = (1<<18),
+	NES_AEQE_INBOUND_RDMA = (1<<19),
+	NES_AEQE_IWARP_STATE_MASK = (7<<20),
+	NES_AEQE_TCP_STATE_MASK = (0xf<<24),
+	NES_AEQE_VALID = (1<<31),
+};
+
+#define NES_AEQE_IWARP_STATE_SHIFT	20
+#define NES_AEQE_TCP_STATE_SHIFT	24
+
+enum nes_aeqe_iwarp_state {
+	NES_AEQE_IWARP_STATE_NON_EXISTANT = 0,
+	NES_AEQE_IWARP_STATE_IDLE = 1,
+	NES_AEQE_IWARP_STATE_RTS = 2,
+	NES_AEQE_IWARP_STATE_CLOSING = 3,
+	NES_AEQE_IWARP_STATE_TERMINATE = 5,
+	NES_AEQE_IWARP_STATE_ERROR = 6
+};
+
+enum nes_aeqe_tcp_state {
+	NES_AEQE_TCP_STATE_NON_EXISTANT = 0,
+	NES_AEQE_TCP_STATE_CLOSED = 1,
+	NES_AEQE_TCP_STATE_LISTEN = 2,
+	NES_AEQE_TCP_STATE_SYN_SENT = 3,
+	NES_AEQE_TCP_STATE_SYN_RCVD = 4,
+	NES_AEQE_TCP_STATE_ESTABLISHED = 5,
+	NES_AEQE_TCP_STATE_CLOSE_WAIT = 6,
+	NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7,
+	NES_AEQE_TCP_STATE_CLOSING = 8,
+	NES_AEQE_TCP_STATE_LAST_ACK = 9,
+	NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10,
+	NES_AEQE_TCP_STATE_TIME_WAIT = 11
+};
+
+enum nes_aeqe_aeid {
+	NES_AEQE_AEID_AMP_UNALLOCATED_STAG                            = 0x0102,
+	NES_AEQE_AEID_AMP_INVALID_STAG                                = 0x0103,
+	NES_AEQE_AEID_AMP_BAD_QP                                      = 0x0104,
+	NES_AEQE_AEID_AMP_BAD_PD                                      = 0x0105,
+	NES_AEQE_AEID_AMP_BAD_STAG_KEY                                = 0x0106,
+	NES_AEQE_AEID_AMP_BAD_STAG_INDEX                              = 0x0107,
+	NES_AEQE_AEID_AMP_BOUNDS_VIOLATION                            = 0x0108,
+	NES_AEQE_AEID_AMP_RIGHTS_VIOLATION                            = 0x0109,
+	NES_AEQE_AEID_AMP_TO_WRAP                                     = 0x010a,
+	NES_AEQE_AEID_AMP_FASTREG_SHARED                              = 0x010b,
+	NES_AEQE_AEID_AMP_FASTREG_VALID_STAG                          = 0x010c,
+	NES_AEQE_AEID_AMP_FASTREG_MW_STAG                             = 0x010d,
+	NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS                      = 0x010e,
+	NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW                  = 0x010f,
+	NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH                      = 0x0110,
+	NES_AEQE_AEID_AMP_INVALIDATE_SHARED                           = 0x0111,
+	NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS          = 0x0112,
+	NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS            = 0x0113,
+	NES_AEQE_AEID_AMP_MWBIND_VALID_STAG                           = 0x0114,
+	NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG                           = 0x0115,
+	NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG                   = 0x0116,
+	NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG                           = 0x0117,
+	NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS                       = 0x0118,
+	NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS                       = 0x0119,
+	NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT                    = 0x011a,
+	NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED                        = 0x011b,
+	NES_AEQE_AEID_BAD_CLOSE                                       = 0x0201,
+	NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE                         = 0x0202,
+	NES_AEQE_AEID_CQ_OPERATION_ERROR                              = 0x0203,
+	NES_AEQE_AEID_PRIV_OPERATION_DENIED                           = 0x0204,
+	NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO                        = 0x0205,
+	NES_AEQE_AEID_STAG_ZERO_INVALID                               = 0x0206,
+	NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN                      = 0x0301,
+	NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID              = 0x0302,
+	NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303,
+	NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION                     = 0x0304,
+	NES_AEQE_AEID_DDP_UBE_INVALID_MO                              = 0x0305,
+	NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE         = 0x0306,
+	NES_AEQE_AEID_DDP_UBE_INVALID_QN                              = 0x0307,
+	NES_AEQE_AEID_DDP_NO_L_BIT                                    = 0x0308,
+	NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION                 = 0x0311,
+	NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE                     = 0x0312,
+	NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST                   = 0x0313,
+	NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP             = 0x0314,
+	NES_AEQE_AEID_INVALID_ARP_ENTRY                               = 0x0401,
+	NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD                         = 0x0402,
+	NES_AEQE_AEID_STALE_ARP_ENTRY                                 = 0x0403,
+	NES_AEQE_AEID_LLP_CLOSE_COMPLETE                              = 0x0501,
+	NES_AEQE_AEID_LLP_CONNECTION_RESET                            = 0x0502,
+	NES_AEQE_AEID_LLP_FIN_RECEIVED                                = 0x0503,
+	NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH =  0x0504,
+	NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR                      = 0x0505,
+	NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE                           = 0x0506,
+	NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL                           = 0x0507,
+	NES_AEQE_AEID_LLP_SYN_RECEIVED                                = 0x0508,
+	NES_AEQE_AEID_LLP_TERMINATE_RECEIVED                          = 0x0509,
+	NES_AEQE_AEID_LLP_TOO_MANY_RETRIES                            = 0x050a,
+	NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES                  = 0x050b,
+	NES_AEQE_AEID_RESET_SENT                                      = 0x0601,
+	NES_AEQE_AEID_TERMINATE_SENT                                  = 0x0602,
+	NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC                      = 0x0700
+};
+
+enum nes_iwarp_sq_opcodes {
+	NES_IWARP_SQ_WQE_WRPDU = (1<<15),
+	NES_IWARP_SQ_WQE_PSH = (1<<21),
+	NES_IWARP_SQ_WQE_STREAMING = (1<<23),
+	NES_IWARP_SQ_WQE_IMM_DATA = (1<<28),
+	NES_IWARP_SQ_WQE_READ_FENCE = (1<<29),
+	NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30),
+	NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31),
+};
+
+enum nes_iwarp_sq_wqe_bits {
+	NES_IWARP_SQ_OP_RDMAW = 0,
+	NES_IWARP_SQ_OP_RDMAR = 1,
+	NES_IWARP_SQ_OP_SEND = 3,
+	NES_IWARP_SQ_OP_SENDINV = 4,
+	NES_IWARP_SQ_OP_SENDSE = 5,
+	NES_IWARP_SQ_OP_SENDSEINV = 6,
+	NES_IWARP_SQ_OP_BIND = 8,
+	NES_IWARP_SQ_OP_FAST_REG = 9,
+	NES_IWARP_SQ_OP_LOCINV = 10,
+	NES_IWARP_SQ_OP_RDMAR_LOCINV = 11,
+	NES_IWARP_SQ_OP_NOP = 12,
+};
+
+#define NES_EEPROM_READ_REQUEST (1<<16)
+#define NES_MAC_ADDR_VALID      (1<<20)
+
+/*
+ * NES index registers init values.
+ */
+struct nes_init_values {
+	u32 index;
+	u32 data;
+	u8  wrt;
+};
+
+/*
+ * NES registers in BAR0.
+ */
+struct nes_pci_regs {
+	u32 int_status;
+	u32 int_mask;
+	u32 int_pending;
+	u32 intf_int_status;
+	u32 intf_int_mask;
+	u32 other_regs[59];	 /* pad out to 256 bytes for now */
+};
+
+#define NES_CQP_SQ_SIZE    128
+#define NES_CCQ_SIZE       128
+#define NES_NIC_WQ_SIZE    512
+#define NES_NIC_CTX_SIZE   ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512))
+#define NES_NIC_BACK_STORE 0x00038000
+
+struct nes_device;
+
+struct nes_hw_nic_qp_context {
+	__le32 context_words[6];
+};
+
+struct nes_hw_nic_sq_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_nic_rq_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_nic_cqe {
+	__le32 cqe_words[4];
+};
+
+struct nes_hw_cqp_qp_context {
+	__le32 context_words[4];
+};
+
+struct nes_hw_cqp_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_qp_wqe {
+	__le32 wqe_words[32];
+};
+
+struct nes_hw_cqe {
+	__le32 cqe_words[8];
+};
+
+struct nes_hw_ceqe {
+	__le32 ceqe_words[2];
+};
+
+struct nes_hw_aeqe {
+	__le32 aeqe_words[4];
+};
+
+struct nes_cqp_request {
+	union {
+		u64 cqp_callback_context;
+		void *cqp_callback_pointer;
+	};
+	wait_queue_head_t     waitq;
+	struct nes_hw_cqp_wqe cqp_wqe;
+	struct list_head      list;
+	atomic_t              refcount;
+	void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request);
+	u16                   major_code;
+	u16                   minor_code;
+	u8                    waiting;
+	u8                    request_done;
+	u8                    dynamic;
+	u8                    callback;
+};
+
+struct nes_hw_cqp {
+	struct nes_hw_cqp_wqe *sq_vbase;
+	dma_addr_t            sq_pbase;
+	spinlock_t            lock;
+	wait_queue_head_t     waitq;
+	u16                   qp_id;
+	u16                   sq_head;
+	u16                   sq_tail;
+	u16                   sq_size;
+};
+
+#define NES_FIRST_FRAG_SIZE 128
+struct nes_first_frag {
+	u8 buffer[NES_FIRST_FRAG_SIZE];
+};
+
+struct nes_hw_nic {
+	struct nes_first_frag    *first_frag_vbase;	/* virtual address of first frags */
+	struct nes_hw_nic_sq_wqe *sq_vbase;			/* virtual address of sq */
+	struct nes_hw_nic_rq_wqe *rq_vbase;			/* virtual address of rq */
+	struct sk_buff           *tx_skb[NES_NIC_WQ_SIZE];
+	struct sk_buff           *rx_skb[NES_NIC_WQ_SIZE];
+	dma_addr_t frag_paddr[NES_NIC_WQ_SIZE];
+	unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)];
+	dma_addr_t sq_pbase;			/* PCI memory for host rings */
+	dma_addr_t rq_pbase;			/* PCI memory for host rings */
+
+	u16 qp_id;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 sq_size;
+	u16 rq_head;
+	u16 rq_tail;
+	u16 rq_size;
+	u8 replenishing_rq;
+	u8 reserved;
+
+	spinlock_t sq_lock;
+	spinlock_t rq_lock;
+};
+
+struct nes_hw_nic_cq {
+	struct nes_hw_nic_cqe volatile *cq_vbase;	/* PCI memory for host rings */
+	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
+	dma_addr_t cq_pbase;	/* PCI memory for host rings */
+	int rx_cqes_completed;
+	int cqe_allocs_pending;
+	int rx_pkts_indicated;
+	u16 cq_head;
+	u16 cq_size;
+	u16 cq_number;
+	u8  cqes_pending;
+};
+
+struct nes_hw_qp {
+	struct nes_hw_qp_wqe *sq_vbase;		/* PCI memory for host rings */
+	struct nes_hw_qp_wqe *rq_vbase;		/* PCI memory for host rings */
+	void                 *q2_vbase;			/* PCI memory for host rings */
+	dma_addr_t sq_pbase;	/* PCI memory for host rings */
+	dma_addr_t rq_pbase;	/* PCI memory for host rings */
+	dma_addr_t q2_pbase;	/* PCI memory for host rings */
+	u32 qp_id;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 sq_size;
+	u16 rq_head;
+	u16 rq_tail;
+	u16 rq_size;
+	u8  rq_encoded_size;
+	u8  sq_encoded_size;
+};
+
+struct nes_hw_cq {
+	struct nes_hw_cqe volatile *cq_vbase;	/* PCI memory for host rings */
+	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq);
+	dma_addr_t cq_pbase;	/* PCI memory for host rings */
+	u16 cq_head;
+	u16 cq_size;
+	u16 cq_number;
+};
+
+struct nes_hw_ceq {
+	struct nes_hw_ceqe volatile *ceq_vbase;	/* PCI memory for host rings */
+	dma_addr_t ceq_pbase;	/* PCI memory for host rings */
+	u16 ceq_head;
+	u16 ceq_size;
+};
+
+struct nes_hw_aeq {
+	struct nes_hw_aeqe volatile *aeq_vbase;	/* PCI memory for host rings */
+	dma_addr_t aeq_pbase;	/* PCI memory for host rings */
+	u16 aeq_head;
+	u16 aeq_size;
+};
+
+struct nic_qp_map {
+	u8 qpid;
+	u8 nic_index;
+	u8 logical_port;
+	u8 is_hnic;
+};
+
+#define	NES_CQP_ARP_AEQ_INDEX_MASK  0x000f0000
+#define	NES_CQP_ARP_AEQ_INDEX_SHIFT 16
+
+#define NES_CQP_APBVT_ADD			0x00008000
+#define NES_CQP_APBVT_NIC_SHIFT		16
+
+#define NES_ARP_ADD     1
+#define NES_ARP_DELETE  2
+#define NES_ARP_RESOLVE 3
+
+#define NES_MAC_SW_IDLE      0
+#define NES_MAC_SW_INTERRUPT 1
+#define NES_MAC_SW_MH        2
+
+struct nes_arp_entry {
+	u32 ip_addr;
+	u8  mac_addr[ETH_ALEN];
+};
+
+#define NES_NIC_FAST_TIMER          96
+#define NES_NIC_FAST_TIMER_LOW      40
+#define NES_NIC_FAST_TIMER_HIGH     1000
+#define DEFAULT_NES_QL_HIGH         256
+#define DEFAULT_NES_QL_LOW          16
+#define DEFAULT_NES_QL_TARGET       64
+#define DEFAULT_JUMBO_NES_QL_LOW    12
+#define DEFAULT_JUMBO_NES_QL_TARGET 40
+#define DEFAULT_JUMBO_NES_QL_HIGH   128
+#define NES_NIC_CQ_DOWNWARD_TREND   8
+
+struct nes_hw_tune_timer {
+    //u16 cq_count;
+    u16 threshold_low;
+    u16 threshold_target;
+    u16 threshold_high;
+    u16 timer_in_use;
+    u16 timer_in_use_old;
+    u16 timer_in_use_min;
+    u16 timer_in_use_max;
+    u8  timer_direction_upward;
+    u8  timer_direction_downward;
+    u16 cq_count_old;
+    u8  cq_direction_downward;
+};
+
+#define NES_TIMER_INT_LIMIT         2
+#define NES_TIMER_INT_LIMIT_DYNAMIC 10
+#define NES_TIMER_ENABLE_LIMIT      4
+#define NES_MAX_LINK_INTERRUPTS		128
+#define NES_MAX_LINK_CHECK		200
+
+struct nes_adapter {
+	u64              fw_ver;
+	unsigned long    *allocated_qps;
+	unsigned long    *allocated_cqs;
+	unsigned long    *allocated_mrs;
+	unsigned long    *allocated_pds;
+	unsigned long    *allocated_arps;
+	struct nes_qp    **qp_table;
+	struct workqueue_struct *work_q;
+
+	struct list_head list;
+	struct list_head active_listeners;
+	/* list of the netdev's associated with each logical port */
+	struct list_head nesvnic_list[4];
+
+	struct timer_list  mh_timer;
+	struct timer_list  lc_timer;
+	struct work_struct work;
+	spinlock_t         resource_lock;
+	spinlock_t         phy_lock;
+	spinlock_t         pbl_lock;
+	spinlock_t         periodic_timer_lock;
+
+	struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE];
+
+	/* Adapter CEQ and AEQs */
+	struct nes_hw_ceq ceq[16];
+	struct nes_hw_aeq aeq[8];
+
+	struct nes_hw_tune_timer tune_timer;
+
+	unsigned long doorbell_start;
+
+	u32 hw_rev;
+	u32 vendor_id;
+	u32 vendor_part_id;
+	u32 device_cap_flags;
+	u32 tick_delta;
+	u32 timer_int_req;
+	u32 arp_table_size;
+	u32 next_arp_index;
+
+	u32 max_mr;
+	u32 max_256pbl;
+	u32 max_4kpbl;
+	u32 free_256pbl;
+	u32 free_4kpbl;
+	u32 max_mr_size;
+	u32 max_qp;
+	u32 next_qp;
+	u32 max_irrq;
+	u32 max_qp_wr;
+	u32 max_sge;
+	u32 max_cq;
+	u32 next_cq;
+	u32 max_cqe;
+	u32 max_pd;
+	u32 base_pd;
+	u32 next_pd;
+	u32 hte_index_mask;
+
+	/* EEPROM information */
+	u32 rx_pool_size;
+	u32 tx_pool_size;
+	u32 rx_threshold;
+	u32 tcp_timer_core_clk_divisor;
+	u32 iwarp_config;
+	u32 cm_config;
+	u32 sws_timer_config;
+	u32 tcp_config1;
+	u32 wqm_wat;
+	u32 core_clock;
+	u32 firmware_version;
+
+	u32 nic_rx_eth_route_err;
+
+	u32 et_rx_coalesce_usecs;
+	u32	et_rx_max_coalesced_frames;
+	u32 et_rx_coalesce_usecs_irq;
+	u32 et_rx_max_coalesced_frames_irq;
+	u32 et_pkt_rate_low;
+	u32 et_rx_coalesce_usecs_low;
+	u32 et_rx_max_coalesced_frames_low;
+	u32 et_pkt_rate_high;
+	u32 et_rx_coalesce_usecs_high;
+	u32 et_rx_max_coalesced_frames_high;
+	u32 et_rate_sample_interval;
+	u32 timer_int_limit;
+
+	/* Adapter base MAC address */
+	u32 mac_addr_low;
+	u16 mac_addr_high;
+
+	u16 firmware_eeprom_offset;
+	u16 software_eeprom_offset;
+
+	u16 max_irrq_wr;
+
+	/* pd config for each port */
+	u16 pd_config_size[4];
+	u16 pd_config_base[4];
+
+	u16 link_interrupt_count[4];
+
+	/* the phy index for each port */
+	u8  phy_index[4];
+	u8  mac_sw_state[4];
+	u8  mac_link_down[4];
+	u8  phy_type[4];
+
+	/* PCI information */
+	unsigned int  devfn;
+	unsigned char bus_number;
+	unsigned char OneG_Mode;
+
+	unsigned char ref_count;
+	u8            netdev_count;
+	u8            netdev_max;	/* from host nic address count in EEPROM */
+	u8            port_count;
+	u8            virtwq;
+	u8            et_use_adaptive_rx_coalesce;
+	u8            adapter_fcn_count;
+};
+
+struct nes_pbl {
+	u64              *pbl_vbase;
+	dma_addr_t       pbl_pbase;
+	struct page      *page;
+	unsigned long    user_base;
+	u32              pbl_size;
+	struct list_head list;
+	/* TODO: need to add list for two level tables */
+};
+
+struct nes_listener {
+	struct work_struct      work;
+	struct workqueue_struct *wq;
+	struct nes_vnic         *nesvnic;
+	struct iw_cm_id         *cm_id;
+	struct list_head        list;
+	unsigned long           socket;
+	u8                      accept_failed;
+};
+
+struct nes_ib_device;
+
+struct nes_vnic {
+	struct nes_ib_device *nesibdev;
+	u64 sq_full;
+	u64 sq_locked;
+	u64 tso_requests;
+	u64 segmented_tso_requests;
+	u64 linearized_skbs;
+	u64 tx_sw_dropped;
+	u64 endnode_nstat_rx_discard;
+	u64 endnode_nstat_rx_octets;
+	u64 endnode_nstat_rx_frames;
+	u64 endnode_nstat_tx_octets;
+	u64 endnode_nstat_tx_frames;
+	u64 endnode_ipv4_tcp_retransmits;
+	/* void *mem; */
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	struct vlan_group *vlan_grp;
+	atomic_t          rx_skbs_needed;
+	atomic_t          rx_skb_timer_running;
+	int               budget;
+	u32               msg_enable;
+	/* u32 tx_avail; */
+	__be32            local_ipaddr;
+	struct napi_struct   napi;
+	spinlock_t           tx_lock;	/* could use netdev tx lock? */
+	struct timer_list    rq_wqes_timer;
+	u32                  nic_mem_size;
+	void                 *nic_vbase;
+	dma_addr_t           nic_pbase;
+	struct nes_hw_nic    nic;
+	struct nes_hw_nic_cq nic_cq;
+	u32    mcrq_qp_id;
+	struct nes_ucontext *mcrq_ucontext;
+	struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
+	void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *, int);
+	int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
+	struct net_device_stats netstats;
+	/* used to put the netdev on the adapters logical port list */
+	struct list_head list;
+	u16 max_frame_size;
+	u8  netdev_open;
+	u8  linkup;
+	u8  logical_port;
+	u8  netdev_index;  /* might not be needed, indexes nesdev->netdev */
+	u8  perfect_filter_index;
+	u8  nic_index;
+	u8  qp_nic_index[4];
+	u8  next_qp_nic_index;
+	u8  of_device_registered;
+	u8  rdma_enabled;
+	u8  rx_checksum_disabled;
+};
+
+struct nes_ib_device {
+	struct ib_device ibdev;
+	struct nes_vnic *nesvnic;
+
+	/* Virtual RNIC Limits */
+	u32 max_mr;
+	u32 max_qp;
+	u32 max_cq;
+	u32 max_pd;
+	u32 num_mr;
+	u32 num_qp;
+	u32 num_cq;
+	u32 num_pd;
+};
+
+#define nes_vlan_rx vlan_hwaccel_receive_skb
+#define nes_netif_rx netif_receive_skb
+
+#endif		/* __NES_HW_H */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
new file mode 100644
index 0000000..b6cc265
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -0,0 +1,1703 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ethtool.h>
+#include <net/tcp.h>
+
+#include <net/inet_common.h>
+#include <linux/inet.h>
+
+#include "nes.h"
+
+static struct nic_qp_map nic_qp_mapping_0[] = {
+	{16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0},
+	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0},
+	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_1[] = {
+	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_2[] = {
+	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_3[] = {
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_4[] = {
+	{28,8,0,0},{32,12,0,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_5[] = {
+	{29,9,1,0},{33,13,1,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_6[] = {
+	{30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_7[] = {
+	{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map *nic_qp_mapping_per_function[] = {
+	nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3,
+	nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7
+};
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+		| NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+static int debug = -1;
+
+
+static int nes_netdev_open(struct net_device *);
+static int nes_netdev_stop(struct net_device *);
+static int nes_netdev_start_xmit(struct sk_buff *, struct net_device *);
+static struct net_device_stats *nes_netdev_get_stats(struct net_device *);
+static void nes_netdev_tx_timeout(struct net_device *);
+static int nes_netdev_set_mac_address(struct net_device *, void *);
+static int nes_netdev_change_mtu(struct net_device *, int);
+
+/**
+ * nes_netdev_poll
+ */
+static int nes_netdev_poll(struct napi_struct *napi, int budget)
+{
+	struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi);
+	struct net_device *netdev = nesvnic->netdev;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq;
+
+	nesvnic->budget = budget;
+	nescq->cqes_pending = 0;
+	nescq->rx_cqes_completed = 0;
+	nescq->cqe_allocs_pending = 0;
+	nescq->rx_pkts_indicated = 0;
+
+	nes_nic_ce_handler(nesdev, nescq);
+
+	if (nescq->cqes_pending == 0) {
+		netif_rx_complete(netdev, napi);
+		/* clear out completed cqes and arm */
+		nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+		nes_read32(nesdev->regs+NES_CQE_ALLOC);
+	} else {
+		/* clear out completed cqes but don't arm */
+		nes_write32(nesdev->regs+NES_CQE_ALLOC,
+				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+		nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n",
+				nesvnic->netdev->name);
+	}
+	return nescq->rx_pkts_indicated;
+}
+
+
+/**
+ * nes_netdev_open - Activate the network interface; ifconfig
+ * ethx up.
+ */
+static int nes_netdev_open(struct net_device *netdev)
+{
+	u32 macaddr_low;
+	u16 macaddr_high;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	int ret;
+	int i;
+	struct nes_vnic *first_nesvnic;
+	u32 nic_active_bit;
+	u32 nic_active;
+
+	assert(nesdev != NULL);
+
+	first_nesvnic = list_entry(nesdev->nesadapter->nesvnic_list[nesdev->mac_index].next,
+			struct nes_vnic, list);
+
+	if (netif_msg_ifup(nesvnic))
+		printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name);
+
+	ret = nes_init_nic_qp(nesdev, netdev);
+	if (ret) {
+		return ret;
+	}
+
+	netif_carrier_off(netdev);
+	netif_stop_queue(netdev);
+
+	if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) {
+		nesvnic->nesibdev = nes_init_ofa_device(netdev);
+		if (nesvnic->nesibdev == NULL) {
+			printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name);
+		} else {
+			nesvnic->nesibdev->nesvnic = nesvnic;
+			ret = nes_register_ofa_device(nesvnic->nesibdev);
+			if (ret) {
+				printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n",
+						netdev->name, ret);
+			}
+		}
+	}
+	/* Set packet filters */
+	nic_active_bit = 1 << nesvnic->nic_index;
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
+	macaddr_high = ((u16)netdev->dev_addr[0]) << 8;
+	macaddr_high += (u16)netdev->dev_addr[1];
+	macaddr_low = ((u32)netdev->dev_addr[2]) << 24;
+	macaddr_low += ((u32)netdev->dev_addr[3]) << 16;
+	macaddr_low += ((u32)netdev->dev_addr[4]) << 8;
+	macaddr_low += (u32)netdev->dev_addr[5];
+
+	/* Program the various MAC regs */
+	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+		if (nesvnic->qp_nic_index[i] == 0xf) {
+			break;
+		}
+		nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW"
+				" (Addr:%08X) = %08X, HIGH = %08X.\n",
+				i, nesvnic->qp_nic_index[i],
+				NES_IDX_PERFECT_FILTER_LOW+((nesvnic->perfect_filter_index + i) * 8),
+				macaddr_low,
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+				macaddr_low);
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+	}
+
+
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			nesvnic->nic_cq.cq_number);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+	if (first_nesvnic->linkup) {
+		/* Enable network packets */
+		nesvnic->linkup = 1;
+		netif_start_queue(netdev);
+		netif_carrier_on(netdev);
+	}
+	napi_enable(&nesvnic->napi);
+	nesvnic->netdev_open = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_netdev_stop
+ */
+static int nes_netdev_stop(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 nic_active_mask;
+	u32 nic_active;
+
+	nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n",
+			nesvnic, nesdev, netdev, netdev->name);
+	if (nesvnic->netdev_open == 0)
+		return 0;
+
+	if (netif_msg_ifdown(nesvnic))
+		printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name);
+
+	/* Disable network packets */
+	napi_disable(&nesvnic->napi);
+	netif_stop_queue(netdev);
+	if ((nesdev->netdev[0] == netdev) & (nesvnic->logical_port == nesdev->mac_index)) {
+		nes_write_indexed(nesdev,
+				NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff);
+	}
+
+	nic_active_mask = ~((u32)(1 << nesvnic->nic_index));
+	nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+
+			(nesvnic->perfect_filter_index*8), 0);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
+
+	if (nesvnic->of_device_registered) {
+		nes_destroy_ofa_device(nesvnic->nesibdev);
+		nesvnic->nesibdev = NULL;
+		nesvnic->of_device_registered = 0;
+	}
+	nes_destroy_nic_qp(nesvnic);
+
+	nesvnic->netdev_open = 0;
+
+	return 0;
+}
+
+
+/**
+ * nes_nic_send
+ */
+static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic *nesnic = &nesvnic->nic;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct tcphdr *tcph;
+	__le16 *wqe_fragment_length;
+	u32 wqe_misc;
+	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
+	u16 skb_fragment_index;
+	dma_addr_t bus_address;
+
+	nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+	wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+
+	/* setup the VLAN tag if present */
+	if (vlan_tx_tag_present(skb)) {
+		nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+				netdev->name, vlan_tx_tag_get(skb));
+		wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+		wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+	} else
+		wqe_misc = 0;
+
+	/* bump past the vlan tag */
+	wqe_fragment_length++;
+	/*	wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcph = tcp_hdr(skb);
+		if (1) {
+			if (skb_is_gso(skb)) {
+				/* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... seg size = %u\n",
+						netdev->name, skb_is_gso(skb)); */
+				wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE |
+						NES_NIC_SQ_WQE_COMPLETION | (u16)skb_is_gso(skb);
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+						((u32)tcph->doff) |
+						(((u32)(((unsigned char *)tcph) - skb->data)) << 4));
+			} else {
+				wqe_misc |= NES_NIC_SQ_WQE_COMPLETION;
+			}
+		}
+	} else {	/* CHECKSUM_HW */
+		wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM | NES_NIC_SQ_WQE_COMPLETION;
+	}
+
+	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+				skb->len);
+	memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+			skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb)));
+	wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+			skb_headlen(skb)));
+	wqe_fragment_length[1] = 0;
+	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+		if ((skb_shinfo(skb)->nr_frags + 1) > 4) {
+			nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n",
+					netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
+			kfree_skb(skb);
+			nesvnic->tx_sw_dropped++;
+			return NETDEV_TX_LOCKED;
+		}
+		set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+		bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
+				skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE);
+		wqe_fragment_length[wqe_fragment_index++] =
+				cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE);
+		wqe_fragment_length[wqe_fragment_index] = 0;
+		set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+				((u64)(bus_address)));
+		nesnic->tx_skb[nesnic->sq_head] = skb;
+	}
+
+	if (skb_headlen(skb) == skb->len) {
+		if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
+			nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
+			nesnic->tx_skb[nesnic->sq_head] = NULL;
+			dev_kfree_skb(skb);
+		}
+	} else {
+		/* Deal with Fragments */
+		nesnic->tx_skb[nesnic->sq_head] = skb;
+		for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags;
+				skb_fragment_index++) {
+			bus_address = pci_map_page( nesdev->pcidev,
+					skb_shinfo(skb)->frags[skb_fragment_index].page,
+					skb_shinfo(skb)->frags[skb_fragment_index].page_offset,
+					skb_shinfo(skb)->frags[skb_fragment_index].size,
+					PCI_DMA_TODEVICE);
+			wqe_fragment_length[wqe_fragment_index] =
+					cpu_to_le16(skb_shinfo(skb)->frags[skb_fragment_index].size);
+			set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+				bus_address);
+			wqe_fragment_index++;
+			if (wqe_fragment_index < 5)
+				wqe_fragment_length[wqe_fragment_index] = 0;
+		}
+	}
+
+	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
+	nesnic->sq_head++;
+	nesnic->sq_head &= nesnic->sq_size - 1;
+
+	return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_start_xmit
+ */
+static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic *nesnic = &nesvnic->nic;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct tcphdr *tcph;
+	/* struct udphdr *udph; */
+#define NES_MAX_TSO_FRAGS 18
+	/* 64K segment plus overflow on each side */
+	dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS];
+	dma_addr_t bus_address;
+	u32 tso_frag_index;
+	u32 tso_frag_count;
+	u32 tso_wqe_length;
+	u32 curr_tcp_seq;
+	u32 wqe_count=1;
+	u32 send_rc;
+	struct iphdr *iph;
+	unsigned long flags;
+	__le16 *wqe_fragment_length;
+	u32 nr_frags;
+	u32 original_first_length;
+//	u64 *wqe_fragment_address;
+	/* first fragment (0) is used by copy buffer */
+	u16 wqe_fragment_index=1;
+	u16 hoffset;
+	u16 nhoffset;
+	u16 wqes_needed;
+	u16 wqes_available;
+	u32 old_head;
+	u32 wqe_misc;
+
+	/* nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+			" (%u frags), tso_size=%u\n",
+			netdev->name, skb->len, skb_headlen(skb),
+			skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
+	*/
+
+	if (!netif_carrier_ok(netdev))
+		return NETDEV_TX_OK;
+
+	if (netif_queue_stopped(netdev))
+		return NETDEV_TX_BUSY;
+
+	local_irq_save(flags);
+	if (!spin_trylock(&nesnic->sq_lock)) {
+		local_irq_restore(flags);
+		nesvnic->sq_locked++;
+		return NETDEV_TX_LOCKED;
+	}
+
+	/* Check if SQ is full */
+	if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
+		if (!netif_queue_stopped(netdev)) {
+			netif_stop_queue(netdev);
+			barrier();
+			if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) {
+				netif_start_queue(netdev);
+				goto sq_no_longer_full;
+			}
+		}
+		nesvnic->sq_full++;
+		spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+		return NETDEV_TX_BUSY;
+	}
+
+sq_no_longer_full:
+	nr_frags = skb_shinfo(skb)->nr_frags;
+	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+		nr_frags++;
+	}
+	/* Check if too many fragments */
+	if (unlikely((nr_frags > 4))) {
+		if (skb_is_gso(skb)) {
+			nesvnic->segmented_tso_requests++;
+			nesvnic->tso_requests++;
+			old_head = nesnic->sq_head;
+			/* Basically 4 fragments available per WQE with extended fragments */
+			wqes_needed = nr_frags >> 2;
+			wqes_needed += (nr_frags&3)?1:0;
+			wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) &
+					(nesnic->sq_size - 1);
+
+			if (unlikely(wqes_needed > wqes_available)) {
+				if (!netif_queue_stopped(netdev)) {
+					netif_stop_queue(netdev);
+					barrier();
+					wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) &
+						(nesnic->sq_size - 1);
+					if (wqes_needed <= wqes_available) {
+						netif_start_queue(netdev);
+						goto tso_sq_no_longer_full;
+					}
+				}
+				nesvnic->sq_full++;
+				spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+				nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
+						netdev->name);
+				return NETDEV_TX_BUSY;
+			}
+tso_sq_no_longer_full:
+			/* Map all the buffers */
+			for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags;
+					tso_frag_count++) {
+				tso_bus_address[tso_frag_count] = pci_map_page( nesdev->pcidev,
+						skb_shinfo(skb)->frags[tso_frag_count].page,
+						skb_shinfo(skb)->frags[tso_frag_count].page_offset,
+						skb_shinfo(skb)->frags[tso_frag_count].size,
+						PCI_DMA_TODEVICE);
+			}
+
+			tso_frag_index = 0;
+			curr_tcp_seq = ntohl(tcp_hdr(skb)->seq);
+			hoffset = skb_transport_header(skb) - skb->data;
+			nhoffset = skb_network_header(skb) - skb->data;
+			original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2);
+
+			for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) {
+				tso_wqe_length = 0;
+				nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+				wqe_fragment_length =
+						(__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+				/* setup the VLAN tag if present */
+				if (vlan_tx_tag_present(skb)) {
+					nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+							netdev->name, vlan_tx_tag_get(skb) );
+					wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+					wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+				} else
+					wqe_misc = 0;
+
+				/* bump past the vlan tag */
+				wqe_fragment_length++;
+
+				/* Assumes header totally fits in allocated buffer and is in first fragment */
+				if (original_first_length > NES_FIRST_FRAG_SIZE) {
+					nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n",
+							original_first_length, NES_FIRST_FRAG_SIZE);
+					nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+							" (%u frags), tso_size=%u\n",
+							netdev->name,
+							skb->len, skb_headlen(skb),
+							skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
+				}
+				memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+						skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE),
+						original_first_length));
+				iph = (struct iphdr *)
+				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]);
+				tcph = (struct tcphdr *)
+				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]);
+				if ((wqe_count+1)!=(u32)wqes_needed) {
+					tcph->fin = 0;
+					tcph->psh = 0;
+					tcph->rst = 0;
+					tcph->urg = 0;
+				}
+				if (wqe_count) {
+					tcph->syn = 0;
+				}
+				tcph->seq = htonl(curr_tcp_seq);
+				wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+						original_first_length));
+
+				wqe_fragment_index = 1;
+				if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) {
+					set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+					bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length,
+							skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE);
+					wqe_fragment_length[wqe_fragment_index++] =
+						cpu_to_le16(skb_headlen(skb) - original_first_length);
+					wqe_fragment_length[wqe_fragment_index] = 0;
+					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+									bus_address);
+				}
+				while (wqe_fragment_index < 5) {
+					wqe_fragment_length[wqe_fragment_index] =
+							cpu_to_le16(skb_shinfo(skb)->frags[tso_frag_index].size);
+					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+						(u64)tso_bus_address[tso_frag_index]);
+					wqe_fragment_index++;
+					tso_wqe_length += skb_shinfo(skb)->frags[tso_frag_index++].size;
+					if (wqe_fragment_index < 5)
+						wqe_fragment_length[wqe_fragment_index] = 0;
+					if (tso_frag_index == tso_frag_count)
+						break;
+				}
+				if ((wqe_count+1) == (u32)wqes_needed) {
+					nesnic->tx_skb[nesnic->sq_head] = skb;
+				} else {
+					nesnic->tx_skb[nesnic->sq_head] = NULL;
+				}
+				wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_is_gso(skb);
+				if ((tso_wqe_length + original_first_length) > skb_is_gso(skb)) {
+					wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE;
+				} else {
+					iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset);
+				}
+
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX,
+						 wqe_misc);
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+						((u32)tcph->doff) | (((u32)hoffset) << 4));
+
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+						tso_wqe_length + original_first_length);
+				curr_tcp_seq += tso_wqe_length;
+				nesnic->sq_head++;
+				nesnic->sq_head &= nesnic->sq_size-1;
+			}
+		} else {
+			nesvnic->linearized_skbs++;
+			hoffset = skb_transport_header(skb) - skb->data;
+			nhoffset = skb_network_header(skb) - skb->data;
+			skb_linearize(skb);
+			skb_set_transport_header(skb, hoffset);
+			skb_set_network_header(skb, nhoffset);
+			send_rc = nes_nic_send(skb, netdev);
+			if (send_rc != NETDEV_TX_OK) {
+				spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+				return NETDEV_TX_OK;
+			}
+		}
+	} else {
+		send_rc = nes_nic_send(skb, netdev);
+		if (send_rc != NETDEV_TX_OK) {
+			spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+			return NETDEV_TX_OK;
+		}
+	}
+
+	barrier();
+
+	if (wqe_count)
+		nes_write32(nesdev->regs+NES_WQE_ALLOC,
+				(wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
+
+	netdev->trans_start = jiffies;
+	spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+
+	return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_get_stats
+ */
+static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u64 u64temp;
+	u32 u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->endnode_nstat_rx_discard += u32temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_rx_octets += u64temp;
+	nesvnic->netstats.rx_bytes += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_rx_frames += u64temp;
+	nesvnic->netstats.rx_packets += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_tx_octets += u64temp;
+	nesvnic->netstats.tx_bytes += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_tx_frames += u64temp;
+	nesvnic->netstats.tx_packets += u64temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_short_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_length_errors += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_crc_errors += u32temp;
+	nesvnic->netstats.rx_crc_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_tx_errors += u32temp;
+	nesvnic->netstats.tx_errors += u32temp;
+
+	return &nesvnic->netstats;
+}
+
+
+/**
+ * nes_netdev_tx_timeout
+ */
+static void nes_netdev_tx_timeout(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	if (netif_msg_timer(nesvnic))
+		nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name);
+}
+
+
+/**
+ * nes_netdev_set_mac_address
+ */
+static int nes_netdev_set_mac_address(struct net_device *netdev, void *p)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct sockaddr *mac_addr = p;
+	int i;
+	u32 macaddr_low;
+	u16 macaddr_high;
+
+	if (!is_valid_ether_addr(mac_addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len);
+	printk(PFX "%s: Address length = %d, Address = %02X%02X%02X%02X%02X%02X..\n",
+		   __FUNCTION__, netdev->addr_len,
+		   mac_addr->sa_data[0], mac_addr->sa_data[1],
+		   mac_addr->sa_data[2], mac_addr->sa_data[3],
+		   mac_addr->sa_data[4], mac_addr->sa_data[5]);
+	macaddr_high = ((u16)netdev->dev_addr[0]) << 8;
+	macaddr_high += (u16)netdev->dev_addr[1];
+	macaddr_low = ((u32)netdev->dev_addr[2]) << 24;
+	macaddr_low += ((u32)netdev->dev_addr[3]) << 16;
+	macaddr_low += ((u32)netdev->dev_addr[4]) << 8;
+	macaddr_low += (u32)netdev->dev_addr[5];
+
+	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+		if (nesvnic->qp_nic_index[i] == 0xf) {
+			break;
+		}
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+				macaddr_low);
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+	}
+	return 0;
+}
+
+
+/**
+ * nes_netdev_set_multicast_list
+ */
+void nes_netdev_set_multicast_list(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct dev_mc_list *multicast_addr;
+	u32 nic_active_bit;
+	u32 nic_active;
+	u32 perfect_filter_register_address;
+	u32 macaddr_low;
+	u16 macaddr_high;
+	u8 mc_all_on = 0;
+	u8 mc_index;
+	int mc_nic_index = -1;
+
+	nic_active_bit = 1 << nesvnic->nic_index;
+
+	if (netdev->flags & IFF_PROMISC) {
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+		nic_active |= nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active |= nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+		mc_all_on = 1;
+	} else if ((netdev->flags & IFF_ALLMULTI) || (netdev->mc_count > NES_MULTICAST_PF_MAX) ||
+			   (nesvnic->nic_index > 3)) {
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+		nic_active |= nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active &= ~nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+		mc_all_on = 1;
+	} else {
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+		nic_active &= ~nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active &= ~nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+	}
+
+	nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscous = %d, All Multicast = %d.\n",
+			  netdev->mc_count, (netdev->flags & IFF_PROMISC)?1:0,
+			  (netdev->flags & IFF_ALLMULTI)?1:0);
+	if (!mc_all_on) {
+		multicast_addr = netdev->mc_list;
+		perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + 0x80;
+		perfect_filter_register_address += nesvnic->nic_index*0x40;
+		for (mc_index=0; mc_index < NES_MULTICAST_PF_MAX; mc_index++) {
+			while (multicast_addr && nesvnic->mcrq_mcast_filter && ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, multicast_addr->dmi_addr)) == 0))
+				multicast_addr = multicast_addr->next;
+
+			if (mc_nic_index < 0)
+				mc_nic_index = nesvnic->nic_index;
+			if (multicast_addr) {
+				nes_debug(NES_DBG_NIC_RX, "Assigning MC Address = %02X%02X%02X%02X%02X%02X to register 0x%04X nic_idx=%d\n",
+						  multicast_addr->dmi_addr[0], multicast_addr->dmi_addr[1],
+						  multicast_addr->dmi_addr[2], multicast_addr->dmi_addr[3],
+						  multicast_addr->dmi_addr[4], multicast_addr->dmi_addr[5],
+						  perfect_filter_register_address+(mc_index * 8), mc_nic_index);
+				macaddr_high = ((u16)multicast_addr->dmi_addr[0]) << 8;
+				macaddr_high += (u16)multicast_addr->dmi_addr[1];
+				macaddr_low = ((u32)multicast_addr->dmi_addr[2]) << 24;
+				macaddr_low += ((u32)multicast_addr->dmi_addr[3]) << 16;
+				macaddr_low += ((u32)multicast_addr->dmi_addr[4]) << 8;
+				macaddr_low += (u32)multicast_addr->dmi_addr[5];
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+(mc_index * 8),
+						macaddr_low);
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+4+(mc_index * 8),
+						(u32)macaddr_high | NES_MAC_ADDR_VALID |
+						((((u32)(1<<mc_nic_index)) << 16)));
+				multicast_addr = multicast_addr->next;
+			} else {
+				nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n",
+						  perfect_filter_register_address+(mc_index * 8));
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+4+(mc_index * 8),
+						0);
+			}
+		}
+	}
+}
+
+
+/**
+ * nes_netdev_change_mtu
+ */
+static int nes_netdev_change_mtu(struct	net_device *netdev,	int	new_mtu)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev =	nesvnic->nesdev;
+	int	ret	= 0;
+	u8 jumbomode=0;
+
+	if ((new_mtu < ETH_ZLEN) ||	(new_mtu > max_mtu))
+		return -EINVAL;
+
+	netdev->mtu	= new_mtu;
+	nesvnic->max_frame_size	= new_mtu+ETH_HLEN;
+
+	if (netdev->mtu	> 1500)	{
+		jumbomode=1;
+	}
+	nes_nic_init_timer_defaults(nesdev,	jumbomode);
+
+	if (netif_running(netdev)) {
+		nes_netdev_stop(netdev);
+		nes_netdev_open(netdev);
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_netdev_exit - destroy network device
+ */
+void nes_netdev_exit(struct nes_vnic *nesvnic)
+{
+	struct net_device *netdev = nesvnic->netdev;
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+	nes_debug(NES_DBG_SHUTDOWN, "\n");
+
+	// destroy the ibdevice if RDMA enabled
+	if ((nesvnic->rdma_enabled)&&(nesvnic->of_device_registered)) {
+		nes_destroy_ofa_device( nesibdev );
+		nesvnic->of_device_registered = 0;
+		nesvnic->nesibdev = NULL;
+	}
+	unregister_netdev(netdev);
+	nes_debug(NES_DBG_SHUTDOWN, "\n");
+}
+
+
+#define NES_ETHTOOL_STAT_COUNT 55
+static const char nes_ethtool_stringset[NES_ETHTOOL_STAT_COUNT][ETH_GSTRING_LEN] = {
+	"Link Change Interrupts",
+	"Linearized SKBs",
+	"T/GSO Requests",
+	"Pause Frames Sent",
+	"Pause Frames Received",
+	"Internal Routing Errors",
+	"SQ SW Dropped SKBs",
+	"SQ Locked",
+	"SQ Full",
+	"Segmented TSO Requests",
+	"Rx Symbol Errors",
+	"Rx Jabber Errors",
+	"Rx Oversized Frames",
+	"Rx Short Frames",
+	"Endnode Rx Discards",
+	"Endnode Rx Octets",
+	"Endnode Rx Frames",
+	"Endnode Tx Octets",
+	"Endnode Tx Frames",
+	"mh detected",
+	"mh pauses",
+	"Retransmission Count",
+	"CM Connects",
+	"CM Accepts",
+	"Disconnects",
+	"Connected Events",
+	"Connect Requests",
+	"CM Rejects",
+	"ModifyQP Timeouts",
+	"CreateQPs",
+	"SW DestroyQPs",
+	"DestroyQPs",
+	"CM Closes",
+	"CM Packets Sent",
+	"CM Packets Bounced",
+	"CM Packets Created",
+	"CM Packets Rcvd",
+	"CM Packets Dropped",
+	"CM Packets Retrans",
+	"CM Listens Created",
+	"CM Listens Destroyed",
+	"CM Backlog Drops",
+	"CM Loopbacks",
+	"CM Nodes Created",
+	"CM Nodes Destroyed",
+	"CM Accel Drops",
+	"CM Resets Received",
+	"Timer Inits",
+	"CQ Depth 1",
+	"CQ Depth 4",
+	"CQ Depth 16",
+	"CQ Depth 24",
+	"CQ Depth 32",
+	"CQ Depth 128",
+	"CQ Depth 256",
+};
+
+
+/**
+ * nes_netdev_get_rx_csum
+ */
+static u32 nes_netdev_get_rx_csum (struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	if (nesvnic->rx_checksum_disabled)
+		return 0;
+	else
+		return 1;
+}
+
+
+/**
+ * nes_netdev_set_rc_csum
+ */
+static int nes_netdev_set_rx_csum(struct net_device *netdev, u32 enable)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	if (enable)
+		nesvnic->rx_checksum_disabled = 0;
+	else
+		nesvnic->rx_checksum_disabled = 1;
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_stats_count
+ */
+static int nes_netdev_get_stats_count(struct net_device *netdev)
+{
+	return NES_ETHTOOL_STAT_COUNT;
+}
+
+
+/**
+ * nes_netdev_get_strings
+ */
+static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset,
+		u8 *ethtool_strings)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(ethtool_strings,
+				&nes_ethtool_stringset,
+				sizeof(nes_ethtool_stringset));
+}
+
+
+/**
+ * nes_netdev_get_ethtool_stats
+ */
+static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
+		struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 nic_count;
+	u32 u32temp;
+
+	target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
+	target_stat_values[0] = nesvnic->nesdev->link_status_interrupts;
+	target_stat_values[1] = nesvnic->linearized_skbs;
+	target_stat_values[2] = nesvnic->tso_requests;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_pause_frames_sent += u32temp;
+	target_stat_values[3] = nesvnic->nesdev->mac_pause_frames_sent;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_pause_frames_received += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+	nesvnic->nesdev->port_rx_discards += u32temp;
+	nesvnic->netstats.rx_dropped += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+	nesvnic->nesdev->port_tx_discards += u32temp;
+	nesvnic->netstats.tx_dropped += u32temp;
+
+	for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) {
+		if (nesvnic->qp_nic_index[nic_count] == 0xf)
+			break;
+
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_DISCARD +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		nesvnic->netstats.rx_dropped += u32temp;
+		nesvnic->endnode_nstat_rx_discard += u32temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_rx_octets += u64temp;
+		nesvnic->netstats.rx_bytes += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_rx_frames += u64temp;
+		nesvnic->netstats.rx_packets += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_tx_octets += u64temp;
+		nesvnic->netstats.tx_bytes += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_tx_frames += u64temp;
+		nesvnic->netstats.tx_packets += u64temp;
+
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200));
+		nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
+	}
+
+	target_stat_values[4] = nesvnic->nesdev->mac_pause_frames_received;
+	target_stat_values[5] = nesdev->nesadapter->nic_rx_eth_route_err;
+	target_stat_values[6] = nesvnic->tx_sw_dropped;
+	target_stat_values[7] = nesvnic->sq_locked;
+	target_stat_values[8] = nesvnic->sq_full;
+	target_stat_values[9] = nesvnic->segmented_tso_requests;
+	target_stat_values[10] = nesvnic->nesdev->mac_rx_symbol_err_frames;
+	target_stat_values[11] = nesvnic->nesdev->mac_rx_jabber_frames;
+	target_stat_values[12] = nesvnic->nesdev->mac_rx_oversized_frames;
+	target_stat_values[13] = nesvnic->nesdev->mac_rx_short_frames;
+	target_stat_values[14] = nesvnic->endnode_nstat_rx_discard;
+	target_stat_values[15] = nesvnic->endnode_nstat_rx_octets;
+	target_stat_values[16] = nesvnic->endnode_nstat_rx_frames;
+	target_stat_values[17] = nesvnic->endnode_nstat_tx_octets;
+	target_stat_values[18] = nesvnic->endnode_nstat_tx_frames;
+	target_stat_values[19] = mh_detected;
+	target_stat_values[20] = mh_pauses_sent;
+	target_stat_values[21] = nesvnic->endnode_ipv4_tcp_retransmits;
+	target_stat_values[22] = atomic_read(&cm_connects);
+	target_stat_values[23] = atomic_read(&cm_accepts);
+	target_stat_values[24] = atomic_read(&cm_disconnects);
+	target_stat_values[25] = atomic_read(&cm_connecteds);
+	target_stat_values[26] = atomic_read(&cm_connect_reqs);
+	target_stat_values[27] = atomic_read(&cm_rejects);
+	target_stat_values[28] = atomic_read(&mod_qp_timouts);
+	target_stat_values[29] = atomic_read(&qps_created);
+	target_stat_values[30] = atomic_read(&sw_qps_destroyed);
+	target_stat_values[31] = atomic_read(&qps_destroyed);
+	target_stat_values[32] = atomic_read(&cm_closes);
+	target_stat_values[33] = cm_packets_sent;
+	target_stat_values[34] = cm_packets_bounced;
+	target_stat_values[35] = cm_packets_created;
+	target_stat_values[36] = cm_packets_received;
+	target_stat_values[37] = cm_packets_dropped;
+	target_stat_values[38] = cm_packets_retrans;
+	target_stat_values[39] = cm_listens_created;
+	target_stat_values[40] = cm_listens_destroyed;
+	target_stat_values[41] = cm_backlog_drops;
+	target_stat_values[42] = atomic_read(&cm_loopbacks);
+	target_stat_values[43] = atomic_read(&cm_nodes_created);
+	target_stat_values[44] = atomic_read(&cm_nodes_destroyed);
+	target_stat_values[45] = atomic_read(&cm_accel_dropped_pkts);
+	target_stat_values[46] = atomic_read(&cm_resets_recvd);
+	target_stat_values[47] = int_mod_timer_init;
+	target_stat_values[48] = int_mod_cq_depth_1;
+	target_stat_values[49] = int_mod_cq_depth_4;
+	target_stat_values[50] = int_mod_cq_depth_16;
+	target_stat_values[51] = int_mod_cq_depth_24;
+	target_stat_values[52] = int_mod_cq_depth_32;
+	target_stat_values[53] = int_mod_cq_depth_128;
+	target_stat_values[54] = int_mod_cq_depth_256;
+
+}
+
+
+/**
+ * nes_netdev_get_drvinfo
+ */
+static void nes_netdev_get_drvinfo(struct net_device *netdev,
+		struct ethtool_drvinfo *drvinfo)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	strcpy(drvinfo->driver, DRV_NAME);
+	strcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev));
+	strcpy(drvinfo->fw_version, "TBD");
+	strcpy(drvinfo->version, DRV_VERSION);
+	drvinfo->n_stats = nes_netdev_get_stats_count(netdev);
+	drvinfo->testinfo_len = 0;
+	drvinfo->eedump_len = 0;
+	drvinfo->regdump_len = 0;
+}
+
+
+/**
+ * nes_netdev_set_coalesce
+ */
+static int nes_netdev_set_coalesce(struct net_device *netdev,
+		struct ethtool_coalesce	*et_coalesce)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev =	nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock,	flags);
+	if (et_coalesce->rx_max_coalesced_frames_low) {
+		shared_timer->threshold_low	 = et_coalesce->rx_max_coalesced_frames_low;
+	}
+	if (et_coalesce->rx_max_coalesced_frames_irq) {
+		shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq;
+	}
+	if (et_coalesce->rx_max_coalesced_frames_high) {
+		shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high;
+	}
+	if (et_coalesce->rx_coalesce_usecs_low) {
+		shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low;
+	}
+	if (et_coalesce->rx_coalesce_usecs_high) {
+		shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high;
+	}
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+
+	/* using this to drive total interrupt moderation */
+	nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq;
+	if (et_coalesce->use_adaptive_rx_coalesce) {
+		nesadapter->et_use_adaptive_rx_coalesce	= 1;
+		nesadapter->timer_int_limit	= NES_TIMER_INT_LIMIT_DYNAMIC;
+		nesadapter->et_rx_coalesce_usecs_irq = 0;
+		if (et_coalesce->pkt_rate_low) {
+			nesadapter->et_pkt_rate_low	= et_coalesce->pkt_rate_low;
+		}
+	} else {
+		nesadapter->et_use_adaptive_rx_coalesce	= 0;
+		nesadapter->timer_int_limit	= NES_TIMER_INT_LIMIT;
+		if (nesadapter->et_rx_coalesce_usecs_irq) {
+			nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+					0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8)));
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_coalesce
+ */
+static int nes_netdev_get_coalesce(struct net_device *netdev,
+		struct ethtool_coalesce	*et_coalesce)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev =	nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct ethtool_coalesce	temp_et_coalesce;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	unsigned long flags;
+
+	memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce));
+	temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq;
+	temp_et_coalesce.use_adaptive_rx_coalesce =	nesadapter->et_use_adaptive_rx_coalesce;
+	temp_et_coalesce.rate_sample_interval =	nesadapter->et_rate_sample_interval;
+	temp_et_coalesce.pkt_rate_low =	nesadapter->et_pkt_rate_low;
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock,	flags);
+	temp_et_coalesce.rx_max_coalesced_frames_low =	shared_timer->threshold_low;
+	temp_et_coalesce.rx_max_coalesced_frames_irq =	shared_timer->threshold_target;
+	temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high;
+	temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min;
+	temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max;
+	if (nesadapter->et_use_adaptive_rx_coalesce) {
+		temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use;
+	}
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+	memcpy(et_coalesce,	&temp_et_coalesce, sizeof(*et_coalesce));
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_pauseparam
+ */
+static void nes_netdev_get_pauseparam(struct net_device *netdev,
+		struct ethtool_pauseparam *et_pauseparam)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	et_pauseparam->autoneg = 0;
+	et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0;
+	et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0;
+}
+
+
+/**
+ * nes_netdev_set_pauseparam
+ */
+static int nes_netdev_set_pauseparam(struct net_device *netdev,
+		struct ethtool_pauseparam *et_pauseparam)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 u32temp;
+
+	if (et_pauseparam->autoneg) {
+		/* TODO: should return unsupported */
+		return 0;
+	}
+	if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+		u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE + (nesdev->mac_index*0x200), u32temp);
+		nesdev->disable_tx_flow_control = 0;
+	} else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+		u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE + (nesdev->mac_index*0x200), u32temp);
+		nesdev->disable_tx_flow_control = 1;
+	}
+	if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+		u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+		nesdev->disable_rx_flow_control = 0;
+	} else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+		u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+		nesdev->disable_rx_flow_control = 1;
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_settings
+ */
+static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u16 phy_data;
+
+	et_cmd->duplex = DUPLEX_FULL;
+	et_cmd->port = PORT_MII;
+	if (nesadapter->OneG_Mode) {
+		et_cmd->supported = SUPPORTED_1000baseT_Full|SUPPORTED_Autoneg;
+		et_cmd->advertising = ADVERTISED_1000baseT_Full|ADVERTISED_Autoneg;
+		et_cmd->speed = SPEED_1000;
+		nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index],
+				&phy_data);
+		if (phy_data&0x1000) {
+			et_cmd->autoneg = AUTONEG_ENABLE;
+		} else {
+			et_cmd->autoneg = AUTONEG_DISABLE;
+		}
+		et_cmd->transceiver = XCVR_EXTERNAL;
+		et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index];
+	} else {
+		if (nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) {
+			et_cmd->transceiver = XCVR_EXTERNAL;
+			et_cmd->port = PORT_FIBRE;
+			et_cmd->supported = SUPPORTED_FIBRE;
+			et_cmd->advertising = ADVERTISED_FIBRE;
+			et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index];
+		} else {
+			et_cmd->transceiver = XCVR_INTERNAL;
+			et_cmd->supported = SUPPORTED_10000baseT_Full;
+			et_cmd->advertising = ADVERTISED_10000baseT_Full;
+			et_cmd->phy_address = nesdev->mac_index;
+		}
+		et_cmd->speed = SPEED_10000;
+		et_cmd->autoneg = AUTONEG_DISABLE;
+	}
+	et_cmd->maxtxpkt = 511;
+	et_cmd->maxrxpkt = 511;
+	return 0;
+}
+
+
+/**
+ * nes_netdev_set_settings
+ */
+static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u16 phy_data;
+
+	if (nesadapter->OneG_Mode) {
+		nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index],
+				&phy_data);
+		if (et_cmd->autoneg) {
+			/* Turn on Full duplex, Autoneg, and restart autonegotiation */
+			phy_data |= 0x1300;
+		} else {
+			// Turn off autoneg
+			phy_data &= ~0x1000;
+		}
+		nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index],
+				phy_data);
+	}
+
+	return 0;
+}
+
+
+static struct ethtool_ops nes_ethtool_ops = {
+	.get_link = ethtool_op_get_link,
+	.get_settings = nes_netdev_get_settings,
+	.set_settings = nes_netdev_set_settings,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.get_rx_csum = nes_netdev_get_rx_csum,
+	.get_sg = ethtool_op_get_sg,
+	.get_strings = nes_netdev_get_strings,
+	.get_stats_count = nes_netdev_get_stats_count,
+	.get_ethtool_stats = nes_netdev_get_ethtool_stats,
+	.get_drvinfo = nes_netdev_get_drvinfo,
+	.get_coalesce = nes_netdev_get_coalesce,
+	.set_coalesce = nes_netdev_set_coalesce,
+	.get_pauseparam = nes_netdev_get_pauseparam,
+	.set_pauseparam = nes_netdev_set_pauseparam,
+	.set_tx_csum = ethtool_op_set_tx_csum,
+	.set_rx_csum = nes_netdev_set_rx_csum,
+	.set_sg = ethtool_op_set_sg,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = ethtool_op_set_tso,
+};
+
+
+static void nes_netdev_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 u32temp;
+
+	nesvnic->vlan_grp = grp;
+
+	/* Enable/Disable VLAN Stripping */
+	u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG);
+	if (grp)
+		u32temp &= 0xfdffffff;
+	else
+		u32temp	|= 0x02000000;
+
+	nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp);
+}
+
+
+/**
+ * nes_netdev_init - initialize network device
+ */
+struct net_device *nes_netdev_init(struct nes_device *nesdev,
+		void __iomem *mmio_addr)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = NULL;
+	struct net_device *netdev;
+	struct nic_qp_map *curr_qp_map;
+	u32 u32temp;
+	u16 phy_data;
+	u16 temp_phy_data;
+
+	netdev = alloc_etherdev(sizeof(struct nes_vnic));
+	if (!netdev) {
+		printk(KERN_ERR PFX "nesvnic etherdev alloc failed");
+		return NULL;
+	}
+
+	nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
+
+	SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev);
+
+	nesvnic = netdev_priv(netdev);
+	memset(nesvnic, 0, sizeof(*nesvnic));
+
+	netdev->open = nes_netdev_open;
+	netdev->stop = nes_netdev_stop;
+	netdev->hard_start_xmit = nes_netdev_start_xmit;
+	netdev->get_stats = nes_netdev_get_stats;
+	netdev->tx_timeout = nes_netdev_tx_timeout;
+	netdev->set_mac_address = nes_netdev_set_mac_address;
+	netdev->set_multicast_list = nes_netdev_set_multicast_list;
+	netdev->change_mtu = nes_netdev_change_mtu;
+	netdev->watchdog_timeo = NES_TX_TIMEOUT;
+	netdev->irq = nesdev->pcidev->irq;
+	netdev->mtu = ETH_DATA_LEN;
+	netdev->hard_header_len = ETH_HLEN;
+	netdev->addr_len = ETH_ALEN;
+	netdev->type = ARPHRD_ETHER;
+	netdev->features = NETIF_F_HIGHDMA;
+	netdev->ethtool_ops = &nes_ethtool_ops;
+	netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
+	nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
+	netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
+	netdev->vlan_rx_register = nes_netdev_vlan_rx_register;
+	netdev->features |= NETIF_F_LLTX;
+
+	/* Fill in the port structure */
+	nesvnic->netdev = netdev;
+	nesvnic->nesdev = nesdev;
+	nesvnic->msg_enable = netif_msg_init(debug, default_msg);
+	nesvnic->netdev_index = nesdev->netdev_count;
+	nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count;
+	nesvnic->max_frame_size = netdev->mtu+netdev->hard_header_len;
+
+	curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)];
+	nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid;
+	nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index;
+	nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port;
+
+	/* Setup the burned in MAC address */
+	u64temp = (u64)nesdev->nesadapter->mac_addr_low;
+	u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32;
+	u64temp += nesvnic->nic_index;
+	netdev->dev_addr[0] = (u8)(u64temp>>40);
+	netdev->dev_addr[1] = (u8)(u64temp>>32);
+	netdev->dev_addr[2] = (u8)(u64temp>>24);
+	netdev->dev_addr[3] = (u8)(u64temp>>16);
+	netdev->dev_addr[4] = (u8)(u64temp>>8);
+	netdev->dev_addr[5] = (u8)u64temp;
+	memcpy(netdev->perm_addr, netdev->dev_addr, 6);
+
+	if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) {
+		netdev->features |= NETIF_F_TSO | NETIF_F_SG | NETIF_F_IP_CSUM;
+		netdev->features |= NETIF_F_GSO | NETIF_F_TSO | NETIF_F_SG | NETIF_F_IP_CSUM;
+	} else {
+		netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
+	}
+
+	nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d,"
+			" nic_index = %d, logical_port = %d, mac_index = %d.\n",
+			nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id,
+			nesvnic->nic_index, nesvnic->logical_port,  nesdev->mac_index);
+
+	if (nesvnic->nesdev->nesadapter->port_count == 1) {
+		nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+		nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1;
+		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) {
+			nesvnic->qp_nic_index[2] = 0xf;
+			nesvnic->qp_nic_index[3] = 0xf;
+		} else {
+			nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2;
+			nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3;
+		}
+	} else {
+		if (nesvnic->nesdev->nesadapter->port_count == 2) {
+			nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+			nesvnic->qp_nic_index[1] = nesvnic->nic_index + 2;
+			nesvnic->qp_nic_index[2] = 0xf;
+			nesvnic->qp_nic_index[3] = 0xf;
+		} else {
+			nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+			nesvnic->qp_nic_index[1] = 0xf;
+			nesvnic->qp_nic_index[2] = 0xf;
+			nesvnic->qp_nic_index[3] = 0xf;
+		}
+	}
+	nesvnic->next_qp_nic_index = 0;
+
+	if (nesdev->netdev_count == 0) {
+		nesvnic->rdma_enabled = 1;
+	} else {
+		nesvnic->rdma_enabled = 0;
+	}
+	nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id;
+	spin_lock_init(&nesvnic->tx_lock);
+	nesdev->netdev[nesdev->netdev_count] = netdev;
+
+	nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n",
+			nesvnic, nesdev->mac_index);
+	list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]);
+
+	if ((nesdev->netdev_count == 0) &&
+			(PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index)) {
+		nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n",
+				NES_IDX_PHY_PCS_CONTROL_STATUS0+(0x200*(nesvnic->logical_port&1)));
+		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+				(0x200*(nesvnic->logical_port&1)));
+		u32temp |= 0x00200000;
+		nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+				(0x200*(nesvnic->logical_port&1)), u32temp);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+				(0x200*(nesvnic->logical_port&1)) );
+		if ((u32temp&0x0f1f0000) == 0x0f0f0000) {
+			if (nesdev->nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) {
+				nes_init_phy(nesdev);
+				nes_read_10G_phy_reg(nesdev, 1,
+						nesdev->nesadapter->phy_index[nesvnic->logical_port]);
+				temp_phy_data = (u16)nes_read_indexed(nesdev,
+									NES_IDX_MAC_MDIO_CONTROL);
+				u32temp = 20;
+				do {
+					nes_read_10G_phy_reg(nesdev, 1,
+							nesdev->nesadapter->phy_index[nesvnic->logical_port]);
+					phy_data = (u16)nes_read_indexed(nesdev,
+									NES_IDX_MAC_MDIO_CONTROL);
+					if ((phy_data == temp_phy_data) || (!(--u32temp)))
+						break;
+					temp_phy_data = phy_data;
+				} while (1);
+				if (phy_data & 4) {
+					nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
+					nesvnic->linkup = 1;
+				} else {
+					nes_debug(NES_DBG_INIT, "The Link is DOWN!!.\n");
+				}
+			} else {
+				nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
+				nesvnic->linkup = 1;
+			}
+		}
+		nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n");
+		/* clear the MAC interrupt status, assumes direct logical to physical mapping */
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS+(0x200*nesvnic->logical_port));
+		nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
+		nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS+(0x200*nesvnic->logical_port), u32temp);
+
+		if (nesdev->nesadapter->phy_type[nesvnic->logical_port] != NES_PHY_TYPE_IRIS)
+			nes_init_phy(nesdev);
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesvnic->logical_port),
+				~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
+				NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
+	}
+
+	return netdev;
+}
+
+
+/**
+ * nes_netdev_destroy - destroy network device structure
+ */
+void nes_netdev_destroy(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	/* make sure 'stop' method is called by Linux stack */
+	/* nes_netdev_stop(netdev); */
+
+	list_del(&nesvnic->list);
+
+	if (nesvnic->of_device_registered) {
+		nes_destroy_ofa_device(nesvnic->nesibdev);
+	}
+
+	free_netdev(netdev);
+}
+
+
+/**
+ * nes_nic_cm_xmit -- CM calls this to send out pkts
+ */
+int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	int ret;
+
+	skb->dev = netdev;
+	ret = dev_queue_xmit(skb);
+	if (ret) {
+		nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret);
+	}
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h
new file mode 100644
index 0000000..e64306b
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_user.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect.  All rights reserved.
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_USER_H
+#define NES_USER_H
+
+#include <linux/types.h>
+
+#define NES_ABI_USERSPACE_VER 1
+#define NES_ABI_KERNEL_VER    1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct nes_alloc_ucontext_req {
+	__u32 reserved32;
+	__u8  userspace_ver;
+	__u8  reserved8[3];
+};
+
+struct nes_alloc_ucontext_resp {
+	__u32 max_pds; /* maximum pds allowed for this user process */
+	__u32 max_qps; /* maximum qps allowed for this user process */
+	__u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */
+	__u8  virtwq;  /* flag to indicate if virtual WQ are to be used or not */
+	__u8  kernel_ver;
+	__u8  reserved[2];
+};
+
+struct nes_alloc_pd_resp {
+	__u32 pd_id;
+	__u32 mmap_db_index;
+};
+
+struct nes_create_cq_req {
+	__u64 user_cq_buffer;
+	__u32 mcrqf;
+	__u8 reserved[4];
+};
+
+struct nes_create_qp_req {
+	__u64 user_wqe_buffers;
+};
+
+enum iwnes_memreg_type {
+	IWNES_MEMREG_TYPE_MEM = 0x0000,
+	IWNES_MEMREG_TYPE_QP = 0x0001,
+	IWNES_MEMREG_TYPE_CQ = 0x0002,
+	IWNES_MEMREG_TYPE_MW = 0x0003,
+	IWNES_MEMREG_TYPE_FMR = 0x0004,
+};
+
+struct nes_mem_reg_req {
+	__u32 reg_type;	/* indicates if id is memory, QP or CQ */
+	__u32 reserved;
+};
+
+struct nes_create_cq_resp {
+	__u32 cq_id;
+	__u32 cq_size;
+	__u32 mmap_db_index;
+	__u32 reserved;
+};
+
+struct nes_create_qp_resp {
+	__u32 qp_id;
+	__u32 actual_sq_size;
+	__u32 actual_rq_size;
+	__u32 mmap_sq_db_index;
+	__u32 mmap_rq_db_index;
+	__u32 nes_drv_opt;
+};
+
+#endif				/* NES_USER_H */
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
new file mode 100644
index 0000000..c4ec6ac
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include "nes.h"
+
+
+
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset);
+
+u32 mh_detected;
+u32 mh_pauses_sent;
+
+/**
+ * nes_read_eeprom_values -
+ */
+int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter)
+{
+	u32 mac_addr_low;
+	u16 mac_addr_high;
+	u16 eeprom_data;
+	u16 eeprom_offset;
+	u16 next_section_address;
+	u16 sw_section_ver;
+	u8  major_ver = 0;
+	u8  minor_ver = 0;
+
+	/* TODO: deal with EEPROM endian issues */
+	if (nesadapter->firmware_eeprom_offset == 0) {
+		/* Read the EEPROM Parameters */
+		eeprom_data = nes_read16_eeprom(nesdev->regs, 0);
+		nes_debug(NES_DBG_HW, "EEPROM Offset 0  = 0x%04X\n", eeprom_data);
+		eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) <<
+				((eeprom_data & 0x0080) >> 7));
+		nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset);
+		nesadapter->firmware_eeprom_offset = eeprom_offset;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+		if (eeprom_data != 0x5746) {
+			nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data);
+			return -1;
+		}
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8);
+		nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset);
+		nesadapter->software_eeprom_offset = eeprom_offset;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+		if (eeprom_data != 0x5753) {
+			printk("Not a valid Software Image = 0x%04X\n", eeprom_data);
+			return -1;
+		}
+		sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset  + 6);
+		nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n",
+				sw_section_ver);
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+				((eeprom_data & 0x0100) >> 8));
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x414d) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+				((eeprom_data & 0x0100) >> 8));
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x4f52) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x5746) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x5753) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x414d) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x464e) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8);
+		printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
+		major_ver = (u8)(eeprom_data >> 8);
+		minor_ver = (u8)(eeprom_data);
+
+		if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) {
+			nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n");
+		} else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) {
+			nesadapter->virtwq = 1;
+		}
+		nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8))  <<  16) +
+				(u32)((u8)eeprom_data);
+
+no_fw_rev:
+		/* eeprom is valid */
+		eeprom_offset = nesadapter->software_eeprom_offset;
+		eeprom_offset += 8;
+		nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_low <<= 16;
+		mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n",
+				mac_addr_high, mac_addr_low);
+		nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max);
+
+		nesadapter->mac_addr_low = mac_addr_low;
+		nesadapter->mac_addr_high = mac_addr_high;
+
+		/* Read the Phy Type array */
+		eeprom_offset += 10;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->phy_type[0] = (u8)(eeprom_data >> 8);
+		nesadapter->phy_type[1] = (u8)eeprom_data;
+
+		/* Read the port array */
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->phy_type[2] = (u8)(eeprom_data >> 8);
+		nesadapter->phy_type[3] = (u8)eeprom_data;
+		/* port_count is set by soft reset reg */
+		nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u,"
+				" port 2 -> %u, port 3 -> %u\n",
+				nesadapter->port_count,
+				nesadapter->phy_type[0], nesadapter->phy_type[1],
+				nesadapter->phy_type[2], nesadapter->phy_type[3]);
+
+		/* Read PD config array */
+		eeprom_offset += 10;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[0] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[0] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[1] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[1] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[2] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[2] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[3] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[3] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]);
+
+		/* Read Rx Pool Size */
+		eeprom_offset += 22;   /* 46 */
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->rx_threshold = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n",
+				nesadapter->tcp_timer_core_clk_divisor);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->iwarp_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->cm_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->wqm_wat = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->core_clock = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock);
+
+		if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) {
+			eeprom_offset += 2;
+			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+			nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8;
+			nesadapter->phy_index[1] = eeprom_data & 0x00ff;
+			eeprom_offset += 2;
+			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+			nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8;
+			nesadapter->phy_index[3] = eeprom_data & 0x00ff;
+		} else {
+			nesadapter->phy_index[0] = 4;
+			nesadapter->phy_index[1] = 5;
+			nesadapter->phy_index[2] = 6;
+			nesadapter->phy_index[3] = 7;
+		}
+		nes_debug(NES_DBG_HW, "Phy address map = 0 > %u,  1 > %u, 2 > %u, 3 > %u\n",
+			   nesadapter->phy_index[0],nesadapter->phy_index[1],
+			   nesadapter->phy_index[2],nesadapter->phy_index[3]);
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_read16_eeprom
+ */
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
+{
+	writel(NES_EEPROM_READ_REQUEST + (offset >> 1),
+			(void __iomem *)addr + NES_EEPROM_COMMAND);
+
+	do {
+	} while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) &
+			NES_EEPROM_READ_REQUEST);
+
+	return readw((void __iomem *)addr + NES_EEPROM_DATA);
+}
+
+
+/**
+ * nes_write_1G_phy_reg
+ */
+void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 u32temp;
+	u32 counter;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+
+/**
+ * nes_read_1G_phy_reg
+ * This routine only issues the read, the data must be read
+ * separately.
+ */
+void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 u32temp;
+	u32 counter;
+	unsigned long flags;
+
+	/* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
+			phy_addr, nesdev->mac_index); */
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1)) {
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+		*data = 0xffff;
+	} else {
+		*data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	}
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+
+/**
+ * nes_write_10G_phy_reg
+ */
+void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg,
+		u8 phy_addr, u16 data)
+{
+	u32 dev_addr;
+	u32 port_addr;
+	u32 u32temp;
+	u32 counter;
+
+	dev_addr = 1;
+	port_addr = phy_addr;
+
+	/* set address */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+
+	/* set data */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+}
+
+
+/**
+ * nes_read_10G_phy_reg
+ * This routine only issues the read, the data must be read
+ * separately.
+ */
+void nes_read_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, u8 phy_addr)
+{
+	u32 dev_addr;
+	u32 port_addr;
+	u32 u32temp;
+	u32 counter;
+
+	dev_addr = 1;
+	port_addr = phy_addr;
+
+	/* set address */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+
+	/* issue read */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+}
+
+
+/**
+ * nes_get_cqp_request
+ */
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_cqp_request *cqp_request = NULL;
+
+	if (!list_empty(&nesdev->cqp_avail_reqs)) {
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+		cqp_request = list_entry(nesdev->cqp_avail_reqs.next,
+				struct nes_cqp_request, list);
+		list_del_init(&cqp_request->list);
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	} else {
+		cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_KERNEL);
+		if (cqp_request) {
+			cqp_request->dynamic = 1;
+			INIT_LIST_HEAD(&cqp_request->list);
+		}
+	}
+
+	if (cqp_request) {
+		init_waitqueue_head(&cqp_request->waitq);
+		cqp_request->waiting = 0;
+		cqp_request->request_done = 0;
+		cqp_request->callback = 0;
+		init_waitqueue_head(&cqp_request->waitq);
+		nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n",
+				cqp_request);
+	} else
+		printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n",
+			   __FUNCTION__);
+
+	return cqp_request;
+}
+
+
+/**
+ * nes_post_cqp_request
+ */
+void nes_post_cqp_request(struct nes_device *nesdev,
+		struct nes_cqp_request *cqp_request, int ring_doorbell)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	unsigned long flags;
+	u32 cqp_head;
+	u64 u64temp;
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) &
+			(nesdev->cqp.sq_size - 1)) != 1)
+			&& (list_empty(&nesdev->cqp_pending_reqs))) {
+		cqp_head = nesdev->cqp.sq_head++;
+		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+		barrier();
+		u64temp = (unsigned long)cqp_request;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_SCRATCH_LOW_IDX,
+				    u64temp);
+		nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ,"
+				" request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u,"
+				" waiting = %d, refcount = %d.\n",
+				le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
+				le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request,
+				nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
+				cqp_request->waiting, atomic_read(&cqp_request->refcount));
+		barrier();
+		if (ring_doorbell) {
+			/* Ring doorbell (1 WQEs) */
+			nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+		}
+
+		barrier();
+	} else {
+		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X"
+				" put on the pending queue.\n",
+				cqp_request,
+				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
+				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX]));
+		list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs);
+	}
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	return;
+}
+
+
+/**
+ * nes_arp_table
+ */
+int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	int arp_index;
+	int err = 0;
+
+	for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) {
+		if (nesadapter->arp_table[arp_index].ip_addr == ip_addr)
+			break;
+	}
+
+	if (action == NES_ARP_ADD) {
+		if (arp_index != nesadapter->arp_table_size) {
+			return -1;
+		}
+
+		arp_index = 0;
+		err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps,
+				nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index);
+		if (err) {
+			nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err);
+			return err;
+		}
+		nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index);
+
+		nesadapter->arp_table[arp_index].ip_addr = ip_addr;
+		memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN);
+		return arp_index;
+	}
+
+	/* DELETE or RESOLVE */
+	if (arp_index == nesadapter->arp_table_size) {
+		nes_debug(NES_DBG_NETDEV, "mac address not in ARP table - cannot delete or resolve\n");
+		return -1;
+	}
+
+	if (action == NES_ARP_RESOLVE) {
+		nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index);
+		return arp_index;
+	}
+
+	if (action == NES_ARP_DELETE) {
+		nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
+		nesadapter->arp_table[arp_index].ip_addr = 0;
+		memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN);
+		nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
+		return arp_index;
+	}
+
+	return -1;
+}
+
+
+/**
+ * nes_mh_fix
+ */
+void nes_mh_fix(unsigned long parm)
+{
+	unsigned long flags;
+	struct nes_device *nesdev = (struct nes_device *)parm;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_vnic *nesvnic;
+	u32 used_chunks_tx;
+	u32 temp_used_chunks_tx;
+	u32 temp_last_used_chunks_tx;
+	u32 used_chunks_mask;
+	u32 mac_tx_frames_low;
+	u32 mac_tx_frames_high;
+	u32 mac_tx_pauses;
+	u32 serdes_status;
+	u32 reset_value;
+	u32 tx_control;
+	u32 tx_config;
+	u32 tx_pause_quanta;
+	u32 rx_control;
+	u32 rx_config;
+	u32 mac_exact_match;
+	u32 mpp_debug;
+	u32 i=0;
+	u32 chunks_tx_progress = 0;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+	if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) {
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+		goto no_mh_work;
+	}
+	nesadapter->mac_sw_state[0] = NES_MAC_SW_MH;
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+	do {
+		mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW);
+		mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH);
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX);
+		nesdev->mac_pause_frames_sent += mac_tx_pauses;
+		used_chunks_mask = 0;
+		temp_used_chunks_tx = used_chunks_tx;
+		temp_last_used_chunks_tx = nesdev->last_used_chunks_tx;
+
+		if (nesdev->netdev[0]) {
+			nesvnic = netdev_priv(nesdev->netdev[0]);
+		} else {
+			break;
+		}
+
+		for (i=0; i<4; i++) {
+			used_chunks_mask <<= 8;
+			if (nesvnic->qp_nic_index[i] != 0xff) {
+				used_chunks_mask |= 0xff;
+				if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) {
+					chunks_tx_progress = 1;
+				}
+			}
+			temp_used_chunks_tx >>= 8;
+			temp_last_used_chunks_tx >>= 8;
+		}
+		if ((mac_tx_frames_low) || (mac_tx_frames_high) ||
+			(!(used_chunks_tx&used_chunks_mask)) ||
+			(!(nesdev->last_used_chunks_tx&used_chunks_mask)) ||
+			(chunks_tx_progress) ) {
+			nesdev->last_used_chunks_tx = used_chunks_tx;
+			break;
+		}
+		nesdev->last_used_chunks_tx = used_chunks_tx;
+		barrier();
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005);
+		mh_pauses_sent++;
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		if (mac_tx_pauses) {
+			nesdev->mac_pause_frames_sent += mac_tx_pauses;
+			break;
+		}
+
+		tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL);
+		tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+		tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA);
+		rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL);
+		rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG);
+		mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM);
+		mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG);
+
+		/* one last ditch effort to avoid a false positive */
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		if (mac_tx_pauses) {
+			nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent;
+			nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n");
+			break;
+		}
+		mh_detected++;
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000);
+		reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d);
+
+		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+				& 0x00000040) != 0x00000040) && (i++ < 5000)) {
+			/* mdelay(1); */
+		}
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+		serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0);
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+		if (nesadapter->OneG_Mode) {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+		} else {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+		}
+		serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta);
+		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control);
+		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config);
+		nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match);
+		nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug);
+
+	} while (0);
+
+	nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE;
+no_mh_work:
+	nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5);
+	add_timer(&nesdev->nesadapter->mh_timer);
+}
+
+/**
+ * nes_clc
+ */
+void nes_clc(unsigned long parm)
+{
+	unsigned long flags;
+	struct nes_device *nesdev = (struct nes_device *)parm;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+    nesadapter->link_interrupt_count[0] = 0;
+    nesadapter->link_interrupt_count[1] = 0;
+    nesadapter->link_interrupt_count[2] = 0;
+    nesadapter->link_interrupt_count[3] = 0;
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
+	add_timer(&nesadapter->lc_timer);
+}
+
+
+/**
+ * nes_dump_mem
+ */
+void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
+{
+	char  xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+		'a', 'b', 'c', 'd', 'e', 'f'};
+	char  *ptr;
+	char  hex_buf[80];
+	char  ascii_buf[20];
+	int   num_char;
+	int   num_ascii;
+	int   num_hex;
+
+	if (!(nes_debug_level & dump_debug_level)) {
+		return;
+	}
+
+	ptr = addr;
+	if (length > 0x100) {
+		nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
+		length = 0x100;
+	}
+	nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length);
+
+	memset(ascii_buf, 0, 20);
+	memset(hex_buf, 0, 80);
+
+	num_ascii = 0;
+	num_hex = 0;
+	for (num_char = 0; num_char < length; num_char++) {
+		if (num_ascii == 8) {
+			ascii_buf[num_ascii++] = ' ';
+			hex_buf[num_hex++] = '-';
+			hex_buf[num_hex++] = ' ';
+		}
+
+		if (*ptr < 0x20 || *ptr > 0x7e)
+			ascii_buf[num_ascii++] = '.';
+		else
+			ascii_buf[num_ascii++] = *ptr;
+		hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)];
+		hex_buf[num_hex++] = xlate[*ptr & 0x0f];
+		hex_buf[num_hex++] = ' ';
+		ptr++;
+
+		if (num_ascii >= 17) {
+			/* output line and reset */
+			nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
+			memset(ascii_buf, 0, 20);
+			memset(hex_buf, 0, 80);
+			num_ascii = 0;
+			num_hex = 0;
+		}
+	}
+
+	/* output the rest */
+	if (num_ascii) {
+		while (num_ascii < 17) {
+			if (num_ascii == 8) {
+				hex_buf[num_hex++] = ' ';
+				hex_buf[num_hex++] = ' ';
+			}
+			hex_buf[num_hex++] = ' ';
+			hex_buf[num_hex++] = ' ';
+			hex_buf[num_hex++] = ' ';
+			num_ascii++;
+		}
+
+		nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
+	}
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
new file mode 100644
index 0000000..ffd4b42
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -0,0 +1,3917 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/highmem.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "nes.h"
+
+#include <rdma/ib_umem.h>
+
+atomic_t mod_qp_timouts;
+atomic_t qps_created;
+atomic_t sw_qps_destroyed;
+
+
+/**
+ * nes_alloc_mw
+ */
+static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd) {
+	unsigned long flags;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_cqp_request *cqp_request;
+	struct nes_mr *nesmr;
+	struct ib_mw *ibmw;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	int ret;
+	u32 stag;
+	u32 stag_index = 0;
+	u32 next_stag_index = 0;
+	u32 driver_key = 0;
+	u8 stag_key = 0;
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+
+	driver_key = 0;
+
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+
+	ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+			nesadapter->max_mr, &stag_index, &next_stag_index);
+	if (ret) {
+		return ERR_PTR(ret);
+	}
+
+	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+	if (!nesmr) {
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
+			stag, stag_index);
+
+	/* Register the region with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		kfree(nesmr);
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+			cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ |
+			NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO |
+			NES_CQP_STAG_REM_ACC_EN);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			stag, ret, cqp_request->major_code, cqp_request->minor_code);
+	if ((!ret) || (cqp_request->major_code)) {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		kfree(nesmr);
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		if (!ret) {
+			return ERR_PTR(-ETIME);
+		} else {
+			return ERR_PTR(-ENOMEM);
+		}
+	} else {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+
+	nesmr->ibmw.rkey = stag;
+	nesmr->mode = IWNES_MEMREG_TYPE_MW;
+	ibmw = &nesmr->ibmw;
+	nesmr->pbl_4k = 0;
+	nesmr->pbls_used = 0;
+
+	return ibmw;
+}
+
+
+/**
+ * nes_dealloc_mw
+ */
+static int nes_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct nes_mr *nesmr = to_nesmw(ibmw);
+	struct nes_vnic *nesvnic = to_nesvnic(ibmw->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	int err = 0;
+	unsigned long flags;
+	int ret;
+
+	/* Deallocate the window with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
+			ibmw->rkey);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			ret, cqp_request->major_code, cqp_request->minor_code);
+	if ((!ret) || (cqp_request->major_code)) {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		if (!ret) {
+			err = -ETIME;
+		} else {
+			err = -EIO;
+		}
+	} else {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+			(ibmw->rkey & 0x0fffff00) >> 8);
+	kfree(nesmr);
+
+	return err;
+}
+
+
+/**
+ * nes_bind_mw
+ */
+static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
+		struct ib_mw_bind *ibmw_bind)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* struct nes_mr *nesmr = to_nesmw(ibmw); */
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	unsigned long flags = 0;
+	u32 head;
+	u32 wqe_misc = 0;
+	u32 qsize;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS)
+		return -EINVAL;
+
+		spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.sq_head;
+	qsize = nesqp->hwqp.sq_tail;
+
+	/* Check for SQ overflow */
+	if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+		return -EINVAL;
+	}
+
+	wqe = &nesqp->hwqp.sq_vbase[head];
+	/* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */
+	nes_fill_init_qp_wqe(wqe, nesqp, head);
+	u64temp = ibmw_bind->wr_id;
+	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp);
+	wqe_misc = NES_IWARP_SQ_OP_BIND;
+
+	wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+	if (ibmw_bind->send_flags & IB_SEND_SIGNALED)
+		wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+
+	if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_WRITE) {
+		wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE;
+	}
+	if (ibmw_bind->mw_access_flags & IB_ACCESS_REMOTE_READ) {
+		wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ;
+	}
+
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, ibmw_bind->mr->lkey);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX,
+			ibmw_bind->length);
+	wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0;
+	u64temp = (u64)ibmw_bind->addr;
+	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp);
+
+	head++;
+	if (head >= qsize)
+		head = 0;
+
+	nesqp->hwqp.sq_head = head;
+	barrier();
+
+	nes_write32(nesdev->regs+NES_WQE_ALLOC,
+			(1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	return 0;
+}
+
+
+/**
+ * nes_alloc_fmr
+ */
+static struct ib_fmr *nes_alloc_fmr(struct ib_pd *ibpd,
+		int ibmr_access_flags,
+		struct ib_fmr_attr *ibfmr_attr)
+{
+	unsigned long flags;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_fmr *nesfmr;
+	struct nes_cqp_request *cqp_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	int ret;
+	u32 stag;
+	u32 stag_index = 0;
+	u32 next_stag_index = 0;
+	u32 driver_key = 0;
+	u32 opcode = 0;
+	u8 stag_key = 0;
+	int i=0;
+	struct nes_vpbl vpbl;
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+
+	driver_key = 0;
+
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+
+	ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+			nesadapter->max_mr, &stag_index, &next_stag_index);
+	if (ret) {
+		goto failed_resource_alloc;
+	}
+
+	nesfmr = kzalloc(sizeof(*nesfmr), GFP_KERNEL);
+	if (!nesfmr) {
+		ret = -ENOMEM;
+		goto failed_fmr_alloc;
+	}
+
+	nesfmr->nesmr.mode = IWNES_MEMREG_TYPE_FMR;
+	if (ibfmr_attr->max_pages == 1) {
+		/* use zero length PBL */
+		nesfmr->nesmr.pbl_4k = 0;
+		nesfmr->nesmr.pbls_used = 0;
+	} else if (ibfmr_attr->max_pages <= 32) {
+		/* use PBL 256 */
+		nesfmr->nesmr.pbl_4k = 0;
+		nesfmr->nesmr.pbls_used = 1;
+	} else if (ibfmr_attr->max_pages <= 512) {
+		/* use 4K PBLs */
+		nesfmr->nesmr.pbl_4k = 1;
+		nesfmr->nesmr.pbls_used = 1;
+	} else {
+		/* use two level 4K PBLs */
+		/* add support for two level 256B PBLs */
+		nesfmr->nesmr.pbl_4k = 1;
+		nesfmr->nesmr.pbls_used = 1 + (ibfmr_attr->max_pages >> 9) +
+				((ibfmr_attr->max_pages & 511) ? 1 : 0);
+	}
+	/* Register the region with the adapter */
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+	/* track PBL resources */
+	if (nesfmr->nesmr.pbls_used != 0) {
+		if (nesfmr->nesmr.pbl_4k) {
+			if (nesfmr->nesmr.pbls_used > nesadapter->free_4kpbl) {
+				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				ret = -ENOMEM;
+				goto failed_vpbl_alloc;
+			} else {
+				nesadapter->free_4kpbl -= nesfmr->nesmr.pbls_used;
+			}
+		} else {
+			if (nesfmr->nesmr.pbls_used > nesadapter->free_256pbl) {
+				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				ret = -ENOMEM;
+				goto failed_vpbl_alloc;
+			} else {
+				nesadapter->free_256pbl -= nesfmr->nesmr.pbls_used;
+			}
+		}
+	}
+
+	/* one level pbl */
+	if (nesfmr->nesmr.pbls_used == 0) {
+		nesfmr->root_vpbl.pbl_vbase = NULL;
+		nes_debug(NES_DBG_MR,  "zero level pbl \n");
+	} else if (nesfmr->nesmr.pbls_used == 1) {
+		/* can change it to kmalloc & dma_map_single */
+		nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+				&nesfmr->root_vpbl.pbl_pbase);
+		if (!nesfmr->root_vpbl.pbl_vbase) {
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+			ret = -ENOMEM;
+			goto failed_vpbl_alloc;
+		}
+		nesfmr->leaf_pbl_cnt = 0;
+		nes_debug(NES_DBG_MR, "one level pbl, root_vpbl.pbl_vbase=%p \n",
+				nesfmr->root_vpbl.pbl_vbase);
+	}
+	/* two level pbl */
+	else {
+		nesfmr->root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
+				&nesfmr->root_vpbl.pbl_pbase);
+		if (!nesfmr->root_vpbl.pbl_vbase) {
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+			ret = -ENOMEM;
+			goto failed_vpbl_alloc;
+		}
+
+		nesfmr->root_vpbl.leaf_vpbl = kzalloc(sizeof(*nesfmr->root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
+		if (!nesfmr->root_vpbl.leaf_vpbl) {
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+			ret = -ENOMEM;
+			goto failed_leaf_vpbl_alloc;
+		}
+
+		nesfmr->leaf_pbl_cnt = nesfmr->nesmr.pbls_used-1;
+		nes_debug(NES_DBG_MR, "two level pbl, root_vpbl.pbl_vbase=%p"
+				" leaf_pbl_cnt=%d root_vpbl.leaf_vpbl=%p\n",
+				nesfmr->root_vpbl.pbl_vbase, nesfmr->leaf_pbl_cnt, nesfmr->root_vpbl.leaf_vpbl);
+
+		for (i=0; i<nesfmr->leaf_pbl_cnt; i++)
+			nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase = NULL;
+
+		for (i=0; i<nesfmr->leaf_pbl_cnt; i++) {
+			vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+					&vpbl.pbl_pbase);
+
+			if (!vpbl.pbl_vbase) {
+				ret = -ENOMEM;
+				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				goto failed_leaf_vpbl_pages_alloc;
+			}
+
+			nesfmr->root_vpbl.pbl_vbase[i].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
+			nesfmr->root_vpbl.pbl_vbase[i].pa_high = cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
+			nesfmr->root_vpbl.leaf_vpbl[i] = vpbl;
+
+			nes_debug(NES_DBG_MR, "pbase_low=0x%x, pbase_high=0x%x, vpbl=%p\n",
+					nesfmr->root_vpbl.pbl_vbase[i].pa_low,
+					nesfmr->root_vpbl.pbl_vbase[i].pa_high,
+					&nesfmr->root_vpbl.leaf_vpbl[i]);
+		}
+	}
+	nesfmr->ib_qp = NULL;
+	nesfmr->access_rights =0;
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		ret = -ENOMEM;
+		goto failed_leaf_vpbl_pages_alloc;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
+			stag, stag_index);
+
+	opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
+
+	if (nesfmr->nesmr.pbl_4k == 1)
+		opcode |= NES_CQP_STAG_PBL_BLK_SIZE;
+
+	if (ibmr_access_flags & IB_ACCESS_REMOTE_WRITE) {
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE |
+				NES_CQP_STAG_RIGHTS_LOCAL_WRITE | NES_CQP_STAG_REM_ACC_EN;
+		nesfmr->access_rights |=
+				NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_RIGHTS_LOCAL_WRITE |
+				NES_CQP_STAG_REM_ACC_EN;
+	}
+
+	if (ibmr_access_flags & IB_ACCESS_REMOTE_READ) {
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ |
+				NES_CQP_STAG_RIGHTS_LOCAL_READ | NES_CQP_STAG_REM_ACC_EN;
+		nesfmr->access_rights |=
+				NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_RIGHTS_LOCAL_READ |
+				NES_CQP_STAG_REM_ACC_EN;
+	}
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] =
+			cpu_to_le32((nesfmr->nesmr.pbls_used>1) ?
+			(nesfmr->nesmr.pbls_used-1) : nesfmr->nesmr.pbls_used);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			stag, ret, cqp_request->major_code, cqp_request->minor_code);
+
+	if ((!ret) || (cqp_request->major_code)) {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		ret = (!ret) ? -ETIME : -EIO;
+		goto failed_leaf_vpbl_pages_alloc;
+	} else {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+
+	nesfmr->nesmr.ibfmr.lkey = stag;
+	nesfmr->nesmr.ibfmr.rkey = stag;
+	nesfmr->attr = *ibfmr_attr;
+
+	return &nesfmr->nesmr.ibfmr;
+
+	failed_leaf_vpbl_pages_alloc:
+	/* unroll all allocated pages */
+	for (i=0; i<nesfmr->leaf_pbl_cnt; i++) {
+		if (nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase) {
+			pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase,
+					nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase);
+		}
+	}
+	if (nesfmr->root_vpbl.leaf_vpbl)
+		kfree(nesfmr->root_vpbl.leaf_vpbl);
+
+	failed_leaf_vpbl_alloc:
+	if (nesfmr->leaf_pbl_cnt == 0) {
+		if (nesfmr->root_vpbl.pbl_vbase)
+			pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase,
+					nesfmr->root_vpbl.pbl_pbase);
+	} else
+		pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase,
+				nesfmr->root_vpbl.pbl_pbase);
+
+	failed_vpbl_alloc:
+	kfree(nesfmr);
+
+	failed_fmr_alloc:
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+
+	failed_resource_alloc:
+	return ERR_PTR(ret);
+}
+
+
+/**
+ * nes_dealloc_fmr
+ */
+static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+	struct nes_mr *nesmr = to_nesmr_from_ibfmr(ibfmr);
+	struct nes_fmr *nesfmr = to_nesfmr(nesmr);
+	struct nes_vnic *nesvnic = to_nesvnic(ibfmr->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_mr temp_nesmr = *nesmr;
+	int i = 0;
+
+	temp_nesmr.ibmw.device = ibfmr->device;
+	temp_nesmr.ibmw.pd = ibfmr->pd;
+	temp_nesmr.ibmw.rkey = ibfmr->rkey;
+	temp_nesmr.ibmw.uobject = NULL;
+
+	/* free the resources */
+	if (nesfmr->leaf_pbl_cnt == 0) {
+		/* single PBL case */
+		if (nesfmr->root_vpbl.pbl_vbase)
+			pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.pbl_vbase,
+					nesfmr->root_vpbl.pbl_pbase);
+	} else {
+		for (i = 0; i < nesfmr->leaf_pbl_cnt; i++) {
+			pci_free_consistent(nesdev->pcidev, 4096, nesfmr->root_vpbl.leaf_vpbl[i].pbl_vbase,
+					nesfmr->root_vpbl.leaf_vpbl[i].pbl_pbase);
+		}
+		kfree(nesfmr->root_vpbl.leaf_vpbl);
+		pci_free_consistent(nesdev->pcidev, 8192, nesfmr->root_vpbl.pbl_vbase,
+				nesfmr->root_vpbl.pbl_pbase);
+	}
+
+	return nes_dealloc_mw(&temp_nesmr.ibmw);
+}
+
+
+/**
+ * nes_map_phys_fmr
+ */
+static int nes_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		int list_len, u64 iova)
+{
+	return 0;
+}
+
+
+/**
+ * nes_unmap_frm
+ */
+static int nes_unmap_fmr(struct list_head *ibfmr_list)
+{
+	return 0;
+}
+
+
+
+/**
+ * nes_query_device
+ */
+static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+	memset(props, 0, sizeof(*props));
+	memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6);
+
+	props->fw_ver = nesdev->nesadapter->fw_ver;
+	props->device_cap_flags = nesdev->nesadapter->device_cap_flags;
+	props->vendor_id = nesdev->nesadapter->vendor_id;
+	props->vendor_part_id = nesdev->nesadapter->vendor_part_id;
+	props->hw_ver = nesdev->nesadapter->hw_rev;
+	props->max_mr_size = 0x80000000;
+	props->max_qp = nesibdev->max_qp;
+	props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2;
+	props->max_sge = nesdev->nesadapter->max_sge;
+	props->max_cq = nesibdev->max_cq;
+	props->max_cqe = nesdev->nesadapter->max_cqe - 1;
+	props->max_mr = nesibdev->max_mr;
+	props->max_mw = nesibdev->max_mr;
+	props->max_pd = nesibdev->max_pd;
+	props->max_sge_rd = 1;
+	switch (nesdev->nesadapter->max_irrq_wr) {
+		case 0:
+			props->max_qp_rd_atom = 1;
+			break;
+		case 1:
+			props->max_qp_rd_atom = 4;
+			break;
+		case 2:
+			props->max_qp_rd_atom = 16;
+			break;
+		case 3:
+			props->max_qp_rd_atom = 32;
+			break;
+		default:
+			props->max_qp_rd_atom = 0;
+	}
+	props->max_qp_init_rd_atom = props->max_qp_wr;
+	props->atomic_cap = IB_ATOMIC_NONE;
+	props->max_map_per_fmr = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_query_port
+ */
+static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
+{
+	memset(props, 0, sizeof(*props));
+
+	props->max_mtu = IB_MTU_2048;
+	props->active_mtu = IB_MTU_2048;
+	props->lid = 1;
+	props->lmc = 0;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+	props->state = IB_PORT_ACTIVE;
+	props->phys_state = 0;
+	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
+			IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->qkey_viol_cntr = 0;
+	props->active_width = IB_WIDTH_4X;
+	props->active_speed = 1;
+	props->max_msg_sz = 0x80000000;
+
+	return 0;
+}
+
+
+/**
+ * nes_modify_port
+ */
+static int nes_modify_port(struct ib_device *ibdev, u8 port,
+		int port_modify_mask, struct ib_port_modify *props)
+{
+	return 0;
+}
+
+
+/**
+ * nes_query_pkey
+ */
+static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	*pkey = 0;
+	return 0;
+}
+
+
+/**
+ * nes_query_gid
+ */
+static int nes_query_gid(struct ib_device *ibdev, u8 port,
+		int index, union ib_gid *gid)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+
+/**
+ * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
+		struct ib_udata *udata)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_alloc_ucontext_req req;
+	struct nes_alloc_ucontext_resp uresp;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+
+	if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
+		printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
+		printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
+			req.userspace_ver, NES_ABI_USERSPACE_VER);
+		return ERR_PTR(-EINVAL);
+	}
+
+
+	memset(&uresp, 0, sizeof uresp);
+
+	uresp.max_qps = nesibdev->max_qp;
+	uresp.max_pds = nesibdev->max_pd;
+	uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
+	uresp.virtwq = nesadapter->virtwq;
+	uresp.kernel_ver = NES_ABI_KERNEL_VER;
+
+	nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
+	if (!nes_ucontext)
+		return ERR_PTR(-ENOMEM);
+
+	nes_ucontext->nesdev = nesdev;
+	nes_ucontext->mmap_wq_offset = uresp.max_pds;
+	nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
+			((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) /
+			PAGE_SIZE;
+
+
+	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+		kfree(nes_ucontext);
+		return ERR_PTR(-EFAULT);
+	}
+
+	INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
+	INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
+	atomic_set(&nes_ucontext->usecnt, 1);
+	return &nes_ucontext->ibucontext;
+}
+
+
+/**
+ * nes_dealloc_ucontext
+ */
+static int nes_dealloc_ucontext(struct ib_ucontext *context)
+{
+	/* struct nes_vnic *nesvnic = to_nesvnic(context->device); */
+	/* struct nes_device *nesdev = nesvnic->nesdev; */
+	struct nes_ucontext *nes_ucontext = to_nesucontext(context);
+
+	if (!atomic_dec_and_test(&nes_ucontext->usecnt))
+	  return 0;
+	kfree(nes_ucontext);
+	return 0;
+}
+
+
+/**
+ * nes_mmap
+ */
+static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	unsigned long index;
+	struct nes_vnic *nesvnic = to_nesvnic(context->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* struct nes_adapter *nesadapter = nesdev->nesadapter; */
+	struct nes_ucontext *nes_ucontext;
+	struct nes_qp *nesqp;
+
+	nes_ucontext = to_nesucontext(context);
+
+
+	if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) {
+		index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE;
+		index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) +
+				PAGE_SIZE-1) & (~(PAGE_SIZE-1));
+		if (!test_bit(index, nes_ucontext->allocated_wqs)) {
+			nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index);
+			return -EFAULT;
+		}
+		nesqp = nes_ucontext->mmap_nesqp[index];
+		if (nesqp == NULL) {
+			nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index);
+			return -EFAULT;
+		}
+		if (remap_pfn_range(vma, vma->vm_start,
+				virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT,
+				vma->vm_end - vma->vm_start,
+				vma->vm_page_prot)) {
+			nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n");
+			return -EAGAIN;
+		}
+		vma->vm_private_data = nesqp;
+		return 0;
+	} else {
+		index = vma->vm_pgoff;
+		if (!test_bit(index, nes_ucontext->allocated_doorbells))
+			return -EFAULT;
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				(nesdev->doorbell_start +
+				((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096))
+				>> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+		vma->vm_private_data = nes_ucontext;
+		return 0;
+	}
+
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_alloc_pd
+ */
+static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
+		struct ib_ucontext *context, struct ib_udata *udata)
+{
+	struct nes_pd *nespd;
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_ucontext *nesucontext;
+	struct nes_alloc_pd_resp uresp;
+	u32 pd_num = 0;
+	int err;
+
+	nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
+			nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
+			atomic_read(&nesvnic->netdev->refcnt));
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
+			nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
+	if (!nespd) {
+		nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
+			nespd, nesvnic->nesibdev->ibdev.name);
+
+	nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
+
+	if (context) {
+		nesucontext = to_nesucontext(context);
+		nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
+				NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
+		nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n",
+				nespd->mmap_db_index, nespd->pd_id);
+		if (nespd->mmap_db_index > NES_MAX_USER_DB_REGIONS) {
+			nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
+			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+			kfree(nespd);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		uresp.pd_id = nespd->pd_id;
+		uresp.mmap_db_index = nespd->mmap_db_index;
+		if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
+			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+			kfree(nespd);
+			return ERR_PTR(-EFAULT);
+		}
+
+		set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+		nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id;
+		nesucontext->first_free_db = nespd->mmap_db_index + 1;
+	}
+
+	nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
+	return &nespd->ibpd;
+}
+
+
+/**
+ * nes_dealloc_pd
+ */
+static int nes_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct nes_ucontext *nesucontext;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if ((ibpd->uobject) && (ibpd->uobject->context)) {
+		nesucontext = to_nesucontext(ibpd->uobject->context);
+		nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
+				nespd->mmap_db_index);
+		clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+		nesucontext->mmap_db_index[nespd->mmap_db_index] = 0;
+		if (nesucontext->first_free_db > nespd->mmap_db_index) {
+			nesucontext->first_free_db = nespd->mmap_db_index;
+		}
+	}
+
+	nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n",
+			nespd->pd_id, nespd);
+	nes_free_resource(nesadapter, nesadapter->allocated_pds,
+			(nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
+	kfree(nespd);
+
+	return 0;
+}
+
+
+/**
+ * nes_create_ah
+ */
+static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_destroy_ah
+ */
+static int nes_destroy_ah(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_get_encoded_size
+ */
+static inline u8 nes_get_encoded_size(int *size)
+{
+	u8 encoded_size = 0;
+	if (*size <= 32) {
+		*size = 32;
+		encoded_size = 1;
+	} else if (*size <= 128) {
+		*size = 128;
+		encoded_size = 2;
+	} else if (*size <= 512) {
+		*size = 512;
+		encoded_size = 3;
+	}
+	return (encoded_size);
+}
+
+
+
+/**
+ * nes_setup_virt_qp
+ */
+static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl,
+		struct nes_vnic *nesvnic, int sq_size, int rq_size)
+{
+	unsigned long flags;
+	void *mem;
+	__le64 *pbl = NULL;
+	__le64 *tpbl;
+	__le64 *pblbuffer;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 pbl_entries;
+	u8 rq_pbl_entries;
+	u8 sq_pbl_entries;
+
+	pbl_entries = nespbl->pbl_size >> 3;
+	nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%p\n",
+			nespbl->pbl_size, pbl_entries,
+			(void *)nespbl->pbl_vbase,
+			(void *)nespbl->pbl_pbase);
+	pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */
+	/* now lets set the sq_vbase as well as rq_vbase addrs we will assign */
+	/* the first pbl to be fro the rq_vbase... */
+	rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+	sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+	nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+	if (!nespbl->page) {
+		nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n");
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+
+	nesqp->hwqp.sq_vbase = kmap(nespbl->page);
+	nesqp->page = nespbl->page;
+	if (!nesqp->hwqp.sq_vbase) {
+		nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n");
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+
+	/* Now to get to sq.. we need to calculate how many */
+	/* PBL entries were used by the rq.. */
+	pbl += sq_pbl_entries;
+	nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+	/* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */
+	/*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */
+
+	nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%p rq_vbase=%p rq_pbase=%p\n",
+			nesqp->hwqp.sq_vbase, (void *)nesqp->hwqp.sq_pbase,
+			nesqp->hwqp.rq_vbase, (void *)nesqp->hwqp.rq_pbase);
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (!nesadapter->free_256pbl) {
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				nespbl->pbl_pbase);
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+	nesadapter->free_256pbl--;
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase);
+	pblbuffer = nesqp->pbl_vbase;
+	if (!nesqp->pbl_vbase) {
+		/* memory allocated during nes_reg_user_mr() */
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				    nespbl->pbl_pbase);
+		kfree(nespbl);
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		return -ENOMEM;
+	}
+	memset(nesqp->pbl_vbase, 0, 256);
+	/* fill in the page address in the pbl buffer.. */
+	tpbl = pblbuffer + 16;
+	pbl = (__le64 *)nespbl->pbl_vbase;
+	while (sq_pbl_entries--)
+		*tpbl++ = *pbl++;
+	tpbl = pblbuffer;
+	while (rq_pbl_entries--)
+		*tpbl++ = *pbl++;
+
+	/* done with memory allocated during nes_reg_user_mr() */
+	pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+			    nespbl->pbl_pbase);
+	kfree(nespbl);
+
+	nesqp->qp_mem_size =
+			max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256;     /* this is Q2 */
+	/* Round up to a multiple of a page */
+	nesqp->qp_mem_size += PAGE_SIZE - 1;
+	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+			&nesqp->hwqp.q2_pbase);
+
+	if (!mem) {
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+		nesqp->pbl_vbase = NULL;
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		return -ENOMEM;
+	}
+	nesqp->hwqp.q2_vbase = mem;
+	mem += 256;
+	memset(nesqp->hwqp.q2_vbase, 0, 256);
+	nesqp->nesqp_context = mem;
+	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+
+	return 0;
+}
+
+
+/**
+ * nes_setup_mmap_qp
+ */
+static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
+		int sq_size, int rq_size)
+{
+	void *mem;
+	struct nes_device *nesdev = nesvnic->nesdev;
+
+	nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) +
+			(sizeof(struct nes_hw_qp_wqe) * rq_size) +
+			max((u32)sizeof(struct nes_qp_context), ((u32)256)) +
+			256; /* this is Q2 */
+	/* Round up to a multiple of a page */
+	nesqp->qp_mem_size += PAGE_SIZE - 1;
+	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+			&nesqp->hwqp.sq_pbase);
+	if (!mem)
+		return -ENOMEM;
+	nes_debug(NES_DBG_QP, "PCI consistent memory for "
+			"host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n",
+			mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size);
+
+	memset(mem, 0, nesqp->qp_mem_size);
+
+	nesqp->hwqp.sq_vbase = mem;
+	mem += sizeof(struct nes_hw_qp_wqe) * sq_size;
+
+	nesqp->hwqp.rq_vbase = mem;
+	nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase +
+			sizeof(struct nes_hw_qp_wqe) * sq_size;
+	mem += sizeof(struct nes_hw_qp_wqe) * rq_size;
+
+	nesqp->hwqp.q2_vbase = mem;
+	nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase +
+			sizeof(struct nes_hw_qp_wqe) * rq_size;
+	mem += 256;
+	memset(nesqp->hwqp.q2_vbase, 0, 256);
+
+	nesqp->nesqp_context = mem;
+	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+	return 0;
+}
+
+
+/**
+ * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory.
+ */
+static inline void nes_free_qp_mem(struct nes_device *nesdev,
+		struct nes_qp *nesqp, int virt_wqs)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	if (!virt_wqs) {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+	}else {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase );
+		nesqp->pbl_vbase = NULL;
+		kunmap(nesqp->page);
+	}
+}
+
+
+/**
+ * nes_create_qp
+ */
+static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
+		struct ib_qp_init_attr *init_attr, struct ib_udata *udata)
+{
+	u64 u64temp= 0;
+	u64 u64nesqp = 0;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_qp *nesqp;
+	struct nes_cq *nescq;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	struct nes_create_qp_req req;
+	struct nes_create_qp_resp uresp;
+	struct nes_pbl  *nespbl = NULL;
+	u32 qp_num = 0;
+	u32 opcode = 0;
+	/* u32 counter = 0; */
+	void *mem;
+	unsigned long flags;
+	int ret;
+	int err;
+	int virt_wqs = 0;
+	int sq_size;
+	int rq_size;
+	u8 sq_encoded_size;
+	u8 rq_encoded_size;
+	/* int counter; */
+
+	atomic_inc(&qps_created);
+	switch (init_attr->qp_type) {
+		case IB_QPT_RC:
+			if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
+				init_attr->cap.max_inline_data = 0;
+			} else {
+				init_attr->cap.max_inline_data = 64;
+			}
+			sq_size = init_attr->cap.max_send_wr;
+			rq_size = init_attr->cap.max_recv_wr;
+
+			// check if the encoded sizes are OK or not...
+			sq_encoded_size = nes_get_encoded_size(&sq_size);
+			rq_encoded_size = nes_get_encoded_size(&rq_size);
+
+			if ((!sq_encoded_size) || (!rq_encoded_size)) {
+				nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n",
+						rq_size, sq_size);
+				return ERR_PTR(-EINVAL);
+			}
+
+			init_attr->cap.max_send_wr = sq_size -2;
+			init_attr->cap.max_recv_wr = rq_size -1;
+			nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size);
+
+			ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
+					nesadapter->max_qp, &qp_num, &nesadapter->next_qp);
+			if (ret) {
+				return ERR_PTR(ret);
+			}
+
+			/* Need 512 (actually now 1024) byte alignment on this structure */
+			mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL);
+			if (!mem) {
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_debug(NES_DBG_QP, "Unable to allocate QP\n");
+				return ERR_PTR(-ENOMEM);
+			}
+			u64nesqp = (unsigned long)mem;
+			u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+			u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+			u64nesqp &= ~u64temp;
+			nesqp = (struct nes_qp *)(unsigned long)u64nesqp;
+			/* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p.  Rounded to closest %u\n",
+					nesqp, mem, NES_SW_CONTEXT_ALIGN); */
+			nesqp->allocated_buffer = mem;
+
+			if (udata) {
+				if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					kfree(nesqp->allocated_buffer);
+					nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n");
+					return NULL;
+				}
+				if (req.user_wqe_buffers) {
+					virt_wqs = 1;
+				}
+				if ((ibpd->uobject) && (ibpd->uobject->context)) {
+					nesqp->user_mode = 1;
+					nes_ucontext = to_nesucontext(ibpd->uobject->context);
+					if (virt_wqs) {
+						err = 1;
+						list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
+							if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) {
+								list_del(&nespbl->list);
+								err = 0;
+								nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
+									  nespbl, nespbl->user_base);
+								break;
+							}
+						}
+						if (err) {
+							nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
+								  (long long unsigned int)req.user_wqe_buffers);
+							nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+							kfree(nesqp->allocated_buffer);
+							return ERR_PTR(-ENOMEM);
+						}
+					}
+
+					nes_ucontext = to_nesucontext(ibpd->uobject->context);
+					nesqp->mmap_sq_db_index =
+						find_next_zero_bit(nes_ucontext->allocated_wqs,
+								   NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
+					/* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
+							nespd->mmap_db_index); */
+					if (nesqp->mmap_sq_db_index > NES_MAX_USER_WQ_REGIONS) {
+						nes_debug(NES_DBG_QP,
+							  "db index > max user regions, failing create QP\n");
+						nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+						if (virt_wqs) {
+							pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+									    nespbl->pbl_pbase);
+							kfree(nespbl);
+						}
+						kfree(nesqp->allocated_buffer);
+						return ERR_PTR(-ENOMEM);
+					}
+					set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+					nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
+					nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
+				} else {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					kfree(nesqp->allocated_buffer);
+					return ERR_PTR(-EFAULT);
+				}
+			}
+			err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
+					nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
+			if (err) {
+				nes_debug(NES_DBG_QP,
+					  "error geting qp mem code = %d\n", err);
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				kfree(nesqp->allocated_buffer);
+				return ERR_PTR(-ENOMEM);
+			}
+
+			nesqp->hwqp.sq_size = sq_size;
+			nesqp->hwqp.sq_encoded_size = sq_encoded_size;
+			nesqp->hwqp.sq_head = 1;
+			nesqp->hwqp.rq_size = rq_size;
+			nesqp->hwqp.rq_encoded_size = rq_encoded_size;
+			/* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n",
+					(void *)nesqp->nesqp_context_pbase);
+			*/
+			nesqp->hwqp.qp_id = qp_num;
+			nesqp->ibqp.qp_num = nesqp->hwqp.qp_id;
+			nesqp->nespd = nespd;
+
+			nescq = to_nescq(init_attr->send_cq);
+			nesqp->nesscq = nescq;
+			nescq = to_nescq(init_attr->recv_cq);
+			nesqp->nesrcq = nescq;
+
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+					NES_QPCONTEXT_MISC_PCI_FCN_SHIFT);
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size <<
+					NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT);
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size <<
+					NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT);
+				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN);
+				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN);
+			nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number +
+					((u32)nesqp->nesrcq->hw_cq.cq_number << 16));
+			u64temp = (u64)nesqp->hwqp.sq_pbase;
+			nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+			nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+
+
+			if (!virt_wqs) {
+				u64temp = (u64)nesqp->hwqp.sq_pbase;
+				nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+				u64temp = (u64)nesqp->hwqp.rq_pbase;
+				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			} else {
+				u64temp = (u64)nesqp->pbl_pbase;
+				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			}
+
+			/* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n",
+					nesvnic->next_qp_nic_index,
+					nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */
+			spin_lock_irqsave(&nesdev->cqp.lock, flags);
+			nesqp->nesqp_context->misc2 |= cpu_to_le32(
+					(u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] <<
+					NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT);
+			nesvnic->next_qp_nic_index++;
+			if ((nesvnic->next_qp_nic_index > 3) ||
+					(nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) {
+				nesvnic->next_qp_nic_index = 0;
+			}
+			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+			nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16);
+			u64temp = (u64)nesqp->hwqp.q2_pbase;
+			nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp);
+			nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			nesqp->nesqp_context->aeq_token_low =  cpu_to_le32((u32)((unsigned long)(nesqp)));
+			nesqp->nesqp_context->aeq_token_high =  cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp))));
+			nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM |
+					((((u32)nesadapter->max_irrq_wr) <<
+					NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK));
+			if (disable_mpa_crc) {
+				nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n");
+				nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC);
+			}
+
+
+			/* Create the QP */
+			cqp_request = nes_get_cqp_request(nesdev);
+			if (cqp_request == NULL) {
+				nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n");
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+				kfree(nesqp->allocated_buffer);
+				return ERR_PTR(-ENOMEM);
+			}
+			cqp_request->waiting = 1;
+			cqp_wqe = &cqp_request->cqp_wqe;
+
+			if (!virt_wqs) {
+				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP |
+					NES_CQP_QP_IWARP_STATE_IDLE;
+			} else {
+				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS |
+					NES_CQP_QP_IWARP_STATE_IDLE;
+			}
+			opcode |= NES_CQP_QP_CQS_VALID;
+			nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+
+			u64temp = (u64)nesqp->nesqp_context_pbase;
+			set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+			atomic_set(&cqp_request->refcount, 2);
+			nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+			/* Wait for CQP */
+			nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
+					nesqp->hwqp.qp_id);
+			ret = wait_event_timeout(cqp_request->waitq,
+					(cqp_request->request_done != 0), NES_EVENT_TIMEOUT);
+			nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u,"
+					" nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u,"
+					" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+					nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
+					cqp_request->major_code, cqp_request->minor_code);
+			if ((!ret) || (cqp_request->major_code)) {
+				if (atomic_dec_and_test(&cqp_request->refcount)) {
+					if (cqp_request->dynamic) {
+						kfree(cqp_request);
+					} else {
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+				}
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+				kfree(nesqp->allocated_buffer);
+				if (!ret) {
+					return ERR_PTR(-ETIME);
+				} else {
+					return ERR_PTR(-EIO);
+				}
+			} else {
+				if (atomic_dec_and_test(&cqp_request->refcount)) {
+					if (cqp_request->dynamic) {
+						kfree(cqp_request);
+					} else {
+						spin_lock_irqsave(&nesdev->cqp.lock, flags);
+						list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+						spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+					}
+				}
+			}
+
+			if (ibpd->uobject) {
+				uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
+				uresp.actual_sq_size = sq_size;
+				uresp.actual_rq_size = rq_size;
+				uresp.qp_id = nesqp->hwqp.qp_id;
+				uresp.nes_drv_opt = nes_drv_opt;
+				if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+					kfree(nesqp->allocated_buffer);
+					return ERR_PTR(-EFAULT);
+				}
+			}
+
+			nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n",
+					nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp));
+			spin_lock_init(&nesqp->lock);
+			init_waitqueue_head(&nesqp->state_waitq);
+			init_waitqueue_head(&nesqp->kick_waitq);
+			nes_add_ref(&nesqp->ibqp);
+			break;
+		default:
+			nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
+			return ERR_PTR(-EINVAL);
+			break;
+	}
+
+	/* update the QP table */
+	nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
+	nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
+			atomic_read(&nesvnic->netdev->refcnt));
+
+	return &nesqp->ibqp;
+}
+
+
+/**
+ * nes_destroy_qp
+ */
+static int nes_destroy_qp(struct ib_qp *ibqp)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	/* struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); */
+	struct nes_ucontext *nes_ucontext;
+	struct ib_qp_attr attr;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	int ret;
+
+	atomic_inc(&sw_qps_destroyed);
+	nesqp->destroyed = 1;
+
+	/* Blow away the connection if it exists. */
+	if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) {
+		/* if (nesqp->ibqp_state == IB_QPS_RTS) { */
+		attr.qp_state = IB_QPS_ERR;
+		nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+	}
+
+	if (((nesqp->ibqp_state == IB_QPS_INIT) ||
+			(nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) {
+		cm_id = nesqp->cm_id;
+		cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+		cm_event.status = IW_CM_EVENT_STATUS_TIMEOUT;
+		cm_event.local_addr = cm_id->local_addr;
+		cm_event.remote_addr = cm_id->remote_addr;
+		cm_event.private_data = NULL;
+		cm_event.private_data_len = 0;
+
+		nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for "
+				"QP%u. cm_id = %p, refcount = %u. \n",
+				nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount));
+
+		cm_id->rem_ref(cm_id);
+		ret = cm_id->event_handler(cm_id, &cm_event);
+		if (ret)
+			nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret);
+	}
+
+
+	if (nesqp->user_mode) {
+		if ((ibqp->uobject)&&(ibqp->uobject->context)) {
+			nes_ucontext = to_nesucontext(ibqp->uobject->context);
+			clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+			nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
+			if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
+				nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index;
+			}
+		}
+		if (nesqp->pbl_pbase)
+			kunmap(nesqp->page);
+	}
+
+	nes_rem_ref(&nesqp->ibqp);
+	return 0;
+}
+
+
+/**
+ * nes_create_cq
+ */
+static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
+		int comp_vector,
+		struct ib_ucontext *context, struct ib_udata *udata)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_cq *nescq;
+	struct nes_ucontext *nes_ucontext = NULL;
+	struct nes_cqp_request *cqp_request;
+	void *mem = NULL;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_pbl *nespbl = NULL;
+	struct nes_create_cq_req req;
+	struct nes_create_cq_resp resp;
+	u32 cq_num = 0;
+	u32 opcode = 0;
+	u32 pbl_entries = 1;
+	int err;
+	unsigned long flags;
+	int ret;
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
+			nesadapter->max_cq, &cq_num, &nesadapter->next_cq);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
+	if (!nescq) {
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nescq->hw_cq.cq_size = max(entries + 1, 5);
+	nescq->hw_cq.cq_number = cq_num;
+	nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
+
+
+	if (context) {
+		nes_ucontext = to_nesucontext(context);
+		if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-EFAULT);
+		}
+		nesvnic->mcrq_ucontext = nes_ucontext;
+		nes_ucontext->mcrqf = req.mcrqf;
+		if (nes_ucontext->mcrqf) {
+			if (nes_ucontext->mcrqf & 0x80000000)
+				nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 12 + (nes_ucontext->mcrqf & 0xf) - 1;
+			else if (nes_ucontext->mcrqf & 0x40000000)
+				nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff;
+			else
+				nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1;
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		}
+		nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
+				(unsigned long)req.user_cq_buffer, entries);
+		list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
+			if (nespbl->user_base == (unsigned long )req.user_cq_buffer) {
+				list_del(&nespbl->list);
+				err = 0;
+				nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n",
+						nespbl);
+				break;
+			}
+		}
+		if (err) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(err);
+		}
+
+		pbl_entries = nespbl->pbl_size >> 3;
+		nescq->cq_mem_size = 0;
+	} else {
+		nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe);
+		nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n",
+				entries, nescq->cq_mem_size, nescq->hw_cq.cq_number);
+
+		/* allocate the physical buffer space */
+		mem = pci_alloc_consistent(nesdev->pcidev, nescq->cq_mem_size,
+				&nescq->hw_cq.cq_pbase);
+		if (!mem) {
+			printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		memset(mem, 0, nescq->cq_mem_size);
+		nescq->hw_cq.cq_vbase = mem;
+		nescq->hw_cq.cq_head = 0;
+		nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
+				nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
+				(u32)nescq->hw_cq.cq_pbase);
+	}
+
+	nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
+	spin_lock_init(&nescq->lock);
+
+	/* send CreateCQ request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+		if (!context)
+			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+					nescq->hw_cq.cq_pbase);
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		kfree(nescq);
+		return ERR_PTR(-ENOMEM);
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			NES_CQP_CQ_CHK_OVERFLOW |
+			NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16);
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+	if (pbl_entries != 1) {
+		if (pbl_entries > 32) {
+			/* use 4k pbl */
+			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries);
+			if (nesadapter->free_4kpbl == 0) {
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				}
+				if (!context)
+					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+							nescq->hw_cq.cq_pbase);
+				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+				kfree(nescq);
+				return ERR_PTR(-ENOMEM);
+			} else {
+				opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
+				nescq->virtual_cq = 2;
+				nesadapter->free_4kpbl--;
+			}
+		} else {
+			/* use 256 byte pbl */
+			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
+			if (nesadapter->free_256pbl == 0) {
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				}
+				if (!context)
+					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+							nescq->hw_cq.cq_pbase);
+				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+				kfree(nescq);
+				return ERR_PTR(-ENOMEM);
+			} else {
+				opcode |= NES_CQP_CQ_VIRT;
+				nescq->virtual_cq = 1;
+				nesadapter->free_256pbl--;
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			(nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+	if (context) {
+		if (pbl_entries != 1)
+			u64temp = (u64)nespbl->pbl_pbase;
+		else
+			u64temp	= le64_to_cpu(nespbl->pbl_vbase[0]);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
+				nes_ucontext->mmap_db_index[0]);
+	} else {
+		u64temp = (u64)nescq->hw_cq.cq_pbase;
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+	}
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+	u64temp = (u64)(unsigned long)&nescq->hw_cq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+			cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
+			nescq->hw_cq.cq_number);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT * 2);
+	nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
+			nescq->hw_cq.cq_number, ret);
+	if ((!ret) || (cqp_request->major_code)) {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		nes_debug(NES_DBG_CQ, "iWARP CQ%u create timeout expired, major code = 0x%04X,"
+				" minor code = 0x%04X\n",
+				nescq->hw_cq.cq_number, cqp_request->major_code, cqp_request->minor_code);
+		if (!context)
+			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+					nescq->hw_cq.cq_pbase);
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		kfree(nescq);
+		return ERR_PTR(-EIO);
+	} else {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+
+	if (context) {
+		/* free the nespbl */
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				nespbl->pbl_pbase);
+		kfree(nespbl);
+		resp.cq_id = nescq->hw_cq.cq_number;
+		resp.cq_size = nescq->hw_cq.cq_size;
+		resp.mmap_db_index = 0;
+		if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
+	return &nescq->ibcq;
+}
+
+
+/**
+ * nes_destroy_cq
+ */
+static int nes_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct nes_cq *nescq;
+	struct nes_device *nesdev;
+	struct nes_vnic *nesvnic;
+	struct nes_adapter *nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	u32 opcode = 0;
+	int ret;
+
+	if (ib_cq == NULL)
+		return 0;
+
+	nescq = to_nescq(ib_cq);
+	nesvnic = to_nesvnic(ib_cq->device);
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+
+	nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number);
+
+	/* Send DestroyCQ request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (nescq->virtual_cq == 1) {
+		nesadapter->free_256pbl++;
+		if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
+			printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n",
+					__FUNCTION__, nesadapter->free_256pbl, nesadapter->max_256pbl);
+		}
+	} else if (nescq->virtual_cq == 2) {
+		nesadapter->free_4kpbl++;
+		if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
+			printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n",
+					__FUNCTION__, nesadapter->free_4kpbl, nesadapter->max_4kpbl);
+		}
+		opcode |= NES_CQP_CQ_4KB_CHUNK;
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		(nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
+	nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
+			nescq->hw_cq.cq_number);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			nescq->hw_cq.cq_number, ret, cqp_request->major_code,
+			cqp_request->minor_code);
+	if ((!ret) || (cqp_request->major_code)) {
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		if (!ret) {
+			nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
+					nescq->hw_cq.cq_number);
+			ret = -ETIME;
+		} else {
+			nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
+					nescq->hw_cq.cq_number);
+			ret = -EIO;
+		}
+	} else {
+		ret = 0;
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+	}
+
+	if (nescq->cq_mem_size)
+		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
+				(void *)nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
+	kfree(nescq);
+
+	return ret;
+}
+
+
+/**
+ * nes_reg_mr
+ */
+static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
+		u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
+		dma_addr_t single_buffer, u16 pbl_count, u16 residual_page_count,
+		int acc, u64 *iova_start)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	/* int count; */
+	u32 opcode = 0;
+	u16 major_code;
+
+	/* Register the region with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	/* track PBL resources */
+	if (pbl_count != 0) {
+		if (pbl_count > 1) {
+			/* Two level PBL */
+			if ((pbl_count+1) > nesadapter->free_4kpbl) {
+				nes_debug(NES_DBG_MR, "Out of 4KB Pbls for two level request.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				nesadapter->free_4kpbl -= pbl_count+1;
+			}
+		} else if (residual_page_count > 32) {
+			if (pbl_count > nesadapter->free_4kpbl) {
+				nes_debug(NES_DBG_MR, "Out of 4KB Pbls.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				nesadapter->free_4kpbl -= pbl_count;
+			}
+		} else {
+			if (pbl_count > nesadapter->free_256pbl) {
+				nes_debug(NES_DBG_MR, "Out of 256B Pbls.\n");
+				if (cqp_request->dynamic) {
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+					kfree(cqp_request);
+				} else {
+					list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+					spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				}
+				return -ENOMEM;
+			} else {
+				nesadapter->free_256pbl -= pbl_count;
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
+					NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
+	if (acc & IB_ACCESS_LOCAL_WRITE)
+		opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE;
+	if (acc & IB_ACCESS_REMOTE_WRITE)
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN;
+	if (acc & IB_ACCESS_REMOTE_READ)
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN;
+	if (acc & IB_ACCESS_MW_BIND)
+		opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN;
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length);
+
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
+			cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
+			cpu_to_le32(nespd->pd_id & 0x00007fff);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+	if (pbl_count == 0) {
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer);
+	} else {
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX,
+				(((pbl_count - 1) * 4096) + (residual_page_count*8)));
+
+		if ((pbl_count > 1) || (residual_page_count > 32))
+			cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
+	}
+	barrier();
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			stag, ret, cqp_request->major_code, cqp_request->minor_code);
+	major_code = cqp_request->major_code;
+	if (atomic_dec_and_test(&cqp_request->refcount)) {
+		if (cqp_request->dynamic) {
+			kfree(cqp_request);
+		} else {
+			spin_lock_irqsave(&nesdev->cqp.lock, flags);
+			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+		}
+	}
+	if (!ret)
+		return -ETIME;
+	else if (major_code)
+		return -EIO;
+	else
+		return 0;
+
+	return 0;
+}
+
+
+/**
+ * nes_reg_phys_mr
+ */
+static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
+		struct ib_phys_buf *buffer_list, int num_phys_buf, int acc,
+		u64 * iova_start)
+{
+	u64 region_length;
+	struct nes_pd *nespd = to_nespd(ib_pd);
+	struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_mr *nesmr;
+	struct ib_mr *ibmr;
+	struct nes_vpbl vpbl;
+	struct nes_root_vpbl root_vpbl;
+	u32 stag;
+	u32 i;
+	u32 stag_index = 0;
+	u32 next_stag_index = 0;
+	u32 driver_key = 0;
+	u32 root_pbl_index = 0;
+	u32 cur_pbl_index = 0;
+	int err = 0, pbl_depth = 0;
+	int ret = 0;
+	u16 pbl_count = 0;
+	u8 single_page = 1;
+	u8 stag_key = 0;
+
+	pbl_depth = 0;
+	region_length = 0;
+	vpbl.pbl_vbase = NULL;
+	root_vpbl.pbl_vbase = NULL;
+	root_vpbl.pbl_pbase = 0;
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+
+	driver_key = 0;
+
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+	if (num_phys_buf > (1024*512)) {
+		return ERR_PTR(-E2BIG);
+	}
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
+			&stag_index, &next_stag_index);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+	if (!nesmr) {
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < num_phys_buf; i++) {
+
+		if ((i & 0x01FF) == 0) {
+			if (root_pbl_index == 1) {
+				/* Allocate the root PBL */
+				root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
+						&root_vpbl.pbl_pbase);
+				nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+						root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+				if (!root_vpbl.pbl_vbase) {
+					pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+							vpbl.pbl_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+					kfree(nesmr);
+					return ERR_PTR(-ENOMEM);
+				}
+				root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
+				if (!root_vpbl.leaf_vpbl) {
+					pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+							root_vpbl.pbl_pbase);
+					pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+							vpbl.pbl_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+					kfree(nesmr);
+					return ERR_PTR(-ENOMEM);
+				}
+				root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
+				root_vpbl.pbl_vbase[0].pa_high =
+						cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+				root_vpbl.leaf_vpbl[0] = vpbl;
+			}
+			/* Allocate a 4K buffer for the PBL */
+			vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+					&vpbl.pbl_pbase);
+			nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
+					vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
+			if (!vpbl.pbl_vbase) {
+				nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+				ibmr = ERR_PTR(-ENOMEM);
+				kfree(nesmr);
+				goto reg_phys_err;
+			}
+			/* Fill in the root table */
+			if (1 <= root_pbl_index) {
+				root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+						cpu_to_le32((u32)vpbl.pbl_pbase);
+				root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+						cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+				root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+			}
+			root_pbl_index++;
+			cur_pbl_index = 0;
+		}
+		if (buffer_list[i].addr & ~PAGE_MASK) {
+			/* TODO: Unwind allocated buffers */
+			nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+			nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
+					(unsigned int) buffer_list[i].addr);
+			ibmr = ERR_PTR(-EINVAL);
+			kfree(nesmr);
+			goto reg_phys_err;
+		}
+
+		if (!buffer_list[i].size) {
+			nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+			nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
+			ibmr = ERR_PTR(-EINVAL);
+			kfree(nesmr);
+			goto reg_phys_err;
+		}
+
+		region_length += buffer_list[i].size;
+		if ((i != 0) && (single_page)) {
+			if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr)
+				single_page = 0;
+		}
+		vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr);
+		vpbl.pbl_vbase[cur_pbl_index++].pa_high =
+				cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32)));
+	}
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX,"
+			" length = 0x%016lX, index = 0x%08X\n",
+			stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
+
+	region_length -= (*iova_start)&PAGE_MASK;
+
+	/* Make the leaf PBL the root if only one PBL */
+	if (root_pbl_index == 1) {
+		root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+	}
+
+	if (single_page) {
+		pbl_count = 0;
+	} else {
+		pbl_count = root_pbl_index;
+	}
+	ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
+			buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start);
+
+	if (ret == 0) {
+		nesmr->ibmr.rkey = stag;
+		nesmr->ibmr.lkey = stag;
+		nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+		ibmr = &nesmr->ibmr;
+		nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
+		nesmr->pbls_used = pbl_count;
+		if (pbl_count > 1) {
+			nesmr->pbls_used++;
+		}
+	} else {
+		kfree(nesmr);
+		ibmr = ERR_PTR(-ENOMEM);
+	}
+
+	reg_phys_err:
+	/* free the resources */
+	if (root_pbl_index == 1) {
+		/* single PBL case */
+		pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
+	} else {
+		for (i=0; i<root_pbl_index; i++) {
+			pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase,
+					root_vpbl.leaf_vpbl[i].pbl_pbase);
+		}
+		kfree(root_vpbl.leaf_vpbl);
+		pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+				root_vpbl.pbl_pbase);
+	}
+
+	return ibmr;
+}
+
+
+/**
+ * nes_get_dma_mr
+ */
+static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ib_phys_buf bl;
+	u64 kva = 0;
+
+	nes_debug(NES_DBG_MR, "\n");
+
+	bl.size = (u64)0xffffffffffULL;
+	bl.addr = 0;
+	return nes_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+
+/**
+ * nes_reg_user_mr
+ */
+static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+		u64 virt, int acc, struct ib_udata *udata)
+{
+	u64 iova_start;
+	__le64 *pbl;
+	u64 region_length;
+	dma_addr_t last_dma_addr = 0;
+	dma_addr_t first_dma_addr = 0;
+	struct nes_pd *nespd = to_nespd(pd);
+	struct nes_vnic *nesvnic = to_nesvnic(pd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct ib_mr *ibmr = ERR_PTR(-EINVAL);
+	struct ib_umem_chunk *chunk;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_pbl *nespbl;
+	struct nes_mr *nesmr;
+	struct ib_umem *region;
+	struct nes_mem_reg_req req;
+	struct nes_vpbl vpbl;
+	struct nes_root_vpbl root_vpbl;
+	int nmap_index, page_index;
+	int page_count = 0;
+	int err, pbl_depth = 0;
+	int chunk_pages;
+	int ret;
+	u32 stag;
+	u32 stag_index = 0;
+	u32 next_stag_index;
+	u32 driver_key;
+	u32 root_pbl_index = 0;
+	u32 cur_pbl_index = 0;
+	u32 skip_pages;
+	u16 pbl_count;
+	u8 single_page = 1;
+	u8 stag_key;
+
+	region = ib_umem_get(pd->uobject->context, start, length, acc);
+	if (IS_ERR(region)) {
+		return (struct ib_mr *)region;
+	}
+
+	nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
+			" offset = %u, page size = %u.\n",
+			(unsigned long int)start, (unsigned long int)virt, (u32)length,
+			region->offset, region->page_size);
+
+	skip_pages = ((u32)region->offset) >> 12;
+
+	if (ib_copy_from_udata(&req, udata, sizeof(req)))
+		return ERR_PTR(-EFAULT);
+	nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type);
+
+	switch (req.reg_type) {
+		case IWNES_MEMREG_TYPE_MEM:
+			pbl_depth = 0;
+			region_length = 0;
+			vpbl.pbl_vbase = NULL;
+			root_vpbl.pbl_vbase = NULL;
+			root_vpbl.pbl_pbase = 0;
+
+			get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+			stag_key = (u8)next_stag_index;
+
+			driver_key = next_stag_index & 0x70000000;
+
+			next_stag_index >>= 8;
+			next_stag_index %= nesadapter->max_mr;
+
+			err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+					nesadapter->max_mr, &stag_index, &next_stag_index);
+			if (err) {
+				ib_umem_release(region);
+				return ERR_PTR(err);
+			}
+
+			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+			if (!nesmr) {
+				ib_umem_release(region);
+				nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr->region = region;
+
+			list_for_each_entry(chunk, &region->chunk_list, list) {
+				nes_debug(NES_DBG_MR, "Chunk: nents = %u, nmap = %u .\n",
+						chunk->nents, chunk->nmap);
+				for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) {
+					if (sg_dma_address(&chunk->page_list[nmap_index]) & ~PAGE_MASK) {
+						ib_umem_release(region);
+						nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+						nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
+								(unsigned int) sg_dma_address(&chunk->page_list[nmap_index]));
+						ibmr = ERR_PTR(-EINVAL);
+						kfree(nesmr);
+						goto reg_user_mr_err;
+					}
+
+					if (!sg_dma_len(&chunk->page_list[nmap_index])) {
+						ib_umem_release(region);
+						nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+								stag_index);
+						nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
+						ibmr = ERR_PTR(-EINVAL);
+						kfree(nesmr);
+						goto reg_user_mr_err;
+					}
+
+					region_length += sg_dma_len(&chunk->page_list[nmap_index]);
+					chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12;
+					region_length -= skip_pages << 12;
+					for (page_index=skip_pages; page_index < chunk_pages; page_index++) {
+						skip_pages = 0;
+						if ((page_count!=0)&&(page_count<<12)-(region->offset&(4096-1))>=region->length)
+							goto enough_pages;
+						if ((page_count&0x01FF) == 0) {
+							if (page_count>(1024*512)) {
+								ib_umem_release(region);
+								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+										vpbl.pbl_pbase);
+								nes_free_resource(nesadapter,
+										nesadapter->allocated_mrs, stag_index);
+								kfree(nesmr);
+								ibmr = ERR_PTR(-E2BIG);
+								goto reg_user_mr_err;
+							}
+							if (root_pbl_index == 1) {
+								root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+										8192, &root_vpbl.pbl_pbase);
+								nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+										root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+								if (!root_vpbl.pbl_vbase) {
+									ib_umem_release(region);
+									pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+											vpbl.pbl_pbase);
+									nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+											stag_index);
+									kfree(nesmr);
+									ibmr = ERR_PTR(-ENOMEM);
+									goto reg_user_mr_err;
+								}
+								root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024,
+										GFP_KERNEL);
+								if (!root_vpbl.leaf_vpbl) {
+									ib_umem_release(region);
+									pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+											root_vpbl.pbl_pbase);
+									pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+											vpbl.pbl_pbase);
+									nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+											stag_index);
+									kfree(nesmr);
+									ibmr = ERR_PTR(-ENOMEM);
+									goto reg_user_mr_err;
+								}
+								root_vpbl.pbl_vbase[0].pa_low =
+										cpu_to_le32((u32)vpbl.pbl_pbase);
+								root_vpbl.pbl_vbase[0].pa_high =
+										cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+								root_vpbl.leaf_vpbl[0] = vpbl;
+							}
+							vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+									&vpbl.pbl_pbase);
+							nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
+									vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
+							if (!vpbl.pbl_vbase) {
+								ib_umem_release(region);
+								nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+								ibmr = ERR_PTR(-ENOMEM);
+								kfree(nesmr);
+								goto reg_user_mr_err;
+							}
+							if (1 <= root_pbl_index) {
+								root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+										cpu_to_le32((u32)vpbl.pbl_pbase);
+								root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+										cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
+								root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+							}
+							root_pbl_index++;
+							cur_pbl_index = 0;
+						}
+						if (single_page) {
+							if (page_count != 0) {
+								if ((last_dma_addr+4096) !=
+										(sg_dma_address(&chunk->page_list[nmap_index])+
+										(page_index*4096)))
+									single_page = 0;
+								last_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+
+										(page_index*4096);
+							} else {
+								first_dma_addr = sg_dma_address(&chunk->page_list[nmap_index])+
+										(page_index*4096);
+								last_dma_addr = first_dma_addr;
+							}
+						}
+
+						vpbl.pbl_vbase[cur_pbl_index].pa_low =
+								cpu_to_le32((u32)(sg_dma_address(&chunk->page_list[nmap_index])+
+								(page_index*4096)));
+						vpbl.pbl_vbase[cur_pbl_index].pa_high =
+								cpu_to_le32((u32)((((u64)(sg_dma_address(&chunk->page_list[nmap_index])+
+								(page_index*4096))) >> 32)));
+						cur_pbl_index++;
+						page_count++;
+					}
+				}
+			}
+			enough_pages:
+			nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
+					" stag_key=0x%08x\n",
+					stag_index, driver_key, stag_key);
+			stag = stag_index << 8;
+			stag |= driver_key;
+			stag += (u32)stag_key;
+			if (stag == 0) {
+				stag = 1;
+			}
+
+			iova_start = virt;
+			/* Make the leaf PBL the root if only one PBL */
+			if (root_pbl_index == 1) {
+				root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+			}
+
+			if (single_page) {
+				pbl_count = 0;
+			} else {
+				pbl_count = root_pbl_index;
+				first_dma_addr = 0;
+			}
+			nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X,"
+					" index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n",
+					stag, (unsigned int)iova_start,
+					(unsigned int)region_length, stag_index,
+					(unsigned long long)region->length, pbl_count);
+			ret = nes_reg_mr( nesdev, nespd, stag, region->length, &root_vpbl,
+					first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, &iova_start);
+
+			nes_debug(NES_DBG_MR, "ret=%d\n", ret);
+
+			if (ret == 0) {
+				nesmr->ibmr.rkey = stag;
+				nesmr->ibmr.lkey = stag;
+				nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+				ibmr = &nesmr->ibmr;
+				nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
+				nesmr->pbls_used = pbl_count;
+				if (pbl_count > 1) {
+					nesmr->pbls_used++;
+				}
+			} else {
+				ib_umem_release(region);
+				kfree(nesmr);
+				ibmr = ERR_PTR(-ENOMEM);
+			}
+
+			reg_user_mr_err:
+			/* free the resources */
+			if (root_pbl_index == 1) {
+				pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+						vpbl.pbl_pbase);
+			} else {
+				for (page_index=0; page_index<root_pbl_index; page_index++) {
+					pci_free_consistent(nesdev->pcidev, 4096,
+							root_vpbl.leaf_vpbl[page_index].pbl_vbase,
+							root_vpbl.leaf_vpbl[page_index].pbl_pbase);
+				}
+				kfree(root_vpbl.leaf_vpbl);
+				pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+						root_vpbl.pbl_pbase);
+			}
+
+			nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr);
+
+			return ibmr;
+			break;
+		case IWNES_MEMREG_TYPE_QP:
+		case IWNES_MEMREG_TYPE_CQ:
+			nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL);
+			if (!nespbl) {
+				nes_debug(NES_DBG_MR, "Unable to allocate PBL\n");
+				ib_umem_release(region);
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+			if (!nesmr) {
+				ib_umem_release(region);
+				kfree(nespbl);
+				nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n");
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr->region = region;
+			nes_ucontext = to_nesucontext(pd->uobject->context);
+			pbl_depth = region->length >> 12;
+			pbl_depth += (region->length & (4096-1)) ? 1 : 0;
+			nespbl->pbl_size = pbl_depth*sizeof(u64);
+			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+				nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory");
+			} else {
+				nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory");
+			}
+
+			nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n",
+					nespbl->pbl_size, pbl_depth);
+			pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size,
+					&nespbl->pbl_pbase);
+			if (!pbl) {
+				ib_umem_release(region);
+				kfree(nesmr);
+				kfree(nespbl);
+				nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n");
+				return ERR_PTR(-ENOMEM);
+			}
+
+			nespbl->pbl_vbase = (u64 *)pbl;
+			nespbl->user_base = start;
+			nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%p,"
+					" pbl_vbase=%p user_base=0x%lx\n",
+					nespbl->pbl_size, (void *)nespbl->pbl_pbase,
+					(void*)nespbl->pbl_vbase, nespbl->user_base);
+
+			list_for_each_entry(chunk, &region->chunk_list, list) {
+				for (nmap_index = 0; nmap_index < chunk->nmap; ++nmap_index) {
+					chunk_pages = sg_dma_len(&chunk->page_list[nmap_index]) >> 12;
+					chunk_pages += (sg_dma_len(&chunk->page_list[nmap_index]) & (4096-1)) ? 1 : 0;
+					nespbl->page = sg_page(&chunk->page_list[0]);
+					for (page_index=0; page_index<chunk_pages; page_index++) {
+						((__le32 *)pbl)[0] = cpu_to_le32((u32)
+								(sg_dma_address(&chunk->page_list[nmap_index])+
+								(page_index*4096)));
+						((__le32 *)pbl)[1] = cpu_to_le32(((u64)
+								(sg_dma_address(&chunk->page_list[nmap_index])+
+								(page_index*4096)))>>32);
+						nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
+								(unsigned long long)*pbl,
+								le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
+						pbl++;
+					}
+				}
+			}
+			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+				list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);
+			} else {
+				list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list);
+			}
+			nesmr->ibmr.rkey = -1;
+			nesmr->ibmr.lkey = -1;
+			nesmr->mode = req.reg_type;
+			return &nesmr->ibmr;
+			break;
+	}
+
+	return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_dereg_mr
+ */
+static int nes_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct nes_mr *nesmr = to_nesmr(ib_mr);
+	struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	u16 major_code;
+	u16 minor_code;
+
+	if (nesmr->region) {
+		ib_umem_release(nesmr->region);
+	}
+	if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) {
+		kfree(nesmr);
+		return 0;
+	}
+
+	/* Deallocate the region with the adapter */
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (nesmr->pbls_used != 0) {
+		if (nesmr->pbl_4k) {
+			nesadapter->free_4kpbl += nesmr->pbls_used;
+			if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
+				printk(KERN_ERR PFX "free 4KB PBLs(%u) has exceeded the max(%u)\n",
+						nesadapter->free_4kpbl, nesadapter->max_4kpbl);
+			}
+		} else {
+			nesadapter->free_256pbl += nesmr->pbls_used;
+			if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
+				printk(KERN_ERR PFX "free 256B PBLs(%u) has exceeded the max(%u)\n",
+						nesadapter->free_256pbl, nesadapter->max_256pbl);
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
+			NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
+	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X\n",
+			ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
+
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+			(ib_mr->rkey & 0x0fffff00) >> 8);
+
+	kfree(nesmr);
+
+	major_code = cqp_request->major_code;
+	minor_code = cqp_request->minor_code;
+	if (atomic_dec_and_test(&cqp_request->refcount)) {
+		if (cqp_request->dynamic) {
+			kfree(cqp_request);
+		} else {
+			spin_lock_irqsave(&nesdev->cqp.lock, flags);
+			list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+		}
+	}
+	if (!ret) {
+		nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
+				" ib_mr=%p, rkey = 0x%08X\n",
+				ib_mr, ib_mr->rkey);
+		return -ETIME;
+	} else if (major_code) {
+		nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting"
+				" to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
+				major_code, minor_code, ib_mr, ib_mr->rkey);
+		return -EIO;
+	} else
+		return 0;
+}
+
+
+/**
+ * show_rev
+ */
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+	struct nes_ib_device *nesibdev =
+			container_of(cdev, struct nes_ib_device, ibdev.class_dev);
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev);
+}
+
+
+/**
+ * show_fw_ver
+ */
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+	struct nes_ib_device *nesibdev =
+			container_of(cdev, struct nes_ib_device, ibdev.class_dev);
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%x.%x.%x\n",
+			(int)(nesvnic->nesdev->nesadapter->fw_ver >> 32),
+			(int)(nesvnic->nesdev->nesadapter->fw_ver >> 16) & 0xffff,
+			(int)(nesvnic->nesdev->nesadapter->fw_ver & 0xffff));
+}
+
+
+/**
+ * show_hca
+ */
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "NES020\n");
+}
+
+
+/**
+ * show_board
+ */
+static ssize_t show_board(struct class_device *cdev, char *buf)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%.*s\n", 32, "NES020 Board ID");
+}
+
+
+static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct class_device_attribute *nes_class_attributes[] = {
+	&class_device_attr_hw_rev,
+	&class_device_attr_fw_ver,
+	&class_device_attr_hca_type,
+	&class_device_attr_board_id
+};
+
+
+/**
+ * nes_query_qp
+ */
+static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+
+	nes_debug(NES_DBG_QP, "\n");
+
+	attr->qp_access_flags = 0;
+	attr->cap.max_send_wr = nesqp->hwqp.sq_size;
+	attr->cap.max_recv_wr = nesqp->hwqp.rq_size;
+	attr->cap.max_recv_sge = 1;
+	if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
+		init_attr->cap.max_inline_data = 0;
+	} else {
+		init_attr->cap.max_inline_data = 64;
+	}
+
+	init_attr->event_handler = nesqp->ibqp.event_handler;
+	init_attr->qp_context = nesqp->ibqp.qp_context;
+	init_attr->send_cq = nesqp->ibqp.send_cq;
+	init_attr->recv_cq = nesqp->ibqp.recv_cq;
+	init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq;
+	init_attr->cap = attr->cap;
+
+	return 0;
+}
+
+
+/**
+ * nes_hw_modify_qp
+ */
+int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
+		u32 next_iwarp_state, u32 wait_completion)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	/* struct iw_cm_id *cm_id = nesqp->cm_id; */
+	/* struct iw_cm_event cm_event; */
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	u16 major_code;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n",
+			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	if (wait_completion) {
+		cqp_request->waiting = 1;
+	} else {
+		cqp_request->waiting = 0;
+	}
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state);
+	nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n",
+			next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]));
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request, NES_CQP_REQUEST_RING_DOORBELL);
+
+	/* Wait for CQP */
+	if (wait_completion) {
+		/* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n",
+				nesqp->hwqp.qp_id); */
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+		nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, "
+				"CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+				nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code);
+		major_code = cqp_request->major_code;
+		if (major_code) {
+			nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed"
+					"CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n",
+					nesqp->hwqp.qp_id, cqp_request->major_code,
+					cqp_request->minor_code, next_iwarp_state);
+		}
+		if (atomic_dec_and_test(&cqp_request->refcount)) {
+			if (cqp_request->dynamic) {
+				kfree(cqp_request);
+			} else {
+				spin_lock_irqsave(&nesdev->cqp.lock, flags);
+				list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+				spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+			}
+		}
+		if (!ret)
+			return -ETIME;
+		else if (major_code)
+			return -EIO;
+		else
+			return 0;
+	} else {
+		return 0;
+	}
+}
+
+
+/**
+ * nes_modify_qp
+ */
+int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		int attr_mask, struct ib_udata *udata)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* u32 cqp_head; */
+	/* u32 counter; */
+	u32 next_iwarp_state = 0;
+	int err;
+	unsigned long qplockflags;
+	int ret;
+	u16 original_last_aeq;
+	u8 issue_modify_qp = 0;
+	u8 issue_disconnect = 0;
+	u8 dont_wait = 0;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u,"
+			" iwarp_state=0x%X, refcount=%d\n",
+			nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state,
+			nesqp->iwarp_state, atomic_read(&nesqp->refcount));
+
+	nes_add_ref(&nesqp->ibqp);
+	spin_lock_irqsave(&nesqp->lock, qplockflags);
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X,"
+			" QP Access Flags=0x%X, attr_mask = 0x%0x\n",
+			nesqp->hwqp.qp_id, nesqp->hw_iwarp_state,
+			nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask);
+
+	if (attr_mask & IB_QP_STATE) {
+		switch (attr->qp_state) {
+			case IB_QPS_INIT:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+				issue_modify_qp = 1;
+				break;
+			case IB_QPS_RTR:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+				issue_modify_qp = 1;
+				break;
+			case IB_QPS_RTS:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				if (nesqp->cm_id == NULL) {
+					nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n",
+							nesqp->hwqp.qp_id );
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS;
+				if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS)
+					next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID |
+							NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID;
+				issue_modify_qp = 1;
+				nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS;
+				nesqp->hte_added = 1;
+				break;
+			case IB_QPS_SQD:
+				issue_modify_qp = 1;
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n",
+						nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail);
+				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return 0;
+				} else {
+					if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+								" ignored due to current iWARP state\n",
+								nesqp->hwqp.qp_id);
+						spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+						nes_rem_ref(&nesqp->ibqp);
+						return -EINVAL;
+					}
+					if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) {
+						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+								" already done based on hw state.\n",
+								nesqp->hwqp.qp_id);
+						issue_modify_qp = 0;
+						nesqp->in_disconnect = 0;
+					}
+					switch (nesqp->hw_iwarp_state) {
+						case NES_AEQE_IWARP_STATE_CLOSING:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+						case NES_AEQE_IWARP_STATE_TERMINATE:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+							break;
+						case NES_AEQE_IWARP_STATE_ERROR:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+							break;
+						default:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+							nesqp->in_disconnect = 1;
+							nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+							break;
+					}
+				}
+				break;
+			case IB_QPS_SQE:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
+				issue_modify_qp = 1;
+				nesqp->in_disconnect = 1;
+				break;
+			case IB_QPS_ERR:
+			case IB_QPS_RESET:
+				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_rem_ref(&nesqp->ibqp);
+					return -EINVAL;
+				}
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n",
+						nesqp->hwqp.qp_id);
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+					if (nesqp->hte_added) {
+						nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n");
+						next_iwarp_state |= NES_CQP_QP_DEL_HTE;
+						nesqp->hte_added = 0;
+					}
+				if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) &&
+						(nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) {
+					next_iwarp_state |= NES_CQP_QP_RESET;
+					nesqp->in_disconnect = 1;
+				} else {
+					nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n",
+							nesqp->hwqp.qp_id, nesqp->hw_tcp_state);
+					dont_wait = 1;
+				}
+				issue_modify_qp = 1;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
+				break;
+			default:
+				spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				nes_rem_ref(&nesqp->ibqp);
+				return -EINVAL;
+				break;
+		}
+
+		nesqp->ibqp_state = attr->qp_state;
+		if (((nesqp->iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) ==
+				(u32)NES_CQP_QP_IWARP_STATE_RTS) &&
+				((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) >
+				(u32)NES_CQP_QP_IWARP_STATE_RTS)) {
+			nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
+			nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
+					nesqp->iwarp_state);
+			issue_disconnect = 1;
+		} else {
+			nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
+			nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
+					nesqp->iwarp_state);
+		}
+	}
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+					NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_MW_BIND) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN);
+			issue_modify_qp = 1;
+		}
+
+		if (nesqp->user_mode) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+					NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+	}
+
+	original_last_aeq = nesqp->last_aeq;
+	spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+
+	nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp);
+
+	ret = 0;
+
+
+	if (issue_modify_qp) {
+		nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n");
+		ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 1);
+		if (ret)
+			nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)"
+					" failed for QP%u.\n",
+					next_iwarp_state, nesqp->hwqp.qp_id);
+
+	}
+
+	if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) {
+		nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d),"
+				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+				original_last_aeq, nesqp->last_aeq);
+		if ((!ret) ||
+				((original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) &&
+				(ret))) {
+			if (dont_wait) {
+				if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {
+					nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d),"
+							" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+							original_last_aeq, nesqp->last_aeq);
+					/* this one is for the cm_disconnect thread */
+					nes_add_ref(&nesqp->ibqp);
+					spin_lock_irqsave(&nesqp->lock, qplockflags);
+					nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+					nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_cm_disconn(nesqp);
+				} else {
+					nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+					nes_rem_ref(&nesqp->ibqp);
+				}
+			} else {
+				spin_lock_irqsave(&nesqp->lock, qplockflags);
+				if (nesqp->cm_id) {
+					/* These two are for the timer thread */
+					if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+						nes_add_ref(&nesqp->ibqp);
+						nesqp->cm_id->add_ref(nesqp->cm_id);
+						nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+								" need ae to finish up, original_last_aeq = 0x%04X."
+								" last_aeq = 0x%04X, scheduling timer.\n",
+								nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+								original_last_aeq, nesqp->last_aeq);
+						schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0);
+					}
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				} else {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+							" need ae to finish up, original_last_aeq = 0x%04X."
+							" last_aeq = 0x%04X.\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+							original_last_aeq, nesqp->last_aeq);
+				}
+			}
+		} else {
+			nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+					" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+					nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+					original_last_aeq, nesqp->last_aeq);
+			nes_rem_ref(&nesqp->ibqp);
+		}
+	} else {
+		nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+				original_last_aeq, nesqp->last_aeq);
+		nes_rem_ref(&nesqp->ibqp);
+	}
+
+	err = 0;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n",
+			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+	return err;
+}
+
+
+/**
+ * nes_muticast_attach
+ */
+static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_multicast_detach
+ */
+static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_process_mad
+ */
+static int nes_process_mad(struct ib_device *ibdev, int mad_flags,
+		u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+static inline void
+fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey)
+{
+	int sge_index;
+	int total_payload_length = 0;
+	for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) {
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+			ib_wr->sg_list[sge_index].addr);
+		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4),
+			ib_wr->sg_list[sge_index].length);
+		if (uselkey)
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4),
+						(ib_wr->sg_list[sge_index].lkey));
+		else
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0);
+
+		total_payload_length += ib_wr->sg_list[sge_index].length;
+	}
+	nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n",
+			total_payload_length);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+				total_payload_length);
+}
+
+/**
+ * nes_post_send
+ */
+static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+		struct ib_send_wr **bad_wr)
+{
+	u64 u64temp;
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	int err;
+	u32 qsize = nesqp->hwqp.sq_size;
+	u32 head;
+	u32 wqe_misc;
+	u32 wqe_count;
+	u32 counter;
+	u32 total_payload_length;
+
+	err = 0;
+	wqe_misc = 0;
+	wqe_count = 0;
+	total_payload_length = 0;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS)
+		return -EINVAL;
+
+		spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.sq_head;
+
+	while (ib_wr) {
+		/* Check for SQ overflow */
+		if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+			err = -EINVAL;
+			break;
+		}
+
+		wqe = &nesqp->hwqp.sq_vbase[head];
+		/* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n",
+				nesqp->hwqp.qp_id, wqe, head); */
+		nes_fill_init_qp_wqe(wqe, nesqp, head);
+		u64temp = (u64)(ib_wr->wr_id);
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+					u64temp);
+			switch (ib_wr->opcode) {
+				case IB_WR_SEND:
+					if (ib_wr->send_flags & IB_SEND_SOLICITED) {
+						wqe_misc = NES_IWARP_SQ_OP_SENDSE;
+					} else {
+						wqe_misc = NES_IWARP_SQ_OP_SEND;
+					}
+					if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+						err = -EINVAL;
+						break;
+					}
+					if (ib_wr->send_flags & IB_SEND_FENCE) {
+						wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+					}
+					if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+							((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+							(ib_wr->sg_list[0].length <= 64)) {
+						memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+							       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+						set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+								ib_wr->sg_list[0].length);
+						wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+					} else {
+						fill_wqe_sg_send(wqe, ib_wr, 1);
+					}
+
+					break;
+				case IB_WR_RDMA_WRITE:
+					wqe_misc = NES_IWARP_SQ_OP_RDMAW;
+					if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+						nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n",
+								ib_wr->num_sge,
+								nesdev->nesadapter->max_sge);
+						err = -EINVAL;
+						break;
+					}
+					if (ib_wr->send_flags & IB_SEND_FENCE) {
+						wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+					}
+
+					set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+							ib_wr->wr.rdma.rkey);
+					set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+							ib_wr->wr.rdma.remote_addr);
+
+					if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+							((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+							(ib_wr->sg_list[0].length <= 64)) {
+						memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+							       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+						set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+								ib_wr->sg_list[0].length);
+						wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+					} else {
+						fill_wqe_sg_send(wqe, ib_wr, 1);
+					}
+					wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] =
+							wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX];
+					break;
+				case IB_WR_RDMA_READ:
+					/* iWARP only supports 1 sge for RDMA reads */
+					if (ib_wr->num_sge > 1) {
+						nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n",
+								ib_wr->num_sge);
+						err = -EINVAL;
+						break;
+					}
+					wqe_misc = NES_IWARP_SQ_OP_RDMAR;
+					set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+							ib_wr->wr.rdma.remote_addr);
+					set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+							ib_wr->wr.rdma.rkey);
+					set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX,
+							ib_wr->sg_list->length);
+					set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
+							ib_wr->sg_list->addr);
+					set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX,
+							ib_wr->sg_list->lkey);
+					break;
+				default:
+					/* error */
+					err = -EINVAL;
+					break;
+			}
+
+		if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+			wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+		}
+		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc);
+
+		ib_wr = ib_wr->next;
+		head++;
+		wqe_count++;
+		if (head >= qsize)
+			head = 0;
+
+	}
+
+	nesqp->hwqp.sq_head = head;
+	barrier();
+	while (wqe_count) {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs + NES_WQE_ALLOC,
+				(counter << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+	}
+
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+
+/**
+ * nes_post_recv
+ */
+static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+		struct ib_recv_wr **bad_wr)
+{
+	u64 u64temp;
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	int err = 0;
+	int sge_index;
+	u32 qsize = nesqp->hwqp.rq_size;
+	u32 head;
+	u32 wqe_count = 0;
+	u32 counter;
+	u32 total_payload_length;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS)
+		return -EINVAL;
+
+		spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.rq_head;
+
+	while (ib_wr) {
+		if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+			err = -EINVAL;
+			break;
+		}
+		/* Check for RQ overflow */
+		if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
+			err = -EINVAL;
+			break;
+		}
+
+		nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge);
+		wqe = &nesqp->hwqp.rq_vbase[head];
+
+		/* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n",
+				nesqp->hwqp.qp_id, wqe, head); */
+		nes_fill_init_qp_wqe(wqe, nesqp, head);
+		u64temp = (u64)(ib_wr->wr_id);
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+					u64temp);
+		total_payload_length = 0;
+		for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) {
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].addr);
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].length);
+			set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].lkey);
+
+			total_payload_length += ib_wr->sg_list[sge_index].length;
+		}
+		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX,
+					total_payload_length);
+
+		ib_wr = ib_wr->next;
+		head++;
+		wqe_count++;
+		if (head >= qsize)
+			head = 0;
+	}
+
+	nesqp->hwqp.rq_head = head;
+	barrier();
+	while (wqe_count) {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id);
+	}
+
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+
+/**
+ * nes_poll_cq
+ */
+static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	u64 u64temp;
+	u64 wrid;
+	/* u64 u64temp; */
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_cq *nescq = to_nescq(ibcq);
+	struct nes_qp *nesqp;
+	struct nes_hw_cqe cqe;
+	u32 head;
+	u32 wq_tail;
+	u32 cq_size;
+	u32 cqe_count = 0;
+	u32 wqe_index;
+	u32 u32temp;
+	/* u32 counter; */
+
+	nes_debug(NES_DBG_CQ, "\n");
+
+		spin_lock_irqsave(&nescq->lock, flags);
+
+	head = nescq->hw_cq.cq_head;
+	cq_size = nescq->hw_cq.cq_size;
+
+	while (cqe_count < num_entries) {
+		if (le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) &
+				NES_CQE_VALID) {
+			cqe = nescq->hw_cq.cq_vbase[head];
+			nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+			u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+			wqe_index = u32temp &
+					(nesdev->nesadapter->max_qp_wr - 1);
+			u32temp &= ~(NES_SW_CONTEXT_ALIGN-1);
+			/* parse CQE, get completion context from WQE (either rq or sq */
+			u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
+					((u64)u32temp);
+			nesqp = *((struct nes_qp **)&u64temp);
+			memset(entry, 0, sizeof *entry);
+			if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) {
+				entry->status = IB_WC_SUCCESS;
+			} else {
+				entry->status = IB_WC_WR_FLUSH_ERR;
+			}
+
+			entry->qp = &nesqp->ibqp;
+			entry->src_qp = nesqp->hwqp.qp_id;
+
+			if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) {
+				if (nesqp->skip_lsmm) {
+					nesqp->skip_lsmm = 0;
+					wq_tail = nesqp->hwqp.sq_tail++;
+				}
+
+				/* Working on a SQ Completion*/
+				wq_tail = wqe_index;
+				nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1);
+				wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail].
+						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) |
+						((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail].
+						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX])));
+				entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail].
+						wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]);
+
+				switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail].
+						wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) {
+					case NES_IWARP_SQ_OP_RDMAW:
+						nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n");
+						entry->opcode = IB_WC_RDMA_WRITE;
+						break;
+					case NES_IWARP_SQ_OP_RDMAR:
+						nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n");
+						entry->opcode = IB_WC_RDMA_READ;
+						entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail].
+								wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]);
+						break;
+					case NES_IWARP_SQ_OP_SENDINV:
+					case NES_IWARP_SQ_OP_SENDSEINV:
+					case NES_IWARP_SQ_OP_SEND:
+					case NES_IWARP_SQ_OP_SENDSE:
+						nes_debug(NES_DBG_CQ, "Operation = Send.\n");
+						entry->opcode = IB_WC_SEND;
+						break;
+				}
+			} else {
+				/* Working on a RQ Completion*/
+				wq_tail = wqe_index;
+					nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1);
+				entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]);
+				wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) |
+					((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32);
+					entry->opcode = IB_WC_RECV;
+			}
+			entry->wr_id = wrid;
+
+			if (++head >= cq_size)
+				head = 0;
+			cqe_count++;
+			nescq->polled_completions++;
+			if ((nescq->polled_completions > (cq_size / 2)) ||
+					(nescq->polled_completions == 255)) {
+				nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes"
+						" are pending %u of %u.\n",
+						nescq->hw_cq.cq_number, nescq->polled_completions, cq_size);
+				nes_write32(nesdev->regs+NES_CQE_ALLOC,
+						nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+				nescq->polled_completions = 0;
+			}
+			entry++;
+		} else
+			break;
+	}
+
+	if (nescq->polled_completions) {
+		nes_write32(nesdev->regs+NES_CQE_ALLOC,
+				nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+		nescq->polled_completions = 0;
+	}
+
+	nescq->hw_cq.cq_head = head;
+	nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n",
+			cqe_count, nescq->hw_cq.cq_number);
+
+		spin_unlock_irqrestore(&nescq->lock, flags);
+
+	return cqe_count;
+}
+
+
+/**
+ * nes_req_notify_cq
+ */
+static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+		{
+	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_cq *nescq = to_nescq(ibcq);
+	u32 cq_arm;
+
+	nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n",
+			nescq->hw_cq.cq_number);
+
+	cq_arm = nescq->hw_cq.cq_number;
+	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+		cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT;
+	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		cq_arm |= NES_CQE_ALLOC_NOTIFY_SE;
+	else
+		return -EINVAL;
+
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+	return 0;
+}
+
+
+/**
+ * nes_init_ofa_device
+ */
+struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
+{
+	struct nes_ib_device *nesibdev;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+
+	nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
+	if (nesibdev == NULL) {
+		return NULL;
+	}
+	strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX);
+	nesibdev->ibdev.owner = THIS_MODULE;
+
+	nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
+	memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
+	memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6);
+
+	nesibdev->ibdev.uverbs_cmd_mask =
+			(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+			(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+			(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+			(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+			(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+			(1ull << IB_USER_VERBS_CMD_REG_MR) |
+			(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+			(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+			(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+			(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+			(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+			(1ull << IB_USER_VERBS_CMD_BIND_MW) |
+			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
+			(1ull << IB_USER_VERBS_CMD_POST_RECV) |
+			(1ull << IB_USER_VERBS_CMD_POST_SEND);
+
+	nesibdev->ibdev.phys_port_cnt = 1;
+	nesibdev->ibdev.num_comp_vectors = 1;
+	nesibdev->ibdev.dma_device = &nesdev->pcidev->dev;
+	nesibdev->ibdev.class_dev.dev = &nesdev->pcidev->dev;
+	nesibdev->ibdev.query_device = nes_query_device;
+	nesibdev->ibdev.query_port = nes_query_port;
+	nesibdev->ibdev.modify_port = nes_modify_port;
+	nesibdev->ibdev.query_pkey = nes_query_pkey;
+	nesibdev->ibdev.query_gid = nes_query_gid;
+	nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
+	nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext;
+	nesibdev->ibdev.mmap = nes_mmap;
+	nesibdev->ibdev.alloc_pd = nes_alloc_pd;
+	nesibdev->ibdev.dealloc_pd = nes_dealloc_pd;
+	nesibdev->ibdev.create_ah = nes_create_ah;
+	nesibdev->ibdev.destroy_ah = nes_destroy_ah;
+	nesibdev->ibdev.create_qp = nes_create_qp;
+	nesibdev->ibdev.modify_qp = nes_modify_qp;
+	nesibdev->ibdev.query_qp = nes_query_qp;
+	nesibdev->ibdev.destroy_qp = nes_destroy_qp;
+	nesibdev->ibdev.create_cq = nes_create_cq;
+	nesibdev->ibdev.destroy_cq = nes_destroy_cq;
+	nesibdev->ibdev.poll_cq = nes_poll_cq;
+	nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
+	nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr;
+	nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
+	nesibdev->ibdev.dereg_mr = nes_dereg_mr;
+	nesibdev->ibdev.alloc_mw = nes_alloc_mw;
+	nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
+	nesibdev->ibdev.bind_mw = nes_bind_mw;
+
+	nesibdev->ibdev.alloc_fmr = nes_alloc_fmr;
+	nesibdev->ibdev.unmap_fmr = nes_unmap_fmr;
+	nesibdev->ibdev.dealloc_fmr = nes_dealloc_fmr;
+	nesibdev->ibdev.map_phys_fmr = nes_map_phys_fmr;
+
+	nesibdev->ibdev.attach_mcast = nes_multicast_attach;
+	nesibdev->ibdev.detach_mcast = nes_multicast_detach;
+	nesibdev->ibdev.process_mad = nes_process_mad;
+
+	nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
+	nesibdev->ibdev.post_send = nes_post_send;
+	nesibdev->ibdev.post_recv = nes_post_recv;
+
+	nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
+	if (nesibdev->ibdev.iwcm == NULL) {
+		ib_dealloc_device(&nesibdev->ibdev);
+		return NULL;
+	}
+	nesibdev->ibdev.iwcm->add_ref = nes_add_ref;
+	nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref;
+	nesibdev->ibdev.iwcm->get_qp = nes_get_qp;
+	nesibdev->ibdev.iwcm->connect = nes_connect;
+	nesibdev->ibdev.iwcm->accept = nes_accept;
+	nesibdev->ibdev.iwcm->reject = nes_reject;
+	nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
+	nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
+
+	return nesibdev;
+}
+
+
+/**
+ * nes_destroy_ofa_device
+ */
+void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
+{
+	if (nesibdev == NULL)
+		return;
+
+	nes_unregister_ofa_device(nesibdev);
+
+	kfree(nesibdev->ibdev.iwcm);
+	ib_dealloc_device(&nesibdev->ibdev);
+}
+
+
+/**
+ * nes_register_ofa_device
+ */
+int nes_register_ofa_device(struct nes_ib_device *nesibdev)
+{
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	int i, ret;
+
+	ret = ib_register_device(&nesvnic->nesibdev->ibdev);
+	if (ret) {
+		return ret;
+	}
+
+	/* Get the resources allocated to this device */
+	nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count;
+	nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count;
+	nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count;
+	nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count;
+
+	for (i = 0; i < ARRAY_SIZE(nes_class_attributes); ++i) {
+		ret = class_device_create_file(&nesibdev->ibdev.class_dev, nes_class_attributes[i]);
+		if (ret) {
+			while (i > 0) {
+				i--;
+				class_device_remove_file(&nesibdev->ibdev.class_dev,
+						nes_class_attributes[i]);
+			}
+			ib_unregister_device(&nesibdev->ibdev);
+			return ret;
+		}
+	}
+
+	nesvnic->of_device_registered = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_unregister_ofa_device
+ */
+void nes_unregister_ofa_device(struct nes_ib_device *nesibdev)
+{
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+	int i;
+
+	if (nesibdev == NULL)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(nes_class_attributes); ++i) {
+		class_device_remove_file(&nesibdev->ibdev.class_dev, nes_class_attributes[i]);
+	}
+
+	if (nesvnic->of_device_registered) {
+		ib_unregister_device(&nesibdev->ibdev);
+	}
+
+	nesvnic->of_device_registered = 0;
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
new file mode 100644
index 0000000..6c6b4da
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_VERBS_H
+#define NES_VERBS_H
+
+struct nes_device;
+
+#define NES_MAX_USER_DB_REGIONS  4096
+#define NES_MAX_USER_WQ_REGIONS  4096
+
+struct nes_ucontext {
+	struct ib_ucontext ibucontext;
+	struct nes_device  *nesdev;
+	unsigned long      mmap_wq_offset;
+	unsigned long      mmap_cq_offset; /* to be removed */
+	int                index;		/* rnic index (minor) */
+	unsigned long      allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)];
+	u16                mmap_db_index[NES_MAX_USER_DB_REGIONS];
+	u16                first_free_db;
+	unsigned long      allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)];
+	struct nes_qp      *mmap_nesqp[NES_MAX_USER_WQ_REGIONS];
+	u16                first_free_wq;
+	struct list_head   cq_reg_mem_list;
+	struct list_head   qp_reg_mem_list;
+	u32                mcrqf;
+	atomic_t	   usecnt;
+};
+
+struct nes_pd {
+	struct ib_pd ibpd;
+	u16          pd_id;
+	atomic_t     sqp_count;
+	u16          mmap_db_index;
+};
+
+struct nes_mr {
+	union {
+		struct ib_mr  ibmr;
+		struct ib_mw  ibmw;
+		struct ib_fmr ibfmr;
+	};
+	struct ib_umem    *region;
+	u16               pbls_used;
+	u8                mode;
+	u8                pbl_4k;
+};
+
+struct nes_hw_pb {
+	__le32 pa_low;
+	__le32 pa_high;
+};
+
+struct nes_vpbl {
+	dma_addr_t       pbl_pbase;
+	struct nes_hw_pb *pbl_vbase;
+};
+
+struct nes_root_vpbl {
+	dma_addr_t       pbl_pbase;
+	struct nes_hw_pb *pbl_vbase;
+	struct nes_vpbl  *leaf_vpbl;
+};
+
+struct nes_fmr {
+	struct nes_mr        nesmr;
+	u32                  leaf_pbl_cnt;
+	struct nes_root_vpbl root_vpbl;
+	struct ib_qp         *ib_qp;
+	int                  access_rights;
+	struct ib_fmr_attr   attr;
+};
+
+struct nes_av;
+
+struct nes_cq {
+	struct ib_cq     ibcq;
+	struct nes_hw_cq hw_cq;
+	u32              polled_completions;
+	u32              cq_mem_size;
+	spinlock_t       lock;
+	u8               virtual_cq;
+	u8               pad[3];
+};
+
+struct nes_wq {
+	spinlock_t lock;
+};
+
+struct iw_cm_id;
+struct ietf_mpa_frame;
+
+struct nes_qp {
+	struct ib_qp          ibqp;
+	void                  *allocated_buffer;
+	struct iw_cm_id       *cm_id;
+	struct workqueue_struct *wq;
+	struct work_struct    disconn_work;
+	struct nes_cq         *nesscq;
+	struct nes_cq         *nesrcq;
+	struct nes_pd         *nespd;
+	void *cm_node; /* handle of the node this QP is associated with */
+	struct ietf_mpa_frame *ietf_frame;
+	dma_addr_t            ietf_frame_pbase;
+	wait_queue_head_t     state_waitq;
+	unsigned long         socket;
+	struct nes_hw_qp      hwqp;
+	struct work_struct    work;
+	struct work_struct    ae_work;
+	enum ib_qp_state      ibqp_state;
+	u32                   iwarp_state;
+	u32                   hte_index;
+	u32                   last_aeq;
+	u32                   qp_mem_size;
+	atomic_t              refcount;
+	atomic_t              close_timer_started;
+	u32                   mmap_sq_db_index;
+	u32                   mmap_rq_db_index;
+	spinlock_t            lock;
+	struct nes_qp_context *nesqp_context;
+	dma_addr_t            nesqp_context_pbase;
+	void	              *pbl_vbase;
+	dma_addr_t            pbl_pbase;
+	struct page           *page;
+	wait_queue_head_t     kick_waitq;
+	u16                   in_disconnect;
+	u16                   private_data_len;
+	u8                    active_conn;
+	u8                    skip_lsmm;
+	u8                    user_mode;
+	u8                    hte_added;
+	u8                    hw_iwarp_state;
+	u8                    flush_issued;
+	u8                    hw_tcp_state;
+	u8                    disconn_pending;
+	u8                    destroyed;
+};
+#endif			/* NES_VERBS_H */
-- 
cgit v1.1


From a13af4b4d842da6d7065b8c73fa8f0ac58fea1b6 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Mon, 29 Oct 2007 15:14:03 +1000
Subject: agp: add chipset flushing support to AGP interface

This bumps the AGP interface to 0.103.

Certain Intel chipsets contains a global write buffer, and this can require
flushing from the drm or X.org to make sure all data has hit RAM before
initiating a GPU transfer, due to a lack of coherency with the integrated
graphics device and this buffer.

This just adds generic support to the AGP interfaces, a follow-on patch
will add support to the Intel driver to use this interface.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/char/agp/agp.h          |  3 ++-
 drivers/char/agp/backend.c      |  2 +-
 drivers/char/agp/compat_ioctl.c |  4 ++++
 drivers/char/agp/compat_ioctl.h |  2 ++
 drivers/char/agp/frontend.c     | 11 +++++++++++
 drivers/char/agp/generic.c      |  7 +++++++
 include/linux/agp_backend.h     |  1 +
 include/linux/agpgart.h         |  1 +
 8 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index b83824c..9ec9374 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -117,7 +117,8 @@ struct agp_bridge_driver {
 	void (*free_by_type)(struct agp_memory *);
 	void *(*agp_alloc_page)(struct agp_bridge_data *);
 	void (*agp_destroy_page)(void *, int flags);
-        int (*agp_type_to_mask_type) (struct agp_bridge_data *, int);
+	int (*agp_type_to_mask_type) (struct agp_bridge_data *, int);
+	void (*chipset_flush)(struct agp_bridge_data *);
 };
 
 struct agp_bridge_data {
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 2720882..b1bdd01 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -43,7 +43,7 @@
  * fix some real stupidity. It's only by chance we can bump
  * past 0.99 at all due to some boolean logic error. */
 #define AGPGART_VERSION_MAJOR 0
-#define AGPGART_VERSION_MINOR 102
+#define AGPGART_VERSION_MINOR 103
 static const struct agp_version agp_current_version =
 {
 	.major = AGPGART_VERSION_MAJOR,
diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c
index ecd4248..3927579 100644
--- a/drivers/char/agp/compat_ioctl.c
+++ b/drivers/char/agp/compat_ioctl.c
@@ -273,6 +273,10 @@ long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case AGPIOC_UNBIND32:
 		ret_val = compat_agpioc_unbind_wrap(curr_priv, (void __user *) arg);
 		break;
+
+	case AGPIOC_CHIPSET_FLUSH32:
+		ret_val = agpioc_chipset_flush_wrap(curr_priv);
+		break;
 	}
 
 ioctl_out:
diff --git a/drivers/char/agp/compat_ioctl.h b/drivers/char/agp/compat_ioctl.h
index 71939d6..0c9678a 100644
--- a/drivers/char/agp/compat_ioctl.h
+++ b/drivers/char/agp/compat_ioctl.h
@@ -39,6 +39,7 @@
 #define AGPIOC_DEALLOCATE32 _IOW (AGPIOC_BASE, 7, compat_int_t)
 #define AGPIOC_BIND32       _IOW (AGPIOC_BASE, 8, compat_uptr_t)
 #define AGPIOC_UNBIND32     _IOW (AGPIOC_BASE, 9, compat_uptr_t)
+#define AGPIOC_CHIPSET_FLUSH32 _IO (AGPIOC_BASE, 10)
 
 struct agp_info32 {
 	struct agp_version version;	/* version of the driver        */
@@ -101,5 +102,6 @@ void agp_free_memory_wrap(struct agp_memory *memory);
 struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type);
 struct agp_memory *agp_find_mem_by_key(int key);
 struct agp_client *agp_find_client_by_pid(pid_t id);
+int agpioc_chipset_flush_wrap(struct agp_file_private *priv);
 
 #endif /* _AGP_COMPAT_H */
diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c
index 7791e98..9bd5a95 100644
--- a/drivers/char/agp/frontend.c
+++ b/drivers/char/agp/frontend.c
@@ -960,6 +960,13 @@ static int agpioc_unbind_wrap(struct agp_file_private *priv, void __user *arg)
 	return agp_unbind_memory(memory);
 }
 
+int agpioc_chipset_flush_wrap(struct agp_file_private *priv)
+{
+	DBG("");
+	agp_flush_chipset(agp_bridge);
+	return 0;
+}
+
 static int agp_ioctl(struct inode *inode, struct file *file,
 		     unsigned int cmd, unsigned long arg)
 {
@@ -1033,6 +1040,10 @@ static int agp_ioctl(struct inode *inode, struct file *file,
 	case AGPIOC_UNBIND:
 		ret_val = agpioc_unbind_wrap(curr_priv, (void __user *) arg);
 		break;
+	       
+	case AGPIOC_CHIPSET_FLUSH:
+		ret_val = agpioc_chipset_flush_wrap(curr_priv);
+		break;
 	}
 
 ioctl_out:
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 1a4674c..7484bc7 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -80,6 +80,13 @@ static int agp_get_key(void)
 	return -1;
 }
 
+void agp_flush_chipset(struct agp_bridge_data *bridge)
+{
+	if (bridge->driver->chipset_flush)
+		bridge->driver->chipset_flush(bridge);
+}
+EXPORT_SYMBOL(agp_flush_chipset);
+
 /*
  * Use kmalloc if possible for the page list. Otherwise fall back to
  * vmalloc. This speeds things up and also saves memory for small AGP
diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h
index abc521c..03e3454 100644
--- a/include/linux/agp_backend.h
+++ b/include/linux/agp_backend.h
@@ -109,6 +109,7 @@ extern int agp_unbind_memory(struct agp_memory *);
 extern void agp_enable(struct agp_bridge_data *, u32);
 extern struct agp_bridge_data *agp_backend_acquire(struct pci_dev *);
 extern void agp_backend_release(struct agp_bridge_data *);
+extern void agp_flush_chipset(struct agp_bridge_data *);
 
 #endif				/* __KERNEL__ */
 #endif				/* _AGP_BACKEND_H */
diff --git a/include/linux/agpgart.h b/include/linux/agpgart.h
index 09fbf7e..62aef58 100644
--- a/include/linux/agpgart.h
+++ b/include/linux/agpgart.h
@@ -38,6 +38,7 @@
 #define AGPIOC_DEALLOCATE _IOW (AGPIOC_BASE, 7, int)
 #define AGPIOC_BIND       _IOW (AGPIOC_BASE, 8, struct agp_bind*)
 #define AGPIOC_UNBIND     _IOW (AGPIOC_BASE, 9, struct agp_unbind*)
+#define AGPIOC_CHIPSET_FLUSH _IO (AGPIOC_BASE, 10)
 
 #define AGP_DEVICE      "/dev/agpgart"
 
-- 
cgit v1.1


From 6c00a61e1bc969c3ea931f62f8789d9818bf1918 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Mon, 29 Oct 2007 18:06:10 +1000
Subject: intel-agp: add chipset flushing support

This adds support for flushing the chipsets on the 915, 945, 965 and G33
families of Intel chips.

The BIOS doesn't seem to always allocate the BAR on the 965 chipsets
so I have to use pci resource code to create a resource

It adds an export for pcibios_align_resource.
---
 arch/x86/pci/i386.c          |   2 +-
 drivers/char/agp/intel-agp.c | 102 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 42ba0e2..103b9dff 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -72,7 +72,7 @@ pcibios_align_resource(void *data, struct resource *res,
 		}
 	}
 }
-
+EXPORT_SYMBOL(pcibios_align_resource);
 
 /*
  *  Handle resources of PCI devices.  If the world were perfect, we could
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 189efb6..4d062fc 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -71,9 +71,11 @@ extern int agp_memory_reserved;
 #define I915_GMCH_GMS_STOLEN_64M	(0x7 << 4)
 #define G33_GMCH_GMS_STOLEN_128M       (0x8 << 4)
 #define G33_GMCH_GMS_STOLEN_256M       (0x9 << 4)
+#define I915_IFPADDR    0x60
 
 /* Intel 965G registers */
 #define I965_MSAC 0x62
+#define I965_IFPADDR    0x70
 
 /* Intel 7505 registers */
 #define INTEL_I7505_APSIZE	0x74
@@ -115,6 +117,8 @@ static struct _intel_private {
 	 * popup and for the GTT.
 	 */
 	int gtt_entries;			/* i830+ */
+	void __iomem *flush_page;
+	struct resource ifp_resource;
 } intel_private;
 
 static int intel_i810_fetch_size(void)
@@ -768,6 +772,73 @@ static struct agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type)
 	return NULL;
 }
 
+static int intel_alloc_chipset_flush_resource(void)
+{
+	int ret;
+	ret = pci_bus_alloc_resource(agp_bridge->dev->bus, &intel_private.ifp_resource, PAGE_SIZE,
+				     PAGE_SIZE, PCIBIOS_MIN_MEM, 0,
+				     pcibios_align_resource, agp_bridge->dev);
+	if (ret != 0)
+		return ret;
+
+	printk("intel priv bus start %08lx\n", intel_private.ifp_resource.start);
+	return 0;
+}
+
+static void intel_i915_setup_chipset_flush(void)
+{
+	int ret;
+	u32 temp;
+
+	pci_read_config_dword(agp_bridge->dev, I915_IFPADDR, &temp);
+	if (!(temp & 0x1)) {
+		intel_alloc_chipset_flush_resource();
+
+		pci_write_config_dword(agp_bridge->dev, I915_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
+	} else {
+		temp &= ~1;
+
+		intel_private.ifp_resource.start = temp;
+		intel_private.ifp_resource.end = temp + PAGE_SIZE;
+		ret = request_resource(&iomem_resource, &intel_private.ifp_resource);
+		if (ret) {
+			intel_private.ifp_resource.start = 0;
+			printk("Failed inserting resource into tree\n");
+		}
+	}
+}
+
+static void intel_i965_g33_setup_chipset_flush(void)
+{
+	u32 temp_hi, temp_lo;
+	int ret;
+
+	pci_read_config_dword(agp_bridge->dev, I965_IFPADDR + 4, &temp_hi);
+	pci_read_config_dword(agp_bridge->dev, I965_IFPADDR, &temp_lo);
+
+	if (!(temp_lo & 0x1)) {
+
+		intel_alloc_chipset_flush_resource();
+
+		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR + 4, (intel_private.ifp_resource.start >> 32));
+		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
+		intel_private.flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
+	} else {
+		u64 l64;
+		
+		temp_lo &= ~0x1;
+		l64 = ((u64)temp_hi << 32) | temp_lo;
+
+		intel_private.ifp_resource.start = l64;
+		intel_private.ifp_resource.end = l64 + PAGE_SIZE;
+		ret = request_resource(&iomem_resource, &intel_private.ifp_resource);
+		if (!ret) {
+			intel_private.ifp_resource.start = 0;
+			printk("Failed inserting resource into tree\n");
+		}
+	}
+}
+
 static int intel_i915_configure(void)
 {
 	struct aper_size_info_fixed *current_size;
@@ -796,15 +867,43 @@ static int intel_i915_configure(void)
 	}
 
 	global_cache_flush();
+
+	/* setup a resource for this object */
+	memset(&intel_private.ifp_resource, 0, sizeof(intel_private.ifp_resource));
+
+	intel_private.ifp_resource.name = "Intel Flush Page";
+	intel_private.ifp_resource.flags = IORESOURCE_MEM;
+
+	/* Setup chipset flush for 915 */
+	if (IS_I965 || IS_G33) {
+		intel_i965_g33_setup_chipset_flush();
+	} else {
+		intel_i915_setup_chipset_flush();
+	}
+
+	if (intel_private.ifp_resource.start) {
+		intel_private.flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
+		if (!intel_private.flush_page)
+			printk("unable to ioremap flush  page - no chipset flushing");
+	}
+	
 	return 0;
 }
 
 static void intel_i915_cleanup(void)
 {
+	if (intel_private.flush_page)
+		iounmap(intel_private.flush_page);
 	iounmap(intel_private.gtt);
 	iounmap(intel_private.registers);
 }
 
+static void intel_i915_chipset_flush(struct agp_bridge_data *bridge)
+{
+	if (intel_private.flush_page)
+		writel(1, intel_private.flush_page);
+}
+
 static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
 				int type)
 {
@@ -1721,6 +1820,7 @@ static const struct agp_bridge_driver intel_915_driver = {
 	.agp_alloc_page		= agp_generic_alloc_page,
 	.agp_destroy_page	= agp_generic_destroy_page,
 	.agp_type_to_mask_type  = intel_i830_type_to_mask_type,
+	.chipset_flush		= intel_i915_chipset_flush,
 };
 
 static const struct agp_bridge_driver intel_i965_driver = {
@@ -1746,6 +1846,7 @@ static const struct agp_bridge_driver intel_i965_driver = {
        .agp_alloc_page         = agp_generic_alloc_page,
        .agp_destroy_page       = agp_generic_destroy_page,
        .agp_type_to_mask_type  = intel_i830_type_to_mask_type,
+	.chipset_flush		= intel_i915_chipset_flush,
 };
 
 static const struct agp_bridge_driver intel_7505_driver = {
@@ -1795,6 +1896,7 @@ static const struct agp_bridge_driver intel_g33_driver = {
 	.agp_alloc_page         = agp_generic_alloc_page,
 	.agp_destroy_page       = agp_generic_destroy_page,
 	.agp_type_to_mask_type  = intel_i830_type_to_mask_type,
+	.chipset_flush		= intel_i915_chipset_flush,
 };
 
 static int find_gmch(u16 device)
-- 
cgit v1.1


From 2162e6a2b0cd5acbb9bd8a3c94e1c1269b078295 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 21 Nov 2007 16:36:31 +1000
Subject: agp/intel: Add chipset flushing support for i8xx chipsets.

This is a bit of a large hammer but it makes sure the chipset is flushed
by writing out 1k of data to an uncached page. We may be able to get better
information in the future on how to this better.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/char/agp/intel-agp.c | 108 +++++++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 30 deletions(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 4d062fc..ce75fa3 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -117,7 +117,11 @@ static struct _intel_private {
 	 * popup and for the GTT.
 	 */
 	int gtt_entries;			/* i830+ */
-	void __iomem *flush_page;
+	union {
+		void __iomem *i9xx_flush_page;
+		void *i8xx_flush_page;
+	};
+	struct page *i8xx_page;
 	struct resource ifp_resource;
 } intel_private;
 
@@ -579,6 +583,44 @@ static void intel_i830_init_gtt_entries(void)
 	intel_private.gtt_entries = gtt_entries;
 }
 
+static void intel_i830_fini_flush(void)
+{
+	kunmap(intel_private.i8xx_page);
+	intel_private.i8xx_flush_page = NULL;
+	unmap_page_from_agp(intel_private.i8xx_page);
+	flush_agp_mappings();
+
+	__free_page(intel_private.i8xx_page);
+}
+
+static void intel_i830_setup_flush(void)
+{
+
+	intel_private.i8xx_page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA32);
+	if (!intel_private.i8xx_page) {
+		return;
+	}
+
+	/* make page uncached */
+	map_page_into_agp(intel_private.i8xx_page);
+	flush_agp_mappings();
+
+	intel_private.i8xx_flush_page = kmap(intel_private.i8xx_page);
+	if (!intel_private.i8xx_flush_page)
+		intel_i830_fini_flush();
+}
+
+static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)
+{
+	unsigned int *pg = intel_private.i8xx_flush_page;
+	int i;
+
+	for (i = 0; i < 256; i+=2)
+		*(pg + i) = i;
+	
+	wmb();
+}
+
 /* The intel i830 automatically initializes the agp aperture during POST.
  * Use the memory already set aside for in the GTT.
  */
@@ -679,6 +721,8 @@ static int intel_i830_configure(void)
 	}
 
 	global_cache_flush();
+
+	intel_i830_setup_flush();
 	return 0;
 }
 
@@ -778,11 +822,8 @@ static int intel_alloc_chipset_flush_resource(void)
 	ret = pci_bus_alloc_resource(agp_bridge->dev->bus, &intel_private.ifp_resource, PAGE_SIZE,
 				     PAGE_SIZE, PCIBIOS_MIN_MEM, 0,
 				     pcibios_align_resource, agp_bridge->dev);
-	if (ret != 0)
-		return ret;
 
-	printk("intel priv bus start %08lx\n", intel_private.ifp_resource.start);
-	return 0;
+	return ret;
 }
 
 static void intel_i915_setup_chipset_flush(void)
@@ -822,7 +863,6 @@ static void intel_i965_g33_setup_chipset_flush(void)
 
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR + 4, (intel_private.ifp_resource.start >> 32));
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
-		intel_private.flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
 	} else {
 		u64 l64;
 		
@@ -833,12 +873,33 @@ static void intel_i965_g33_setup_chipset_flush(void)
 		intel_private.ifp_resource.end = l64 + PAGE_SIZE;
 		ret = request_resource(&iomem_resource, &intel_private.ifp_resource);
 		if (!ret) {
-			intel_private.ifp_resource.start = 0;
-			printk("Failed inserting resource into tree\n");
+			printk("Failed inserting resource into tree - continuing\n");
 		}
 	}
 }
 
+static void intel_i9xx_setup_flush(void)
+{
+	/* setup a resource for this object */
+	memset(&intel_private.ifp_resource, 0, sizeof(intel_private.ifp_resource));
+
+	intel_private.ifp_resource.name = "Intel Flush Page";
+	intel_private.ifp_resource.flags = IORESOURCE_MEM;
+
+	/* Setup chipset flush for 915 */
+	if (IS_I965 || IS_G33) {
+		intel_i965_g33_setup_chipset_flush();
+	} else {
+		intel_i915_setup_chipset_flush();
+	}
+
+	if (intel_private.ifp_resource.start) {
+		intel_private.i9xx_flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
+		if (!intel_private.i9xx_flush_page)
+			printk("unable to ioremap flush  page - no chipset flushing");
+	}
+}
+
 static int intel_i915_configure(void)
 {
 	struct aper_size_info_fixed *current_size;
@@ -868,40 +929,23 @@ static int intel_i915_configure(void)
 
 	global_cache_flush();
 
-	/* setup a resource for this object */
-	memset(&intel_private.ifp_resource, 0, sizeof(intel_private.ifp_resource));
-
-	intel_private.ifp_resource.name = "Intel Flush Page";
-	intel_private.ifp_resource.flags = IORESOURCE_MEM;
-
-	/* Setup chipset flush for 915 */
-	if (IS_I965 || IS_G33) {
-		intel_i965_g33_setup_chipset_flush();
-	} else {
-		intel_i915_setup_chipset_flush();
-	}
-
-	if (intel_private.ifp_resource.start) {
-		intel_private.flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
-		if (!intel_private.flush_page)
-			printk("unable to ioremap flush  page - no chipset flushing");
-	}
+	intel_i9xx_setup_flush();
 	
 	return 0;
 }
 
 static void intel_i915_cleanup(void)
 {
-	if (intel_private.flush_page)
-		iounmap(intel_private.flush_page);
+	if (intel_private.i9xx_flush_page)
+		iounmap(intel_private.i9xx_flush_page);
 	iounmap(intel_private.gtt);
 	iounmap(intel_private.registers);
 }
 
 static void intel_i915_chipset_flush(struct agp_bridge_data *bridge)
 {
-	if (intel_private.flush_page)
-		writel(1, intel_private.flush_page);
+	if (intel_private.i9xx_flush_page)
+		writel(1, intel_private.i9xx_flush_page);
 }
 
 static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
@@ -1395,6 +1439,8 @@ static int intel_845_configure(void)
 	pci_write_config_byte(agp_bridge->dev, INTEL_I845_AGPM, temp2 | (1 << 1));
 	/* clear any possible error conditions */
 	pci_write_config_word(agp_bridge->dev, INTEL_I845_ERRSTS, 0x001c);
+
+	intel_i830_setup_flush();
 	return 0;
 }
 
@@ -1651,6 +1697,7 @@ static const struct agp_bridge_driver intel_830_driver = {
 	.agp_alloc_page		= agp_generic_alloc_page,
 	.agp_destroy_page	= agp_generic_destroy_page,
 	.agp_type_to_mask_type  = intel_i830_type_to_mask_type,
+	.chipset_flush		= intel_i830_chipset_flush,
 };
 
 static const struct agp_bridge_driver intel_820_driver = {
@@ -1747,6 +1794,7 @@ static const struct agp_bridge_driver intel_845_driver = {
 	.agp_alloc_page		= agp_generic_alloc_page,
 	.agp_destroy_page	= agp_generic_destroy_page,
 	.agp_type_to_mask_type  = agp_generic_type_to_mask_type,
+	.chipset_flush		= intel_i830_chipset_flush,
 };
 
 static const struct agp_bridge_driver intel_850_driver = {
-- 
cgit v1.1


From 1fa4db7d308da04f6644c5cb8eed244c200d4ed5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 29 Nov 2007 10:00:48 +1000
Subject: fix AGP warning

drivers/char/agp/intel-agp.c: In function 'intel_i965_g33_setup_chipset_flush':
drivers/char/agp/intel-agp.c:872: warning: right shift count >= width of type

I wish the agp code wasn't written in a 10,000-column xterm :(

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/intel-agp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index ce75fa3..8e61a05 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -861,7 +861,8 @@ static void intel_i965_g33_setup_chipset_flush(void)
 
 		intel_alloc_chipset_flush_resource();
 
-		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR + 4, (intel_private.ifp_resource.start >> 32));
+		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR + 4,
+			upper_32_bits(intel_private.ifp_resource.start));
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
 	} else {
 		u64 l64;
-- 
cgit v1.1


From 62f29babbc60ab572d3cecda981931d3a66123d6 Mon Sep 17 00:00:00 2001
From: "serue@us.ibm.com" <serue@us.ibm.com>
Date: Wed, 5 Dec 2007 13:55:36 -0800
Subject: agp: remove uid comparison as security check

In the face of containers and user namespaces, a uid==0 check for
security is not safe.  Switch to a capability check.

I'm not sure I picked the right capability, but this being AGP
CAP_SYS_RAWIO seemed to make sense.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/frontend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c
index 9bd5a95..55d7a82 100644
--- a/drivers/char/agp/frontend.c
+++ b/drivers/char/agp/frontend.c
@@ -689,7 +689,7 @@ static int agp_open(struct inode *inode, struct file *file)
 	set_bit(AGP_FF_ALLOW_CLIENT, &priv->access_flags);
 	priv->my_pid = current->pid;
 
-	if ((current->uid == 0) || (current->suid == 0)) {
+	if (capable(CAP_SYS_RAWIO)) {
 		/* Root priv, can be controller */
 		set_bit(AGP_FF_ALLOW_CONTROLLER, &priv->access_flags);
 	}
-- 
cgit v1.1


From 91d361c279b66ce4d617d544641d5f70b27c401a Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 5 Dec 2007 13:55:36 -0800
Subject: agp: remove unnecessary pci_dev_put

pci_get_class implicitly does a pci_dev_put on its second argument, so
pci_dev_put is only needed if there is a break out of the loop.

The semantic match detecting this problem is as follows:

// <smpl>
@@
expression dev;
expression E;
@@

* pci_dev_put(dev)
  ... when != dev = E
(
* pci_get_device(...,dev)
|
* pci_get_device_reverse(...,dev)
|
* pci_get_subsys(...,dev)
|
* pci_get_class(...,dev)
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/amd-k7-agp.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c
index 1405a42..87be464 100644
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -436,10 +436,6 @@ static int __devinit agp_amdk7_probe(struct pci_dev *pdev,
 				return -ENODEV;
 			}
 			cap_ptr = pci_find_capability(gfxcard, PCI_CAP_ID_AGP);
-			if (!cap_ptr) {
-				pci_dev_put(gfxcard);
-				continue;
-			}
 		}
 
 		/* With so many variants of NVidia cards, it's simpler just
-- 
cgit v1.1


From 4e8b6e25943a22036a6b704ebef634c7dec4c10e Mon Sep 17 00:00:00 2001
From: Zhenyu Wang <zhenyu.z.wang@intel.com>
Date: Wed, 23 Jan 2008 14:54:37 +1000
Subject: intel-agp: add new chipset ID

This one adds new pci ids for Intel intergrated graphics chipset, with gtt
table access change on it and new gtt table size definition.

Signed-off-by: Zhenyu Wang <zhenyu.z.wang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/agp.h       |  3 +++
 drivers/char/agp/intel-agp.c | 31 ++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index 9ec9374..c69f795 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -236,6 +236,9 @@ struct agp_bridge_data {
 #define I965_PGETBL_SIZE_512KB	(0 << 1)
 #define I965_PGETBL_SIZE_256KB	(1 << 1)
 #define I965_PGETBL_SIZE_128KB	(2 << 1)
+#define I965_PGETBL_SIZE_1MB	(3 << 1)
+#define I965_PGETBL_SIZE_2MB	(4 << 1)
+#define I965_PGETBL_SIZE_1_5MB	(5 << 1)
 #define G33_PGETBL_SIZE_MASK    (3 << 8)
 #define G33_PGETBL_SIZE_1M      (1 << 8)
 #define G33_PGETBL_SIZE_2M      (2 << 8)
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 8e61a05..6fa97ae 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -32,13 +32,16 @@
 #define PCI_DEVICE_ID_INTEL_Q35_IG          0x29B2
 #define PCI_DEVICE_ID_INTEL_Q33_HB          0x29D0
 #define PCI_DEVICE_ID_INTEL_Q33_IG          0x29D2
+#define PCI_DEVICE_ID_INTEL_IGD_HB          0x2A40
+#define PCI_DEVICE_ID_INTEL_IGD_IG          0x2A42
 
 #define IS_I965 (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82946GZ_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_1_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965Q_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GM_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GME_HB)
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GME_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGD_HB)
 
 #define IS_G33 (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G33_HB || \
 		agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_Q35_HB || \
@@ -461,6 +464,15 @@ static void intel_i830_init_gtt_entries(void)
 		case I965_PGETBL_SIZE_512KB:
 			size = 512;
 			break;
+		case I965_PGETBL_SIZE_1MB:
+			size = 1024;
+			break;
+		case I965_PGETBL_SIZE_2MB:
+			size = 2048;
+			break;
+		case I965_PGETBL_SIZE_1_5MB:
+			size = 1024 + 512;
+			break;
 		default:
 			printk(KERN_INFO PFX "Unknown page table size, "
 			       "assuming 512KB\n");
@@ -1124,6 +1136,7 @@ static int intel_i965_create_gatt_table(struct agp_bridge_data *bridge)
        struct aper_size_info_fixed *size;
        int num_entries;
        u32 temp;
+       int gtt_offset, gtt_size;
 
        size = agp_bridge->current_size;
        page_order = size->page_order;
@@ -1133,13 +1146,18 @@ static int intel_i965_create_gatt_table(struct agp_bridge_data *bridge)
        pci_read_config_dword(intel_private.pcidev, I915_MMADDR, &temp);
 
        temp &= 0xfff00000;
-       intel_private.gtt = ioremap((temp + (512 * 1024)) , 512 * 1024);
 
-	if (!intel_private.gtt)
-		return -ENOMEM;
+       if (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGD_HB)
+	       gtt_offset = gtt_size = MB(2);
+       else
+	       gtt_offset = gtt_size = KB(512);
+
+       intel_private.gtt = ioremap((temp + gtt_offset) , gtt_size);
 
+       if (!intel_private.gtt)
+	       return -ENOMEM;
 
-       intel_private.registers = ioremap(temp,128 * 4096);
+       intel_private.registers = ioremap(temp, 128 * 4096);
        if (!intel_private.registers) {
 		iounmap(intel_private.gtt);
 		return -ENOMEM;
@@ -2036,6 +2054,8 @@ static const struct intel_driver_description {
 		NULL, &intel_g33_driver },
 	{ PCI_DEVICE_ID_INTEL_Q33_HB, PCI_DEVICE_ID_INTEL_Q33_IG, 0, "Q33",
 		NULL, &intel_g33_driver },
+	{ PCI_DEVICE_ID_INTEL_IGD_HB, PCI_DEVICE_ID_INTEL_IGD_IG, 0,
+	    "Intel Integrated Graphics Device", NULL, &intel_i965_driver },
 	{ 0, 0, 0, NULL, NULL, NULL }
 };
 
@@ -2226,6 +2246,7 @@ static struct pci_device_id agp_intel_pci_table[] = {
 	ID(PCI_DEVICE_ID_INTEL_G33_HB),
 	ID(PCI_DEVICE_ID_INTEL_Q35_HB),
 	ID(PCI_DEVICE_ID_INTEL_Q33_HB),
+	ID(PCI_DEVICE_ID_INTEL_IGD_HB),
 	{ }
 };
 
-- 
cgit v1.1


From 4d64dd9e5d96cdcfa8dee91c7848341718c77444 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@linux.ie>
Date: Wed, 23 Jan 2008 15:34:29 +1000
Subject: intel-agp: fixup resource handling in flush code.

The flush code resource handling was having problems where some BIOS
reserve the resource in a pnp block and some don't.

Also there was a bug in that configure was being called at resume
and resetting some of the structs.

Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/intel-agp.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 6fa97ae..af7ff56 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -126,6 +126,7 @@ static struct _intel_private {
 	};
 	struct page *i8xx_page;
 	struct resource ifp_resource;
+	int resource_valid;
 } intel_private;
 
 static int intel_i810_fetch_size(void)
@@ -603,10 +604,14 @@ static void intel_i830_fini_flush(void)
 	flush_agp_mappings();
 
 	__free_page(intel_private.i8xx_page);
+	intel_private.i8xx_page = NULL;
 }
 
 static void intel_i830_setup_flush(void)
 {
+	/* return if we've already set the flush mechanism up */
+	if (intel_private.i8xx_page)
+		return;
 
 	intel_private.i8xx_page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA32);
 	if (!intel_private.i8xx_page) {
@@ -846,18 +851,18 @@ static void intel_i915_setup_chipset_flush(void)
 	pci_read_config_dword(agp_bridge->dev, I915_IFPADDR, &temp);
 	if (!(temp & 0x1)) {
 		intel_alloc_chipset_flush_resource();
-
+		intel_private.resource_valid = 1;
 		pci_write_config_dword(agp_bridge->dev, I915_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
 	} else {
 		temp &= ~1;
 
+		intel_private.resource_valid = 1;
 		intel_private.ifp_resource.start = temp;
 		intel_private.ifp_resource.end = temp + PAGE_SIZE;
 		ret = request_resource(&iomem_resource, &intel_private.ifp_resource);
-		if (ret) {
-			intel_private.ifp_resource.start = 0;
-			printk("Failed inserting resource into tree\n");
-		}
+		/* some BIOSes reserve this area in a pnp some don't */
+		if (ret)
+			intel_private.resource_valid = 0;
 	}
 }
 
@@ -873,6 +878,7 @@ static void intel_i965_g33_setup_chipset_flush(void)
 
 		intel_alloc_chipset_flush_resource();
 
+		intel_private.resource_valid = 1;
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR + 4,
 			upper_32_bits(intel_private.ifp_resource.start));
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
@@ -882,20 +888,23 @@ static void intel_i965_g33_setup_chipset_flush(void)
 		temp_lo &= ~0x1;
 		l64 = ((u64)temp_hi << 32) | temp_lo;
 
+		intel_private.resource_valid = 1;
 		intel_private.ifp_resource.start = l64;
 		intel_private.ifp_resource.end = l64 + PAGE_SIZE;
 		ret = request_resource(&iomem_resource, &intel_private.ifp_resource);
-		if (!ret) {
-			printk("Failed inserting resource into tree - continuing\n");
-		}
+		/* some BIOSes reserve this area in a pnp some don't */
+		if (ret)
+			intel_private.resource_valid = 0;
 	}
 }
 
 static void intel_i9xx_setup_flush(void)
 {
-	/* setup a resource for this object */
-	memset(&intel_private.ifp_resource, 0, sizeof(intel_private.ifp_resource));
+	/* return if already configured */
+	if (intel_private.ifp_resource.start)
+		return;
 
+	/* setup a resource for this object */
 	intel_private.ifp_resource.name = "Intel Flush Page";
 	intel_private.ifp_resource.flags = IORESOURCE_MEM;
 
@@ -951,6 +960,10 @@ static void intel_i915_cleanup(void)
 {
 	if (intel_private.i9xx_flush_page)
 		iounmap(intel_private.i9xx_flush_page);
+	if (intel_private.resource_valid)
+		release_resource(&intel_private.ifp_resource);
+	intel_private.ifp_resource.start = 0;
+	intel_private.resource_valid = 0;
 	iounmap(intel_private.gtt);
 	iounmap(intel_private.registers);
 }
-- 
cgit v1.1


From 9119f85a0cdbac0397b39fa198866bf530cfab8b Mon Sep 17 00:00:00 2001
From: Zhenyu Wang <zhenyu.z.wang@intel.com>
Date: Wed, 23 Jan 2008 15:49:26 +1000
Subject: [intel_agp] fix name for G35 chipset

Change origin chipset name i965G_1 to market name G35.

Signed-off-by: Zhenyu Wang <zhenyu.z.wang@intel.com>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/intel-agp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index af7ff56..07141c9 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -14,8 +14,8 @@
 #define PCI_DEVICE_ID_INTEL_E7221_IG	0x258a
 #define PCI_DEVICE_ID_INTEL_82946GZ_HB      0x2970
 #define PCI_DEVICE_ID_INTEL_82946GZ_IG      0x2972
-#define PCI_DEVICE_ID_INTEL_82965G_1_HB     0x2980
-#define PCI_DEVICE_ID_INTEL_82965G_1_IG     0x2982
+#define PCI_DEVICE_ID_INTEL_82G35_HB     0x2980
+#define PCI_DEVICE_ID_INTEL_82G35_IG     0x2982
 #define PCI_DEVICE_ID_INTEL_82965Q_HB       0x2990
 #define PCI_DEVICE_ID_INTEL_82965Q_IG       0x2992
 #define PCI_DEVICE_ID_INTEL_82965G_HB       0x29A0
@@ -36,7 +36,7 @@
 #define PCI_DEVICE_ID_INTEL_IGD_IG          0x2A42
 
 #define IS_I965 (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82946GZ_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_1_HB || \
+                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82G35_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965Q_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB || \
                  agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GM_HB || \
@@ -2049,7 +2049,7 @@ static const struct intel_driver_description {
 		NULL, &intel_915_driver },
 	{ PCI_DEVICE_ID_INTEL_82946GZ_HB, PCI_DEVICE_ID_INTEL_82946GZ_IG, 0, "946GZ",
 		NULL, &intel_i965_driver },
-	{ PCI_DEVICE_ID_INTEL_82965G_1_HB, PCI_DEVICE_ID_INTEL_82965G_1_IG, 0, "965G",
+	{ PCI_DEVICE_ID_INTEL_82G35_HB, PCI_DEVICE_ID_INTEL_82G35_IG, 0, "G35",
 		NULL, &intel_i965_driver },
 	{ PCI_DEVICE_ID_INTEL_82965Q_HB, PCI_DEVICE_ID_INTEL_82965Q_IG, 0, "965Q",
 		NULL, &intel_i965_driver },
@@ -2251,7 +2251,7 @@ static struct pci_device_id agp_intel_pci_table[] = {
 	ID(PCI_DEVICE_ID_INTEL_82945GM_HB),
 	ID(PCI_DEVICE_ID_INTEL_82945GME_HB),
 	ID(PCI_DEVICE_ID_INTEL_82946GZ_HB),
-	ID(PCI_DEVICE_ID_INTEL_82965G_1_HB),
+	ID(PCI_DEVICE_ID_INTEL_82G35_HB),
 	ID(PCI_DEVICE_ID_INTEL_82965Q_HB),
 	ID(PCI_DEVICE_ID_INTEL_82965G_HB),
 	ID(PCI_DEVICE_ID_INTEL_82965GM_HB),
-- 
cgit v1.1


From f011ae7437761dc071b4154cabb0041df041a7c0 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 25 Jan 2008 11:23:04 +1000
Subject: intel-agp: introduce IS_I915 and do some cleanups..

Add a new IS_I915 and also do some checkpatch whitespace cleanups.

Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/intel-agp.c | 118 +++++++++++++++++++++----------------------
 1 file changed, 57 insertions(+), 61 deletions(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 07141c9..fe6fc38 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -35,11 +35,19 @@
 #define PCI_DEVICE_ID_INTEL_IGD_HB          0x2A40
 #define PCI_DEVICE_ID_INTEL_IGD_IG          0x2A42
 
+/* cover 915 and 945 variants */
+#define IS_I915 (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_E7221_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915G_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915GM_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945G_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GM_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GME_HB)
+
 #define IS_I965 (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82946GZ_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82G35_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965Q_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB || \
-                 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GM_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82G35_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965Q_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB || \
+		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GM_HB || \
 		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965GME_HB || \
 		 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IGD_HB)
 
@@ -216,7 +224,7 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
 /* Exists to support ARGB cursors */
 static void *i8xx_alloc_pages(void)
 {
-	struct page * page;
+	struct page *page;
 
 	page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
 	if (page == NULL)
@@ -445,7 +453,7 @@ static void intel_i830_init_gtt_entries(void)
 	static const int ddt[4] = { 0, 16, 32, 64 };
 	int size; /* reserved space (in kb) at the top of stolen memory */
 
-	pci_read_config_word(agp_bridge->dev,I830_GMCH_CTRL,&gmch_ctrl);
+	pci_read_config_word(agp_bridge->dev, I830_GMCH_CTRL, &gmch_ctrl);
 
 	if (IS_I965) {
 		u32 pgetbl_ctl;
@@ -544,26 +552,14 @@ static void intel_i830_init_gtt_entries(void)
 			break;
 		case I915_GMCH_GMS_STOLEN_48M:
 			/* Check it's really I915G */
-			if (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_E7221_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915G_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915GM_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945G_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GM_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GME_HB ||
-			    IS_I965 || IS_G33)
+			if (IS_I915 || IS_I965 || IS_G33)
 				gtt_entries = MB(48) - KB(size);
 			else
 				gtt_entries = 0;
 			break;
 		case I915_GMCH_GMS_STOLEN_64M:
 			/* Check it's really I915G */
-			if (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_E7221_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915G_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82915GM_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945G_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GM_HB ||
-			    agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82945GME_HB ||
-			    IS_I965 || IS_G33)
+			if (IS_I915 || IS_I965 || IS_G33)
 				gtt_entries = MB(64) - KB(size);
 			else
 				gtt_entries = 0;
@@ -614,9 +610,8 @@ static void intel_i830_setup_flush(void)
 		return;
 
 	intel_private.i8xx_page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA32);
-	if (!intel_private.i8xx_page) {
+	if (!intel_private.i8xx_page)
 		return;
-	}
 
 	/* make page uncached */
 	map_page_into_agp(intel_private.i8xx_page);
@@ -632,9 +627,9 @@ static void intel_i830_chipset_flush(struct agp_bridge_data *bridge)
 	unsigned int *pg = intel_private.i8xx_flush_page;
 	int i;
 
-	for (i = 0; i < 256; i+=2)
+	for (i = 0; i < 256; i += 2)
 		*(pg + i) = i;
-	
+
 	wmb();
 }
 
@@ -653,10 +648,10 @@ static int intel_i830_create_gatt_table(struct agp_bridge_data *bridge)
 	num_entries = size->num_entries;
 	agp_bridge->gatt_table_real = NULL;
 
-	pci_read_config_dword(intel_private.pcidev,I810_MMADDR,&temp);
+	pci_read_config_dword(intel_private.pcidev, I810_MMADDR, &temp);
 	temp &= 0xfff80000;
 
-	intel_private.registers = ioremap(temp,128 * 4096);
+	intel_private.registers = ioremap(temp, 128 * 4096);
 	if (!intel_private.registers)
 		return -ENOMEM;
 
@@ -696,7 +691,7 @@ static int intel_i830_fetch_size(void)
 		return values[0].size;
 	}
 
-	pci_read_config_word(agp_bridge->dev,I830_GMCH_CTRL,&gmch_ctrl);
+	pci_read_config_word(agp_bridge->dev, I830_GMCH_CTRL, &gmch_ctrl);
 
 	if ((gmch_ctrl & I830_GMCH_MEM_MASK) == I830_GMCH_MEM_128M) {
 		agp_bridge->previous_size = agp_bridge->current_size = (void *) values;
@@ -720,12 +715,12 @@ static int intel_i830_configure(void)
 
 	current_size = A_SIZE_FIX(agp_bridge->current_size);
 
-	pci_read_config_dword(intel_private.pcidev,I810_GMADDR,&temp);
+	pci_read_config_dword(intel_private.pcidev, I810_GMADDR, &temp);
 	agp_bridge->gart_bus_addr = (temp & PCI_BASE_ADDRESS_MEM_MASK);
 
-	pci_read_config_word(agp_bridge->dev,I830_GMCH_CTRL,&gmch_ctrl);
+	pci_read_config_word(agp_bridge->dev, I830_GMCH_CTRL, &gmch_ctrl);
 	gmch_ctrl |= I830_GMCH_ENABLED;
-	pci_write_config_word(agp_bridge->dev,I830_GMCH_CTRL,gmch_ctrl);
+	pci_write_config_word(agp_bridge->dev, I830_GMCH_CTRL, gmch_ctrl);
 
 	writel(agp_bridge->gatt_bus_addr|I810_PGETBL_ENABLED, intel_private.registers+I810_PGETBL_CTL);
 	readl(intel_private.registers+I810_PGETBL_CTL);	/* PCI Posting. */
@@ -748,9 +743,10 @@ static void intel_i830_cleanup(void)
 	iounmap(intel_private.registers);
 }
 
-static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int type)
+static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start,
+				     int type)
 {
-	int i,j,num_entries;
+	int i, j, num_entries;
 	void *temp;
 	int ret = -EINVAL;
 	int mask_type;
@@ -762,10 +758,10 @@ static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int
 	num_entries = A_SIZE_FIX(temp)->num_entries;
 
 	if (pg_start < intel_private.gtt_entries) {
-		printk (KERN_DEBUG PFX "pg_start == 0x%.8lx,intel_private.gtt_entries == 0x%.8x\n",
-				pg_start,intel_private.gtt_entries);
+		printk(KERN_DEBUG PFX "pg_start == 0x%.8lx,intel_private.gtt_entries == 0x%.8x\n",
+				pg_start, intel_private.gtt_entries);
 
-		printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n");
+		printk(KERN_INFO PFX "Trying to insert into local/stolen memory\n");
 		goto out_err;
 	}
 
@@ -803,8 +799,8 @@ out_err:
 	return ret;
 }
 
-static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start,
-				int type)
+static int intel_i830_remove_entries(struct agp_memory *mem, off_t pg_start,
+				     int type)
 {
 	int i;
 
@@ -812,7 +808,7 @@ static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start,
 		return 0;
 
 	if (pg_start < intel_private.gtt_entries) {
-		printk (KERN_INFO PFX "Trying to disable local/stolen memory\n");
+		printk(KERN_INFO PFX "Trying to disable local/stolen memory\n");
 		return -EINVAL;
 	}
 
@@ -825,7 +821,7 @@ static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start,
 	return 0;
 }
 
-static struct agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type)
+static struct agp_memory *intel_i830_alloc_by_type(size_t pg_count, int type)
 {
 	if (type == AGP_PHYS_MEMORY)
 		return alloc_agpphysmem_i8xx(pg_count, type);
@@ -884,7 +880,7 @@ static void intel_i965_g33_setup_chipset_flush(void)
 		pci_write_config_dword(agp_bridge->dev, I965_IFPADDR, (intel_private.ifp_resource.start & 0xffffffff) | 0x1);
 	} else {
 		u64 l64;
-		
+
 		temp_lo &= ~0x1;
 		l64 = ((u64)temp_hi << 32) | temp_lo;
 
@@ -918,7 +914,7 @@ static void intel_i9xx_setup_flush(void)
 	if (intel_private.ifp_resource.start) {
 		intel_private.i9xx_flush_page = ioremap_nocache(intel_private.ifp_resource.start, PAGE_SIZE);
 		if (!intel_private.i9xx_flush_page)
-			printk("unable to ioremap flush  page - no chipset flushing");
+			printk(KERN_INFO "unable to ioremap flush  page - no chipset flushing");
 	}
 }
 
@@ -935,9 +931,9 @@ static int intel_i915_configure(void)
 
 	agp_bridge->gart_bus_addr = (temp & PCI_BASE_ADDRESS_MEM_MASK);
 
-	pci_read_config_word(agp_bridge->dev,I830_GMCH_CTRL,&gmch_ctrl);
+	pci_read_config_word(agp_bridge->dev, I830_GMCH_CTRL, &gmch_ctrl);
 	gmch_ctrl |= I830_GMCH_ENABLED;
-	pci_write_config_word(agp_bridge->dev,I830_GMCH_CTRL,gmch_ctrl);
+	pci_write_config_word(agp_bridge->dev, I830_GMCH_CTRL, gmch_ctrl);
 
 	writel(agp_bridge->gatt_bus_addr|I810_PGETBL_ENABLED, intel_private.registers+I810_PGETBL_CTL);
 	readl(intel_private.registers+I810_PGETBL_CTL);	/* PCI Posting. */
@@ -952,7 +948,7 @@ static int intel_i915_configure(void)
 	global_cache_flush();
 
 	intel_i9xx_setup_flush();
-	
+
 	return 0;
 }
 
@@ -974,10 +970,10 @@ static void intel_i915_chipset_flush(struct agp_bridge_data *bridge)
 		writel(1, intel_private.i9xx_flush_page);
 }
 
-static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
-				int type)
+static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start,
+				     int type)
 {
-	int i,j,num_entries;
+	int i, j, num_entries;
 	void *temp;
 	int ret = -EINVAL;
 	int mask_type;
@@ -989,10 +985,10 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
 	num_entries = A_SIZE_FIX(temp)->num_entries;
 
 	if (pg_start < intel_private.gtt_entries) {
-		printk (KERN_DEBUG PFX "pg_start == 0x%.8lx,intel_private.gtt_entries == 0x%.8x\n",
-				pg_start,intel_private.gtt_entries);
+		printk(KERN_DEBUG PFX "pg_start == 0x%.8lx,intel_private.gtt_entries == 0x%.8x\n",
+				pg_start, intel_private.gtt_entries);
 
-		printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n");
+		printk(KERN_INFO PFX "Trying to insert into local/stolen memory\n");
 		goto out_err;
 	}
 
@@ -1030,8 +1026,8 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start,
 	return ret;
 }
 
-static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start,
-				int type)
+static int intel_i915_remove_entries(struct agp_memory *mem, off_t pg_start,
+				     int type)
 {
 	int i;
 
@@ -1039,13 +1035,13 @@ static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start,
 		return 0;
 
 	if (pg_start < intel_private.gtt_entries) {
-		printk (KERN_INFO PFX "Trying to disable local/stolen memory\n");
+		printk(KERN_INFO PFX "Trying to disable local/stolen memory\n");
 		return -EINVAL;
 	}
 
-	for (i = pg_start; i < (mem->page_count + pg_start); i++) {
+	for (i = pg_start; i < (mem->page_count + pg_start); i++)
 		writel(agp_bridge->scratch_page, intel_private.gtt+i);
-	}
+
 	readl(intel_private.gtt+i-1);
 
 	agp_bridge->driver->tlb_flush(mem);
@@ -1092,7 +1088,7 @@ static int intel_i915_create_gatt_table(struct agp_bridge_data *bridge)
 	agp_bridge->gatt_table_real = NULL;
 
 	pci_read_config_dword(intel_private.pcidev, I915_MMADDR, &temp);
-	pci_read_config_dword(intel_private.pcidev, I915_PTEADDR,&temp2);
+	pci_read_config_dword(intel_private.pcidev, I915_PTEADDR, &temp2);
 
 	if (IS_G33)
 	    gtt_map_size = 1024 * 1024; /* 1M on G33 */
@@ -1102,7 +1098,7 @@ static int intel_i915_create_gatt_table(struct agp_bridge_data *bridge)
 
 	temp &= 0xfff80000;
 
-	intel_private.registers = ioremap(temp,128 * 4096);
+	intel_private.registers = ioremap(temp, 128 * 4096);
 	if (!intel_private.registers) {
 		iounmap(intel_private.gtt);
 		return -ENOMEM;
@@ -1329,7 +1325,7 @@ static int intel_815_configure(void)
 	/* the Intel 815 chipset spec. says that bits 29-31 in the
 	* ATTBASE register are reserved -> try not to write them */
 	if (agp_bridge->gatt_bus_addr & INTEL_815_ATTBASE_MASK) {
-		printk (KERN_EMERG PFX "gatt bus addr too high");
+		printk(KERN_EMERG PFX "gatt bus addr too high");
 		return -EINVAL;
 	}
 
@@ -1986,7 +1982,7 @@ static int find_gmch(u16 device)
 	gmch_device = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
 	if (gmch_device && PCI_FUNC(gmch_device->devfn) != 0) {
 		gmch_device = pci_get_device(PCI_VENDOR_ID_INTEL,
-                                device, gmch_device);
+					     device, gmch_device);
 	}
 
 	if (!gmch_device)
@@ -2108,7 +2104,7 @@ static int __devinit agp_intel_probe(struct pci_dev *pdev,
 	if (intel_agp_chipsets[i].name == NULL) {
 		if (cap_ptr)
 			printk(KERN_WARNING PFX "Unsupported Intel chipset"
-                               "(device id: %04x)\n", pdev->device);
+			       "(device id: %04x)\n", pdev->device);
 		agp_put_bridge(bridge);
 		return -ENODEV;
 	}
@@ -2121,7 +2117,7 @@ static int __devinit agp_intel_probe(struct pci_dev *pdev,
 				intel_agp_chipsets[i].gmch_chip_id);
 		agp_put_bridge(bridge);
 		return -ENODEV;
-        }
+	}
 
 	bridge->dev = pdev;
 	bridge->capndx = cap_ptr;
-- 
cgit v1.1


From bc894606e8843808c232319f69c26c18f6eaa662 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 5 Feb 2008 15:05:23 +1000
Subject: agp: remove flush_agp_mappings calls from new flush handling code

Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/char/agp/intel-agp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index fe6fc38..eeea50a 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -597,7 +597,6 @@ static void intel_i830_fini_flush(void)
 	kunmap(intel_private.i8xx_page);
 	intel_private.i8xx_flush_page = NULL;
 	unmap_page_from_agp(intel_private.i8xx_page);
-	flush_agp_mappings();
 
 	__free_page(intel_private.i8xx_page);
 	intel_private.i8xx_page = NULL;
@@ -615,7 +614,6 @@ static void intel_i830_setup_flush(void)
 
 	/* make page uncached */
 	map_page_into_agp(intel_private.i8xx_page);
-	flush_agp_mappings();
 
 	intel_private.i8xx_flush_page = kmap(intel_private.i8xx_page);
 	if (!intel_private.i8xx_flush_page)
-- 
cgit v1.1


From 322c8a3c364ef4d9ead17e86890c19816b0e262f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 5 Feb 2008 02:51:39 -0800
Subject: [IPSEC] xfrm4_beet_input(): fix an if()

A bug every C programmer makes at some point in time...

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/xfrm4_mode_beet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e093a7b..b47030b 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -102,7 +102,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
 
 		XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
 
-		if (!pskb_may_pull(skb, phlen));
+		if (!pskb_may_pull(skb, phlen))
 			goto out;
 		__skb_pull(skb, phlen);
 	}
-- 
cgit v1.1


From cc8274f50f2ad9a97a837451f63a0a3e65f7f490 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 5 Feb 2008 02:54:16 -0800
Subject: [IPV4]: Fix compile error building without CONFIG_FS_PROC

compile error building without CONFIG_FS_PROC:

net/ipv4/fib_frontend.c: In function 'fib_net_init':
net/ipv4/fib_frontend.c:1032: error: implicit declaration of function 'fib_proc_
init'
net/ipv4/fib_frontend.c: In function 'fib_net_exit':
net/ipv4/fib_frontend.c:1047: error: implicit declaration of function 'fib_proc_
exit'

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 90d1175..8b12667 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -266,6 +266,14 @@ static inline void fib_res_put(struct fib_result *res)
 #ifdef CONFIG_PROC_FS
 extern int __net_init  fib_proc_init(struct net *net);
 extern void __net_exit fib_proc_exit(struct net *net);
+#else
+static inline int fib_proc_init(struct net *net)
+{
+	return 0;
+}
+static inline void fib_proc_exit(struct net *net)
+{
+}
 #endif
 
 #endif  /* _NET_FIB_H */
-- 
cgit v1.1


From 0aead543479e7f20013217fe3fc33a604fdd8944 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Tue, 5 Feb 2008 02:56:48 -0800
Subject: [NET_SCHED]: Add #ifdef CONFIG_NET_EMATCH in net/sched/cls_flow.c
 (latest git broken build)

The 2.6 latest git build was broken when using the following
configuration options:
CONFIG_NET_EMATCH=n
CONFIG_NET_CLS_FLOW=y

with the following error:
net/sched/cls_flow.c: In function 'flow_dump':
net/sched/cls_flow.c:598: error: 'struct tcf_ematch_tree' has no
member named 'hdr'
make[2]: *** [net/sched/cls_flow.o] Error 1
make[1]: *** [net/sched] Error 2
make: *** [net] Error 2


see the recent post by Li Zefan:
  http://www.spinics.net/lists/netdev/msg54434.html

The reason for this crash is that struct tcf_ematch_tree
(net/pkt_cls.h) is empty when CONFIG_NET_EMATCH is not defined.

When CONFIG_NET_EMATCH is defined, the tcf_ematch_tree structure
indeed holds a struct tcf_ematch_tree_hdr (hdr) as flow_dump()
expects.

This patch adds #ifdef CONFIG_NET_EMATCH in flow_dump to avoid this.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flow.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5a7f6a3..8d76986 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -594,11 +594,11 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
 
 	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
 		goto nla_put_failure;
-
+#ifdef CONFIG_NET_EMATCH
 	if (f->ematches.hdr.nmatches &&
 	    tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
 		goto nla_put_failure;
-
+#endif
 	nla_nest_end(skb, nest);
 
 	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
-- 
cgit v1.1


From 6de1a9104034a2c58db3abdaf03cddb507225137 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <dlezcano@fr.ibm.com>
Date: Tue, 5 Feb 2008 02:57:59 -0800
Subject: [IPV6]: Fix sysctl compilation error.

Move ipv6_icmp_sysctl_init and ipv6_route_sysctl_init into the right
ifdef section otherwise that does not compile when CONFIG_SYSCTL=yes
and CONFIG_PROC_FS=no

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index fa80ea4..c0c019f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -110,7 +110,6 @@ struct frag_hdr {
 
 /* sysctls */
 extern int sysctl_mld_max_msf;
-
 extern struct ctl_path net_ipv6_ctl_path[];
 
 #define _DEVINC(statname, modifier, idev, field)			\
@@ -586,9 +585,6 @@ extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 			 int __user *optlen);
 
 #ifdef CONFIG_PROC_FS
-extern struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
-extern struct ctl_table *ipv6_route_sysctl_init(struct net *net);
-
 extern int  ac6_proc_init(void);
 extern void ac6_proc_exit(void);
 extern int  raw6_proc_init(void);
@@ -621,6 +617,8 @@ static inline int snmp6_unregister_dev(struct inet6_dev *idev)
 extern ctl_table ipv6_route_table_template[];
 extern ctl_table ipv6_icmp_table_template[];
 
+extern struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
+extern struct ctl_table *ipv6_route_sysctl_init(struct net *net);
 extern int ipv6_sysctl_register(void);
 extern void ipv6_sysctl_unregister(void);
 #endif
-- 
cgit v1.1


From b9c4d82a853713d49ac53b507964d7cf30ee408d Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 5 Feb 2008 02:58:45 -0800
Subject: [IPV4]: Formatting fix for /proc/net/fib_trie.

The line in the /proc/net/fib_trie for route with TOS specified
- has extra \n at the end
- does not have a space after route scope
like below.
           |-- 1.1.1.1
              /32 universe UNICASTtos =1

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 35851c9..f5fba3f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2431,8 +2431,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 					   rtn_type(buf2, sizeof(buf2),
 						    fa->fa_type));
 				if (fa->fa_tos)
-					seq_printf(seq, "tos =%d\n",
-						   fa->fa_tos);
+					seq_printf(seq, " tos=%d", fa->fa_tos);
 				seq_putc(seq, '\n');
 			}
 		}
-- 
cgit v1.1


From 4c6222587118f0776bdf40d498361d49572c58dd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 5 Feb 2008 03:01:43 -0800
Subject: [SPARC64]  pci_sun4v.c: Section fixes.

WARNING: vmlinux.o(.text+0x39be4): Section mismatch in reference from the function probe_existing_entries() to the function .init.text:page_in_phys_avail()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/pci_sun4v.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c
index 1aa8e04..87c5cf5 100644
--- a/arch/sparc64/kernel/pci_sun4v.c
+++ b/arch/sparc64/kernel/pci_sun4v.c
@@ -625,8 +625,8 @@ static void __init pci_sun4v_scan_bus(struct pci_pbm_info *pbm)
 	/* XXX register error interrupt handlers XXX */
 }
 
-static unsigned long probe_existing_entries(struct pci_pbm_info *pbm,
-					    struct iommu *iommu)
+static unsigned long __init probe_existing_entries(struct pci_pbm_info *pbm,
+						   struct iommu *iommu)
 {
 	struct iommu_arena *arena = &iommu->arena;
 	unsigned long i, cnt = 0;
@@ -653,7 +653,7 @@ static unsigned long probe_existing_entries(struct pci_pbm_info *pbm,
 	return cnt;
 }
 
-static void pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
+static void __init pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
 {
 	struct iommu *iommu = pbm->iommu;
 	struct property *prop;
-- 
cgit v1.1


From d2f19fa13ee5e78d4195a771f8f1ff7d42a80740 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 5 Feb 2008 03:02:26 -0800
Subject: [SCTP]: Fix kernel panic while received AUTH chunk while enabled auth

If STCP is started while /proc/sys/net/sctp/auth_enable is set 0 and
association is established between endpoints. Then if
/proc/sys/net/sctp/auth_enable is set 1, a received AUTH chunk will
cause kernel panic.

Test as following:
step 1: echo 0> /proc/sys/net/sctp/auth_enable
step 2:

   SCTP client                  SCTP server
      INIT          --------->
                    <---------   INIT-ACK
      COOKIE-ECHO   --------->
                    <---------   COOKIE-ACK
step 3:
    echo 1> /proc/sys/net/sctp/auth_enable
step 4:
   SCTP client                  SCTP server
       AUTH        ----------->  Kernel Panic


This patch fix this probleam to treat AUTH chunk as unknow chunk if peer
has initialized with no auth capable.

> Sorry for the delay.  Was on vacation without net access.
>
> Wei Yongjun wrote:
>>
>>
>> This patch fix this probleam to treat AUTH chunk as unknow chunk if
>> peer has initialized with no auth capable.
>>
>> Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
>
> Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
>
>>

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_statefuns.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 5df0c4b..f986587 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -3865,6 +3865,10 @@ sctp_disposition_t sctp_sf_eat_auth(const struct sctp_endpoint *ep,
 	struct sctp_chunk *err_chunk;
 	sctp_ierror_t error;
 
+	/* Make sure that the peer has AUTH capable */
+	if (!asoc->peer.auth_capable)
+		return sctp_sf_unk_chunk(ep, asoc, type, arg, commands);
+
 	if (!sctp_vtag_verify(chunk, asoc)) {
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
 				SCTP_NULL());
-- 
cgit v1.1


From 7cc08b55fc476a9474e4dc9da41071b5dc2b406e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 5 Feb 2008 03:03:06 -0800
Subject: [SCTP]: Fix kernel panic while received AUTH chunk with BAD shared
 key identifier

If SCTP-AUTH is enabled, received AUTH chunk with BAD shared key
identifier will cause kernel panic.

Test as following:
step1: enabled /proc/sys/net/sctp/auth_enable
step 2:  connect  to SCTP server with auth capable. Association is
established between endpoints. Then send a AUTH chunk with a bad
shareid, SCTP server will kernel panic after received that AUTH chunk.

SCTP client                   SCTP server
  INIT         ---------->
    (with auth capable)
               <----------    INIT-ACK
                              (with auth capable)
  COOKIE-ECHO  ---------->
               <----------    COOKIE-ACK
  AUTH         ---------->


AUTH chunk is like this:
  AUTH chunk
    Chunk type: AUTH (15)
    Chunk flags: 0x00
    Chunk length: 28
    Shared key identifier: 10
    HMAC identifier: SHA-1 (1)
    HMAC: 0000000000000000000000000000000000000000

The assignment of NULL to key can safely be removed, since key_for_each
(which is just list_for_each_entry under the covers does an initial
assignment to key anyway).

If the endpoint_shared_keys list is empty, or if the key_id being
requested does not exist, the function as it currently stands returns
the actuall list_head (in this case endpoint_shared_keys.  Since that
list_head isn't surrounded by an actuall data structure, the last
iteration through list_for_each_entry will do a container_of on key, and
we wind up returning a bogus pointer, instead of NULL, as we should.

> Neil Horman wrote:
>> On Tue, Jan 22, 2008 at 05:29:20PM +0900, Wei Yongjun wrote:
>>
>> FWIW, Ack from me.  The assignment of NULL to key can safely be
>> removed, since
>> key_for_each (which is just list_for_each_entry under the covers does
>> an initial
>> assignment to key anyway).
>> If the endpoint_shared_keys list is empty, or if the key_id being
>> requested does
>> not exist, the function as it currently stands returns the actuall
>> list_head (in
>> this case endpoint_shared_keys.  Since that list_head isn't
>> surrounded by an
>> actuall data structure, the last iteration through
>> list_for_each_entry will do a
>> container_of on key, and we wind up returning a bogus pointer,
>> instead of NULL,
>> as we should.  Wei's patch corrects that.
>>
>> Regards
>> Neil
>>
>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>
>
> Yep, the patch is correct.
>
> Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
>
> -vlad
>

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 97e6ebd..ae367c8 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -420,15 +420,15 @@ struct sctp_shared_key *sctp_auth_get_shkey(
 				const struct sctp_association *asoc,
 				__u16 key_id)
 {
-	struct sctp_shared_key *key = NULL;
+	struct sctp_shared_key *key;
 
 	/* First search associations set of endpoint pair shared keys */
 	key_for_each(key, &asoc->endpoint_shared_keys) {
 		if (key->key_id == key_id)
-			break;
+			return key;
 	}
 
-	return key;
+	return NULL;
 }
 
 /*
-- 
cgit v1.1


From cd8d627a6b66d9755637b4dad2083864a9bfce9a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 5 Feb 2008 03:04:05 -0800
Subject: hamradio: fix dmascc section mismatch

hw[] is used in both init and exit functions so it cannot be initdata (section
mismatch is when CONFIG_MODULES=n and CONFIG_DMASCC=y).

WARNING: vmlinux.o(.exit.text+0xba7): Section mismatch: reference to .init.data: (between 'dmascc_exit' and 'sixpack_exit_driver')

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Klaus Kudielka <klaus.kudielka@gmx.net>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hamradio/dmascc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index 11b83da..e04bf99 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -262,8 +262,8 @@ static void tm_isr(struct scc_priv *priv);
 
 static int io[MAX_NUM_DEVS] __initdata = { 0, };
 
-/* Beware! hw[] is also used in cleanup_module(). */
-static struct scc_hardware hw[NUM_TYPES] __initdata_or_module = HARDWARE;
+/* Beware! hw[] is also used in dmascc_exit(). */
+static struct scc_hardware hw[NUM_TYPES] = HARDWARE;
 
 
 /* Global variables */
-- 
cgit v1.1


From a26af1e08a3a1e0f88e6f2685ac2313d713a59c9 Mon Sep 17 00:00:00 2001
From: Nathaniel Filardo <nwfilardo@gmail.com>
Date: Tue, 5 Feb 2008 03:05:07 -0800
Subject: tun: impossible to deassert IFF_ONE_QUEUE or IFF_NO_PI

From: "Nathaniel Filardo" <nwfilardo@gmail.com>

Taken from http://bugzilla.kernel.org/show_bug.cgi?id=9806

The TUN/TAP driver only permits one-way transitions of IFF_NO_PI or
IFF_ONE_QUEUE during the lifetime of a tap/tun interface.  Note that
tun_set_iff contains

 541         if (ifr->ifr_flags & IFF_NO_PI)
 542                 tun->flags |= TUN_NO_PI;
 543
 544         if (ifr->ifr_flags & IFF_ONE_QUEUE)
 545                 tun->flags |= TUN_ONE_QUEUE;

This is easily fixed by adding else branches which clear these bits.

Steps to reproduce:

This is easily reproduced by setting an interface persistant using tunctl then
attempting to open it as IFF_TAP or IFF_TUN, without asserting the IFF_NO_PI
flag.  The ioctl() will succeed and the ifr.flags word is not modified, but the
interface remains in IFF_NO_PI mode (as it was set by tunctl).

Acked-by: Maxim Krasnyansky <maxk@qualcomm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 46339f6..038c1ef 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -529,9 +529,13 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr)
 
 	if (ifr->ifr_flags & IFF_NO_PI)
 		tun->flags |= TUN_NO_PI;
+	else
+		tun->flags &= ~TUN_NO_PI;
 
 	if (ifr->ifr_flags & IFF_ONE_QUEUE)
 		tun->flags |= TUN_ONE_QUEUE;
+	else
+		tun->flags &= ~TUN_ONE_QUEUE;
 
 	file->private_data = tun;
 	tun->attached = 1;
-- 
cgit v1.1


From eff001e35a857361f3fb289fea86e97c334a5446 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Tue, 5 Feb 2008 03:07:14 -0800
Subject: bluetooth: hidp_process_hid_control remove unnecessary parameter
 dealing

According to the bluetooth HID spec v1.0 chapter 7.4.2

"This code requests a major state change in a BT-HID device.  A HID_CONTROL
request does not generate a HANDSHAKE response."

"A HID_CONTROL packet with a parameter of VIRTUAL_CABLE_UNPLUG is the only
HID_CONTROL packet a device can send to a host.  A host will ignore all other
packets."

So in the hidp_precess_hid_control function, we just need to deal with the
UNLUG packet.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/hidp/core.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 782a226..b5c40d6 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -369,30 +369,13 @@ static inline void hidp_process_hid_control(struct hidp_session *session, unsign
 {
 	BT_DBG("session %p param 0x%02x", session, param);
 
-	switch (param) {
-	case HIDP_CTRL_NOP:
-		break;
-
-	case HIDP_CTRL_VIRTUAL_CABLE_UNPLUG:
+	if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) {
 		/* Flush the transmit queues */
 		skb_queue_purge(&session->ctrl_transmit);
 		skb_queue_purge(&session->intr_transmit);
 
 		/* Kill session thread */
 		atomic_inc(&session->terminate);
-		break;
-
-	case HIDP_CTRL_HARD_RESET:
-	case HIDP_CTRL_SOFT_RESET:
-	case HIDP_CTRL_SUSPEND:
-	case HIDP_CTRL_EXIT_SUSPEND:
-		/* FIXME: We have to parse these and return no error */
-		break;
-
-	default:
-		__hidp_send_ctrl_message(session,
-			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
-		break;
 	}
 }
 
-- 
cgit v1.1


From 91f5cca3d1b4341624715f6dd01ee09be9af46c4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 5 Feb 2008 03:07:58 -0800
Subject: bluetooth: uninlining

Remove all those inlines which were either a) unneeded or b) increased code
size.

          text    data     bss     dec     hex filename
before:   6997      74       8    7079    1ba7 net/bluetooth/hidp/core.o
after:    6492      74       8    6574    19ae net/bluetooth/hidp/core.o

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/hidp/core.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index b5c40d6..519cdb9 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -135,8 +135,8 @@ static void __hidp_copy_session(struct hidp_session *session, struct hidp_connin
 	}
 }
 
-static inline int hidp_queue_event(struct hidp_session *session, struct input_dev *dev,
-					unsigned int type, unsigned int code, int value)
+static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev,
+				unsigned int type, unsigned int code, int value)
 {
 	unsigned char newleds;
 	struct sk_buff *skb;
@@ -243,7 +243,8 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
 	input_sync(dev);
 }
 
-static inline int hidp_queue_report(struct hidp_session *session, unsigned char *data, int size)
+static int hidp_queue_report(struct hidp_session *session,
+				unsigned char *data, int size)
 {
 	struct sk_buff *skb;
 
@@ -287,7 +288,7 @@ static void hidp_idle_timeout(unsigned long arg)
 	hidp_schedule(session);
 }
 
-static inline void hidp_set_timer(struct hidp_session *session)
+static void hidp_set_timer(struct hidp_session *session)
 {
 	if (session->idle_to > 0)
 		mod_timer(&session->timer, jiffies + HZ * session->idle_to);
@@ -332,7 +333,8 @@ static inline int hidp_send_ctrl_message(struct hidp_session *session,
 	return err;
 }
 
-static inline void hidp_process_handshake(struct hidp_session *session, unsigned char param)
+static void hidp_process_handshake(struct hidp_session *session,
+					unsigned char param)
 {
 	BT_DBG("session %p param 0x%02x", session, param);
 
@@ -365,7 +367,8 @@ static inline void hidp_process_handshake(struct hidp_session *session, unsigned
 	}
 }
 
-static inline void hidp_process_hid_control(struct hidp_session *session, unsigned char param)
+static void hidp_process_hid_control(struct hidp_session *session,
+					unsigned char param)
 {
 	BT_DBG("session %p param 0x%02x", session, param);
 
@@ -379,7 +382,8 @@ static inline void hidp_process_hid_control(struct hidp_session *session, unsign
 	}
 }
 
-static inline void hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param)
+static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
+				unsigned char param)
 {
 	BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
 
@@ -406,7 +410,8 @@ static inline void hidp_process_data(struct hidp_session *session, struct sk_buf
 	}
 }
 
-static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb)
+static void hidp_recv_ctrl_frame(struct hidp_session *session,
+					struct sk_buff *skb)
 {
 	unsigned char hdr, type, param;
 
@@ -440,7 +445,8 @@ static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_
 	kfree_skb(skb);
 }
 
-static inline void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb)
+static void hidp_recv_intr_frame(struct hidp_session *session,
+				struct sk_buff *skb)
 {
 	unsigned char hdr;
 
@@ -608,7 +614,8 @@ static struct device *hidp_get_device(struct hidp_session *session)
 	return conn ? &conn->dev : NULL;
 }
 
-static inline int hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req)
+static int hidp_setup_input(struct hidp_session *session,
+				struct hidp_connadd_req *req)
 {
 	struct input_dev *input = session->input;
 	int i;
@@ -685,7 +692,8 @@ static void hidp_setup_quirks(struct hid_device *hid)
 			hid->quirks = hidp_blacklist[n].quirks;
 }
 
-static inline void hidp_setup_hid(struct hidp_session *session, struct hidp_connadd_req *req)
+static void hidp_setup_hid(struct hidp_session *session,
+				struct hidp_connadd_req *req)
 {
 	struct hid_device *hid = session->hid;
 	struct hid_report *report;
-- 
cgit v1.1


From cb7cd42930d4421780e78323f62243350ea14789 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 5 Feb 2008 03:08:45 -0800
Subject: drivers/bluetooth/bpa10x.c: fix memleak

This patch fixea a memleak spotted by the Coverity checker.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bluetooth/bpa10x.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index 1375b53..3b28658 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -423,6 +423,7 @@ static int bpa10x_send_frame(struct sk_buff *skb)
 		break;
 
 	default:
+		usb_free_urb(urb);
 		return -EILSEQ;
 	}
 
-- 
cgit v1.1


From 2fa993423a345fd484f7295797ddb59b7738ad38 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 5 Feb 2008 03:09:17 -0800
Subject: drivers/bluetooth/btsdio.c: fix double-free

This patch fixes a double-free spotted by the Coverity checker.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bluetooth/btsdio.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
index b786f61..58630cc 100644
--- a/drivers/bluetooth/btsdio.c
+++ b/drivers/bluetooth/btsdio.c
@@ -162,10 +162,8 @@ static int btsdio_rx_packet(struct btsdio_data *data)
 	bt_cb(skb)->pkt_type = hdr[3];
 
 	err = hci_recv_frame(skb);
-	if (err < 0) {
-		kfree(skb);
+	if (err < 0)
 		return err;
-	}
 
 	sdio_writeb(data->func, 0x00, REG_PC_RRT, NULL);
 
-- 
cgit v1.1


From 6e46c8cb3cbfa7bafe78d43a3d57750605a2dfa3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andy@smile.org.ua>
Date: Tue, 5 Feb 2008 03:10:02 -0800
Subject: bluetooth: blacklist another Broadcom BCM2035 device

This device is recognized as bluetooth, but still not works.

Signed-off-by: Andy Shevchenko <andy@smile.org.ua>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bluetooth/hci_usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c
index 98a9cde..372c7ef6 100644
--- a/drivers/bluetooth/hci_usb.c
+++ b/drivers/bluetooth/hci_usb.c
@@ -111,6 +111,7 @@ static struct usb_device_id blacklist_ids[] = {
 	{ USB_DEVICE(0x0a5c, 0x2033), .driver_info = HCI_IGNORE },
 
 	/* Broadcom BCM2035 */
+	{ USB_DEVICE(0x0a5c, 0x2035), .driver_info = HCI_RESET | HCI_WRONG_SCO_MTU },
 	{ USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_WRONG_SCO_MTU },
 	{ USB_DEVICE(0x0a5c, 0x2009), .driver_info = HCI_BCM92035 },
 
-- 
cgit v1.1


From 93d807401ced2320d0d1e56bf9de099bba5c0424 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Tue, 5 Feb 2008 03:12:06 -0800
Subject: bluetooth rfcomm tty: destroy before tty_close()

rfcomm dev could be deleted in tty_hangup, so we must not call
rfcomm_dev_del again to prevent from destroying rfcomm dev before tty
close.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bluetooth/rfcomm/tty.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 788c703..e4c779b 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -429,7 +429,8 @@ static int rfcomm_release_dev(void __user *arg)
 	if (dev->tty)
 		tty_vhangup(dev->tty);
 
-	rfcomm_dev_del(dev);
+	if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
+		rfcomm_dev_del(dev);
 	rfcomm_dev_put(dev);
 	return 0;
 }
-- 
cgit v1.1


From 2bfc79de2b6482955f0e352da7e53787dd8167c0 Mon Sep 17 00:00:00 2001
From: Johann Felix Soden <johfel@users.sourceforge.net>
Date: Tue, 5 Feb 2008 03:13:58 -0800
Subject: [NET]: Remove further references to net-modules.txt

The Kconfig of igb and enc28j60 contains references to
obsolet Documentation/networking/net-modules.txt.

Signed-off-by: Johann Felix Soden <johfel@users.sourceforge.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index f234ba3..7d170cd 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -920,8 +920,7 @@ config ENC28J60
 	---help---
 	  Support for the Microchip EN28J60 ethernet chip.
 
-	  To compile this driver as a module, choose M here and read
-	  <file:Documentation/networking/net-modules.txt>.  The module will be
+	  To compile this driver as a module, choose M here. The module will be
 	  called enc28j60.
 
 config ENC28J60_WRITEVERIFY
@@ -2041,8 +2040,7 @@ config IGB
          More specific information on configuring the driver is in
          <file:Documentation/networking/e1000.txt>.
 
-         To compile this driver as a module, choose M here and read
-         <file:Documentation/networking/net-modules.txt>.  The module
+         To compile this driver as a module, choose M here. The module
          will be called igb.
 
 source "drivers/net/ixp2000/Kconfig"
-- 
cgit v1.1


From 5d8c0aa9433b09387d9021358baef7939f9b32c4 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 5 Feb 2008 03:14:44 -0800
Subject: [INET]: Fix accidentally broken inet(6)_hash_connect's port offset
 calculations.

The port offset calculations depend on the protocol family, but, as
Adrian noticed, I broke this logic with the commit

	5ee31fc1ecdcbc234c8c56dcacef87c8e09909d8
	[INET]: Consolidate inet(6)_hash_connect.

Return this logic back, by passing the port offset directly into the
consolidated function.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Noticed-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 2 +-
 net/ipv4/inet_hashtables.c    | 6 +++---
 net/ipv6/inet6_hashtables.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 48ac620..97dc35a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -389,7 +389,7 @@ static inline struct sock *inet_lookup(struct net *net,
 }
 
 extern int __inet_hash_connect(struct inet_timewait_death_row *death_row,
-		struct sock *sk,
+		struct sock *sk, u32 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
 			       void (*hash)(struct sock *sk));
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 90f422c..9cac6c0 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -398,7 +398,7 @@ out:
 EXPORT_SYMBOL_GPL(inet_unhash);
 
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
-		struct sock *sk,
+		struct sock *sk, u32 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
 		void (*hash)(struct sock *sk))
@@ -413,7 +413,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	if (!snum) {
 		int i, remaining, low, high, port;
 		static u32 hint;
-		u32 offset = hint + inet_sk_port_offset(sk);
+		u32 offset = hint + port_offset;
 		struct hlist_node *node;
 		struct inet_timewait_sock *tw = NULL;
 
@@ -502,7 +502,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_connect);
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
 		      struct sock *sk)
 {
-	return __inet_hash_connect(death_row, sk,
+	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
 			__inet_check_established, __inet_hash_nolisten);
 }
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 43f3993..99fd25f 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -236,7 +236,7 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 		       struct sock *sk)
 {
-	return __inet_hash_connect(death_row, sk,
+	return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
 			__inet6_check_established, __inet6_hash);
 }
 
-- 
cgit v1.1


From 8cf229437fd826c32a44546899412b1eb3e1db6f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 5 Feb 2008 03:15:50 -0800
Subject: [ICMP]: Restore pskb_pull calls in receive function

Somewhere along the development of my ICMP relookup patch the header
length check went AWOL on the non-IPsec path.  This patch restores the
check.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 3 ++-
 net/ipv6/icmp.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a7321a8..a13c074 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1015,7 +1015,8 @@ int icmp_rcv(struct sk_buff *skb)
 			goto error;
 	}
 
-	__skb_pull(skb, sizeof(*icmph));
+	if (!pskb_pull(skb, sizeof(*icmph)))
+		goto error;
 
 	icmph = icmp_hdr(skb);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index cbb5b9c..121d517 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -683,7 +683,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
 		}
 	}
 
-	__skb_pull(skb, sizeof(*hdr));
+	if (!pskb_pull(skb, sizeof(*hdr)))
+		goto discard_it;
 
 	hdr = icmp6_hdr(skb);
 
-- 
cgit v1.1


From 03245ce2f03228d681580c30c435225efadca602 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 5 Feb 2008 03:17:22 -0800
Subject: [NET] rtnetlink.c: remove no longer used functions

This patch removes the following no longer used functions:
- rtattr_parse()
- rtattr_strlcpy()
- __rtattr_parse_nested_compat()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 12 ------------
 net/core/rtnetlink.c      | 44 --------------------------------------------
 2 files changed, 56 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index b014f6b..b9e1740 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -602,24 +602,12 @@ struct tcamsg
 
 #include <linux/mutex.h>
 
-extern size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size);
 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
 {
 	int len = strlen(str) + 1;
 	return len > rta->rta_len || memcmp(RTA_DATA(rta), str, len);
 }
 
-extern int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len);
-extern int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
-				        struct rtattr *rta, int len);
-
-#define rtattr_parse_nested(tb, max, rta) \
-	rtattr_parse((tb), (max), RTA_DATA((rta)), RTA_PAYLOAD((rta)))
-
-#define rtattr_parse_nested_compat(tb, max, rta, data, len) \
-({	data = RTA_PAYLOAD(rta) >= len ? RTA_DATA(rta) : NULL; \
-	__rtattr_parse_nested_compat(tb, max, rta, len); })
-
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
 extern int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ddbdde8..61ac8d0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -82,32 +82,6 @@ int rtnl_trylock(void)
 	return mutex_trylock(&rtnl_mutex);
 }
 
-int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
-{
-	memset(tb, 0, sizeof(struct rtattr*)*maxattr);
-
-	while (RTA_OK(rta, len)) {
-		unsigned flavor = rta->rta_type;
-		if (flavor && flavor <= maxattr)
-			tb[flavor-1] = rta;
-		rta = RTA_NEXT(rta, len);
-	}
-	return 0;
-}
-
-int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
-				 struct rtattr *rta, int len)
-{
-	if (RTA_PAYLOAD(rta) < len)
-		return -1;
-	if (RTA_PAYLOAD(rta) >= RTA_ALIGN(len) + sizeof(struct rtattr)) {
-		rta = RTA_DATA(rta) + RTA_ALIGN(len);
-		return rtattr_parse_nested(tb, maxattr, rta);
-	}
-	memset(tb, 0, sizeof(struct rtattr *) * maxattr);
-	return 0;
-}
-
 static struct rtnl_link *rtnl_msg_handlers[NPROTO];
 
 static inline int rtm_msgindex(int msgtype)
@@ -442,21 +416,6 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
 	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 }
 
-size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
-{
-	size_t ret = RTA_PAYLOAD(rta);
-	char *src = RTA_DATA(rta);
-
-	if (ret > 0 && src[ret - 1] == '\0')
-		ret--;
-	if (size > 0) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		memset(dest, 0, size);
-		memcpy(dest, src, len);
-	}
-	return ret;
-}
-
 int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
 {
 	struct sock *rtnl = net->rtnl;
@@ -1411,9 +1370,6 @@ void __init rtnetlink_init(void)
 }
 
 EXPORT_SYMBOL(__rta_fill);
-EXPORT_SYMBOL(rtattr_strlcpy);
-EXPORT_SYMBOL(rtattr_parse);
-EXPORT_SYMBOL(__rtattr_parse_nested_compat);
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
-- 
cgit v1.1


From dded91611a728d65721cdab3dd41d801a356fa15 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen.hemminger@vyatta.com>
Date: Tue, 5 Feb 2008 03:18:51 -0800
Subject: [NET]: Add if_addrlabel.h to sanitized headers.

if_addrlabel.h is needed for iproute2 usage.

Signed-off-by: Stephen Hemminger <stephen.hemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index c0f9bb7..9363122 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -219,6 +219,7 @@ unifdef-y += i2c-dev.h
 unifdef-y += icmp.h
 unifdef-y += icmpv6.h
 unifdef-y += if_addr.h
+unifdef-y += if_addrlabel.h
 unifdef-y += if_arp.h
 unifdef-y += if_bridge.h
 unifdef-y += if_ec.h
-- 
cgit v1.1


From 3113e88c3cb3c0a22920b621f8e4d1f2ccc07f1e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 5 Feb 2008 03:20:13 -0800
Subject: [PKT_SCHED]: vlan tag match

Provide a way to use tc filters on vlan tag even if tag is buried in
skb due to hardware acceleration.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h              |  3 ++-
 include/linux/tc_ematch/tc_em_meta.h |  1 +
 net/sched/em_meta.c                  | 17 +++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 1c1dba9..40fac8c 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -459,7 +459,8 @@ enum
 #define	TCF_EM_U32		3
 #define	TCF_EM_META		4
 #define	TCF_EM_TEXT		5
-#define	TCF_EM_MAX		5
+#define        TCF_EM_VLAN		6
+#define	TCF_EM_MAX		6
 
 enum
 {
diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index e21937c..c50d2ba 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -81,6 +81,7 @@ enum
  	TCF_META_ID_SK_SNDTIMEO,
  	TCF_META_ID_SK_SENDMSG_OFF,
  	TCF_META_ID_SK_WRITE_PENDING,
+	TCF_META_ID_VLAN_TAG,
 	__TCF_META_ID_MAX
 };
 #define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a1e5619..9c2ec19 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -65,6 +65,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/random.h>
+#include <linux/if_vlan.h>
 #include <linux/tc_ematch/tc_em_meta.h>
 #include <net/dst.h>
 #include <net/route.h>
@@ -170,6 +171,21 @@ META_COLLECTOR(var_dev)
 }
 
 /**************************************************************************
+ * vlan tag
+ **************************************************************************/
+
+META_COLLECTOR(int_vlan_tag)
+{
+	unsigned short tag;
+	if (vlan_get_tag(skb, &tag) < 0)
+		*err = -1;
+	else
+		dst->value = tag;
+}
+
+
+
+/**************************************************************************
  * skb attributes
  **************************************************************************/
 
@@ -520,6 +536,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(SK_SNDTIMEO)]		= META_FUNC(int_sk_sndtimeo),
 		[META_ID(SK_SENDMSG_OFF)]	= META_FUNC(int_sk_sendmsg_off),
 		[META_ID(SK_WRITE_PENDING)]	= META_FUNC(int_sk_write_pend),
+		[META_ID(VLAN_TAG)]		= META_FUNC(int_vlan_tag),
 	}
 };
 
-- 
cgit v1.1


From 6f52ac29712f3eec192599249b12612360948646 Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:33 +0100
Subject: [S390] cio: make sense id procedure work with partial hardware
 response

In some cases the current sense id procedure trips over incomplete
hardware responses. In these cases, checking against the preset value
of 0xFFFF is not enough. More critically, the VM DIAG call will always be
considered to have provided data after such an incident, even if it was not
successful at all.

The solution is to always initialize the control unit data before doing a
sense id call. Check the condition code before considering the control unit
data. And initialize again, before evaluating the VM data.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/device_id.c | 107 ++++++++++++++++++++++++++-----------------
 1 file changed, 64 insertions(+), 43 deletions(-)

diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 918b8b8..dc4d87f 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -26,17 +26,18 @@
 #include "ioasm.h"
 #include "io_sch.h"
 
-/*
- * Input :
- *   devno - device number
- *   ps	   - pointer to sense ID data area
- * Output : none
+/**
+ * vm_vdev_to_cu_type - Convert vm virtual device into control unit type
+ *			for certain devices.
+ * @class: virtual device class
+ * @type: virtual device type
+ *
+ * Returns control unit type if a match was made or %0xffff otherwise.
  */
-static void
-VM_virtual_device_info (__u16 devno, struct senseid *ps)
+static int vm_vdev_to_cu_type(int class, int type)
 {
 	static struct {
-		int vrdcvcla, vrdcvtyp, cu_type;
+		int class, type, cu_type;
 	} vm_devices[] = {
 		{ 0x08, 0x01, 0x3480 },
 		{ 0x08, 0x02, 0x3430 },
@@ -68,8 +69,26 @@ VM_virtual_device_info (__u16 devno, struct senseid *ps)
 		{ 0x40, 0xc0, 0x5080 },
 		{ 0x80, 0x00, 0x3215 },
 	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vm_devices); i++)
+		if (class == vm_devices[i].class && type == vm_devices[i].type)
+			return vm_devices[i].cu_type;
+
+	return 0xffff;
+}
+
+/**
+ * diag_get_dev_info - retrieve device information via DIAG X'210'
+ * @devno: device number
+ * @ps: pointer to sense ID data area
+ *
+ * Returns zero on success, non-zero otherwise.
+ */
+static int diag_get_dev_info(u16 devno, struct senseid *ps)
+{
 	struct diag210 diag_data;
-	int ccode, i;
+	int ccode;
 
 	CIO_TRACE_EVENT (4, "VMvdinf");
 
@@ -79,21 +98,21 @@ VM_virtual_device_info (__u16 devno, struct senseid *ps)
 	};
 
 	ccode = diag210 (&diag_data);
-	ps->reserved = 0xff;
+	if ((ccode == 0) || (ccode == 2)) {
+		ps->reserved = 0xff;
 
-	/* Special case for bloody osa devices. */
-	if (diag_data.vrdcvcla == 0x02 &&
-	    diag_data.vrdcvtyp == 0x20) {
-		ps->cu_type = 0x3088;
-		ps->cu_model = 0x60;
-		return;
-	}
-	for (i = 0; i < ARRAY_SIZE(vm_devices); i++)
-		if (diag_data.vrdcvcla == vm_devices[i].vrdcvcla &&
-		    diag_data.vrdcvtyp == vm_devices[i].vrdcvtyp) {
-			ps->cu_type = vm_devices[i].cu_type;
-			return;
+		/* Special case for osa devices. */
+		if (diag_data.vrdcvcla == 0x02 && diag_data.vrdcvtyp == 0x20) {
+			ps->cu_type = 0x3088;
+			ps->cu_model = 0x60;
+			return 0;
 		}
+		ps->cu_type = vm_vdev_to_cu_type(diag_data.vrdcvcla,
+						diag_data.vrdcvtyp);
+		if (ps->cu_type != 0xffff)
+			return 0;
+	}
+
 	CIO_MSG_EVENT(0, "DIAG X'210' for device %04X returned (cc = %d):"
 		      "vdev class : %02X, vdev type : %04X \n ...  "
 		      "rdev class : %02X, rdev type : %04X, "
@@ -102,6 +121,8 @@ VM_virtual_device_info (__u16 devno, struct senseid *ps)
 		      diag_data.vrdcvcla, diag_data.vrdcvtyp,
 		      diag_data.vrdcrccl, diag_data.vrdccrty,
 		      diag_data.vrdccrmd);
+
+	return -ENODEV;
 }
 
 /*
@@ -130,6 +151,7 @@ __ccw_device_sense_id_start(struct ccw_device *cdev)
 	/* Try on every path. */
 	ret = -ENODEV;
 	while (cdev->private->imask != 0) {
+		cdev->private->senseid.cu_type = 0xFFFF;
 		if ((sch->opm & cdev->private->imask) != 0 &&
 		    cdev->private->iretry > 0) {
 			cdev->private->iretry--;
@@ -153,7 +175,6 @@ ccw_device_sense_id_start(struct ccw_device *cdev)
 	int ret;
 
 	memset (&cdev->private->senseid, 0, sizeof (struct senseid));
-	cdev->private->senseid.cu_type = 0xFFFF;
 	cdev->private->imask = 0x80;
 	cdev->private->iretry = 5;
 	ret = __ccw_device_sense_id_start(cdev);
@@ -173,13 +194,7 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 
 	sch = to_subchannel(cdev->dev.parent);
 	irb = &cdev->private->irb;
-	/* Did we get a proper answer ? */
-	if (cdev->private->senseid.cu_type != 0xFFFF && 
-	    cdev->private->senseid.reserved == 0xFF) {
-		if (irb->scsw.count < sizeof (struct senseid) - 8)
-			cdev->private->flags.esid = 1;
-		return 0; /* Success */
-	}
+
 	/* Check the error cases. */
 	if (irb->scsw.fctl & (SCSW_FCTL_HALT_FUNC | SCSW_FCTL_CLEAR_FUNC)) {
 		/* Retry Sense ID if requested. */
@@ -231,6 +246,15 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 				      sch->schid.ssid, sch->schid.sch_no);
 		return -EACCES;
 	}
+
+	/* Did we get a proper answer ? */
+	if (irb->scsw.cc == 0 && cdev->private->senseid.cu_type != 0xFFFF &&
+	    cdev->private->senseid.reserved == 0xFF) {
+		if (irb->scsw.count < sizeof(struct senseid) - 8)
+			cdev->private->flags.esid = 1;
+		return 0; /* Success */
+	}
+
 	/* Hmm, whatever happened, try again. */
 	CIO_MSG_EVENT(2, "SenseID : start_IO() for device %04x on "
 		      "subchannel 0.%x.%04x returns status %02X%02X\n",
@@ -283,20 +307,17 @@ ccw_device_sense_id_irq(struct ccw_device *cdev, enum dev_event dev_event)
 			break;
 		/* fall through. */
 	default:		/* Sense ID failed. Try asking VM. */
-		if (MACHINE_IS_VM) {
-			VM_virtual_device_info (cdev->private->dev_id.devno,
+		if (MACHINE_IS_VM)
+			ret = diag_get_dev_info(cdev->private->dev_id.devno,
 						&cdev->private->senseid);
-			if (cdev->private->senseid.cu_type != 0xFFFF) {
-				/* Got the device information from VM. */
-				ccw_device_sense_id_done(cdev, 0);
-				return;
-			}
-		}
-		/*
-		 * If we can't couldn't identify the device type we
-		 *  consider the device "not operational".
-		 */
-		ccw_device_sense_id_done(cdev, -ENODEV);
+		else
+			/*
+			 * If we can't couldn't identify the device type we
+			 *  consider the device "not operational".
+			 */
+			ret = -ENODEV;
+
+		ccw_device_sense_id_done(cdev, ret);
 		break;
 	}
 }
-- 
cgit v1.1


From b9c9a21a7c8faeff1d13a23d2c57a5d4b512cfa0 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:34 +0100
Subject: [S390] cio: Clean up chsc response code handling.

This provides unified return codes for common response codes and
also makes the debug feature messages more similar and informational.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/chsc.c | 147 ++++++++++++++++++------------------------------
 1 file changed, 55 insertions(+), 92 deletions(-)

diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index e7ba16a..007aaeb 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -26,6 +26,25 @@
 
 static void *sei_page;
 
+static int chsc_error_from_response(int response)
+{
+	switch (response) {
+	case 0x0001:
+		return 0;
+	case 0x0002:
+	case 0x0003:
+	case 0x0006:
+	case 0x0007:
+	case 0x0008:
+	case 0x000a:
+		return -EINVAL;
+	case 0x0004:
+		return -EOPNOTSUPP;
+	default:
+		return -EIO;
+	}
+}
+
 struct chsc_ssd_area {
 	struct chsc_header request;
 	u16 :10;
@@ -75,11 +94,11 @@ int chsc_get_ssd_info(struct subchannel_id schid, struct chsc_ssd_info *ssd)
 		ret = (ccode == 3) ? -ENODEV : -EBUSY;
 		goto out_free;
 	}
-	if (ssd_area->response.code != 0x0001) {
+	ret = chsc_error_from_response(ssd_area->response.code);
+	if (ret != 0) {
 		CIO_MSG_EVENT(2, "chsc: ssd failed for 0.%x.%04x (rc=%04x)\n",
 			      schid.ssid, schid.sch_no,
 			      ssd_area->response.code);
-		ret = -EIO;
 		goto out_free;
 	}
 	if (!ssd_area->sch_valid) {
@@ -717,36 +736,15 @@ __chsc_do_secm(struct channel_subsystem *css, int enable, void *page)
 		return (ccode == 3) ? -ENODEV : -EBUSY;
 
 	switch (secm_area->response.code) {
-	case 0x0001: /* Success. */
-		ret = 0;
-		break;
-	case 0x0003: /* Invalid block. */
-	case 0x0007: /* Invalid format. */
-	case 0x0008: /* Other invalid block. */
-		CIO_CRW_EVENT(2, "Error in chsc request block!\n");
-		ret = -EINVAL;
-		break;
-	case 0x0004: /* Command not provided in model. */
-		CIO_CRW_EVENT(2, "Model does not provide secm\n");
-		ret = -EOPNOTSUPP;
-		break;
-	case 0x0102: /* cub adresses incorrect */
-		CIO_CRW_EVENT(2, "Invalid addresses in chsc request block\n");
-		ret = -EINVAL;
-		break;
-	case 0x0103: /* key error */
-		CIO_CRW_EVENT(2, "Access key error in secm\n");
+	case 0x0102:
+	case 0x0103:
 		ret = -EINVAL;
-		break;
-	case 0x0105: /* error while starting */
-		CIO_CRW_EVENT(2, "Error while starting channel measurement\n");
-		ret = -EIO;
-		break;
 	default:
-		CIO_CRW_EVENT(2, "Unknown CHSC response %d\n",
-			      secm_area->response.code);
-		ret = -EIO;
+		ret = chsc_error_from_response(secm_area->response.code);
 	}
+	if (ret != 0)
+		CIO_CRW_EVENT(2, "chsc: secm failed (rc=%04x)\n",
+			      secm_area->response.code);
 	return ret;
 }
 
@@ -827,27 +825,14 @@ int chsc_determine_channel_path_description(struct chp_id chpid,
 		goto out;
 	}
 
-	switch (scpd_area->response.code) {
-	case 0x0001: /* Success. */
+	ret = chsc_error_from_response(scpd_area->response.code);
+	if (ret == 0)
+		/* Success. */
 		memcpy(desc, &scpd_area->desc,
 		       sizeof(struct channel_path_desc));
-		ret = 0;
-		break;
-	case 0x0003: /* Invalid block. */
-	case 0x0007: /* Invalid format. */
-	case 0x0008: /* Other invalid block. */
-		CIO_CRW_EVENT(2, "Error in chsc request block!\n");
-		ret = -EINVAL;
-		break;
-	case 0x0004: /* Command not provided in model. */
-		CIO_CRW_EVENT(2, "Model does not provide scpd\n");
-		ret = -EOPNOTSUPP;
-		break;
-	default:
-		CIO_CRW_EVENT(2, "Unknown CHSC response %d\n",
+	else
+		CIO_CRW_EVENT(2, "chsc: scpd failed (rc=%04x)\n",
 			      scpd_area->response.code);
-		ret = -EIO;
-	}
 out:
 	free_page((unsigned long)scpd_area);
 	return ret;
@@ -923,8 +908,9 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
 		goto out;
 	}
 
-	switch (scmc_area->response.code) {
-	case 0x0001: /* Success. */
+	ret = chsc_error_from_response(scmc_area->response.code);
+	if (ret == 0) {
+		/* Success. */
 		if (!scmc_area->not_valid) {
 			chp->cmg = scmc_area->cmg;
 			chp->shared = scmc_area->shared;
@@ -935,22 +921,9 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
 			chp->cmg = -1;
 			chp->shared = -1;
 		}
-		ret = 0;
-		break;
-	case 0x0003: /* Invalid block. */
-	case 0x0007: /* Invalid format. */
-	case 0x0008: /* Invalid bit combination. */
-		CIO_CRW_EVENT(2, "Error in chsc request block!\n");
-		ret = -EINVAL;
-		break;
-	case 0x0004: /* Command not provided. */
-		CIO_CRW_EVENT(2, "Model does not provide scmc\n");
-		ret = -EOPNOTSUPP;
-		break;
-	default:
-		CIO_CRW_EVENT(2, "Unknown CHSC response %d\n",
+	} else {
+		CIO_CRW_EVENT(2, "chsc: scmc failed (rc=%04x)\n",
 			      scmc_area->response.code);
-		ret = -EIO;
 	}
 out:
 	free_page((unsigned long)scmc_area);
@@ -1002,21 +975,17 @@ chsc_enable_facility(int operation_code)
 		ret = (ret == 3) ? -ENODEV : -EBUSY;
 		goto out;
 	}
+
 	switch (sda_area->response.code) {
-	case 0x0001: /* everything ok */
-		ret = 0;
-		break;
-	case 0x0003: /* invalid request block */
-	case 0x0007:
-		ret = -EINVAL;
-		break;
-	case 0x0004: /* command not provided */
-	case 0x0101: /* facility not provided */
+	case 0x0101:
 		ret = -EOPNOTSUPP;
 		break;
-	default: /* something went wrong */
-		ret = -EIO;
+	default:
+		ret = chsc_error_from_response(sda_area->response.code);
 	}
+	if (ret != 0)
+		CIO_CRW_EVENT(2, "chsc: sda (oc=%x) failed (rc=%04x)\n",
+			      operation_code, sda_area->response.code);
  out:
 	free_page((unsigned long)sda_area);
 	return ret;
@@ -1041,33 +1010,27 @@ chsc_determine_css_characteristics(void)
 	} __attribute__ ((packed)) *scsc_area;
 
 	scsc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
-	if (!scsc_area) {
-		CIO_MSG_EVENT(0, "Was not able to determine available "
-			      "CHSCs due to no memory.\n");
+	if (!scsc_area)
 		return -ENOMEM;
-	}
 
 	scsc_area->request.length = 0x0010;
 	scsc_area->request.code = 0x0010;
 
 	result = chsc(scsc_area);
 	if (result) {
-		CIO_MSG_EVENT(0, "Was not able to determine available CHSCs, "
-			      "cc=%i.\n", result);
-		result = -EIO;
+		result = (result == 3) ? -ENODEV : -EBUSY;
 		goto exit;
 	}
 
-	if (scsc_area->response.code != 1) {
-		CIO_MSG_EVENT(0, "Was not able to determine "
-			      "available CHSCs.\n");
-		result = -EIO;
-		goto exit;
-	}
-	memcpy(&css_general_characteristics, scsc_area->general_char,
-	       sizeof(css_general_characteristics));
-	memcpy(&css_chsc_characteristics, scsc_area->chsc_char,
-	       sizeof(css_chsc_characteristics));
+	result = chsc_error_from_response(scsc_area->response.code);
+	if (result == 0) {
+		memcpy(&css_general_characteristics, scsc_area->general_char,
+		       sizeof(css_general_characteristics));
+		memcpy(&css_chsc_characteristics, scsc_area->chsc_char,
+		       sizeof(css_chsc_characteristics));
+	} else
+		CIO_CRW_EVENT(2, "chsc: scsc failed (rc=%04x)\n",
+			      scsc_area->response.code);
 exit:
 	free_page ((unsigned long) scsc_area);
 	return result;
-- 
cgit v1.1


From 2fffc9355e6240466d1af764b0dcdede52085f7c Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:35 +0100
Subject: [S390] cio: Update documentation.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 Documentation/DocBook/s390-drivers.tmpl | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
index 3d2f31b..4acc732 100644
--- a/Documentation/DocBook/s390-drivers.tmpl
+++ b/Documentation/DocBook/s390-drivers.tmpl
@@ -59,7 +59,7 @@
    <title>Introduction</title>
   <para>
     This document describes the interfaces available for device drivers that
-    drive s390 based channel attached devices. This includes interfaces for
+    drive s390 based channel attached I/O devices. This includes interfaces for
     interaction with the hardware and interfaces for interacting with the
     common driver core. Those interfaces are provided by the s390 common I/O
     layer.
@@ -86,9 +86,10 @@
 	The ccw bus typically contains the majority of devices available to
 	a s390 system. Named after the channel command word (ccw), the basic
 	command structure used to address its devices, the ccw bus contains
-	so-called channel attached devices. They are addressed via subchannels,
-	visible on the css bus. A device driver, however, will never interact
-	with the subchannel directly, but only via the device on the ccw bus,
+	so-called channel attached devices. They are addressed via I/O
+	subchannels, visible on the css bus. A device driver for
+	channel-attached devices, however, will never interact	with the
+	subchannel directly, but only via the I/O device on the ccw bus,
 	the ccw device.
   </para>
     <sect1 id="channelIO">
@@ -116,7 +117,6 @@
 !Iinclude/asm-s390/ccwdev.h
 !Edrivers/s390/cio/device.c
 !Edrivers/s390/cio/device_ops.c
-!Edrivers/s390/cio/airq.c
     </sect1>
     <sect1 id="cmf">
      <title>The channel-measurement facility</title>
@@ -147,4 +147,15 @@
    </sect1>
   </chapter>
 
+  <chapter id="genericinterfaces">
+   <title>Generic interfaces</title>
+  <para>
+	Some interfaces are available to other drivers that do not necessarily
+	have anything to do with the busses described above, but still are
+	indirectly using basic infrastructure in the common I/O layer.
+	One example is the support for adapter interrupts.
+  </para>
+!Edrivers/s390/cio/airq.c
+  </chapter>
+
 </book>
-- 
cgit v1.1


From 01bc8ad165490458a8feb744c8f401c1a7098e3a Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:36 +0100
Subject: [S390] cio: Add shutdown callback for ccwgroup.

This intendeds to make proper shutdown of qeth devices easier.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/ccwgroup.c | 12 ++++++++++++
 include/asm-s390/ccwgroup.h |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index 3964056..03914fa 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -391,12 +391,24 @@ ccwgroup_remove (struct device *dev)
 	return 0;
 }
 
+static void ccwgroup_shutdown(struct device *dev)
+{
+	struct ccwgroup_device *gdev;
+	struct ccwgroup_driver *gdrv;
+
+	gdev = to_ccwgroupdev(dev);
+	gdrv = to_ccwgroupdrv(dev->driver);
+	if (gdrv && gdrv->shutdown)
+		gdrv->shutdown(gdev);
+}
+
 static struct bus_type ccwgroup_bus_type = {
 	.name   = "ccwgroup",
 	.match  = ccwgroup_bus_match,
 	.uevent = ccwgroup_uevent,
 	.probe  = ccwgroup_probe,
 	.remove = ccwgroup_remove,
+	.shutdown = ccwgroup_shutdown,
 };
 
 /**
diff --git a/include/asm-s390/ccwgroup.h b/include/asm-s390/ccwgroup.h
index 7109c7c..289053e 100644
--- a/include/asm-s390/ccwgroup.h
+++ b/include/asm-s390/ccwgroup.h
@@ -37,6 +37,7 @@ struct ccwgroup_device {
  * @remove: function called on remove
  * @set_online: function called when device is set online
  * @set_offline: function called when device is set offline
+ * @shutdown: function called when device is shut down
  * @driver: embedded driver structure
  */
 struct ccwgroup_driver {
@@ -49,6 +50,7 @@ struct ccwgroup_driver {
 	void (*remove) (struct ccwgroup_device *);
 	int (*set_online) (struct ccwgroup_device *);
 	int (*set_offline) (struct ccwgroup_device *);
+	void (*shutdown)(struct ccwgroup_device *);
 
 	struct device_driver driver;
 };
-- 
cgit v1.1


From 2485579bf5d3ea30d39b251defa1620ad77168bd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:37 +0100
Subject: [S390] DEBUG_PAGEALLOC support for s390.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig.debug       |  8 ++++++++
 arch/s390/kernel/traps.c      |  5 ++++-
 arch/s390/mm/init.c           | 27 +++++++++++++++++++++++++++
 include/asm-s390/cacheflush.h |  4 ++++
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug
index 2283933..4599fa0 100644
--- a/arch/s390/Kconfig.debug
+++ b/arch/s390/Kconfig.debug
@@ -6,4 +6,12 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
+config DEBUG_PAGEALLOC
+	bool "Debug page memory allocations"
+	depends on DEBUG_KERNEL
+	help
+	  Unmap pages from the kernel linear mapping after free_pages().
+	  This results in a slowdown, but helps to find certain types of
+	  memory corruptions.
+
 endmenu
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 52b8342..1a2fdb69 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -271,7 +271,10 @@ void die(const char * str, struct pt_regs * regs, long err)
 	printk("PREEMPT ");
 #endif
 #ifdef CONFIG_SMP
-	printk("SMP");
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
 	notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index b234bb4..983ec6e 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -167,6 +167,33 @@ void __init mem_init(void)
 	       PFN_ALIGN((unsigned long)&_eshared) - 1);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long address;
+	int i;
+
+	for (i = 0; i < numpages; i++) {
+		address = page_to_phys(page + i);
+		pgd = pgd_offset_k(address);
+		pud = pud_offset(pgd, address);
+		pmd = pmd_offset(pud, address);
+		pte = pte_offset_kernel(pmd, address);
+		if (!enable) {
+			ptep_invalidate(address, pte);
+			continue;
+		}
+		*pte = mk_pte_phys(address, __pgprot(_PAGE_TYPE_RW));
+		/* Flush cpu write queue. */
+		mb();
+	}
+}
+#endif
+
 void free_initmem(void)
 {
         unsigned long addr;
diff --git a/include/asm-s390/cacheflush.h b/include/asm-s390/cacheflush.h
index f7cade8..49d5af9 100644
--- a/include/asm-s390/cacheflush.h
+++ b/include/asm-s390/cacheflush.h
@@ -24,4 +24,8 @@
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy(dst, src, len)
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void kernel_map_pages(struct page *page, int numpages, int enable);
+#endif
+
 #endif /* _S390_CACHEFLUSH_H */
-- 
cgit v1.1


From a817a61f85c75181bde6c3d83ae11a24b089dd6a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:38 +0100
Subject: [S390] Fix linker script.

Fixes this warning:
vmlinux: warning: allocated section `.text' not in segment

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/vmlinux.lds.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 7d43c3c..b460715 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -35,7 +35,7 @@ SECTIONS
 		KPROBES_TEXT
 		*(.fixup)
 		*(.gnu.warning)
-	} = 0x0700
+	} :text = 0x0700
 
 	_etext = .;		/* End of text section */
 
-- 
cgit v1.1


From 37c5f719e71882b759fa8acbdd11d5ca3a7965bb Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:39 +0100
Subject: [S390] Fix smp_call_function_mask semantics.

Make sure func isn't called on the local cpu just like on all other
architectures that implement this function.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/smp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index aa37fa1..f5046fd 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -225,12 +225,11 @@ EXPORT_SYMBOL(smp_call_function_single);
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int
-smp_call_function_mask(cpumask_t mask,
-			void (*func)(void *), void *info,
-			int wait)
+int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
+			   int wait)
 {
 	preempt_disable();
+	cpu_clear(smp_processor_id(), mask);
 	__smp_call_function_map(func, info, 0, wait, mask);
 	preempt_enable();
 	return 0;
-- 
cgit v1.1


From 2bc89b5ece48dc888734e8760ba5ad8566431912 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:40 +0100
Subject: [S390] Fix couple of section mismatches.

Fix couple of section mismatches. And since we touch the code
anyway change the IPL code to use C99 initializers.

Cc: Michael Holzheu <holzheu@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S   |  7 ++-----
 arch/s390/kernel/entry64.S |  7 ++-----
 arch/s390/kernel/ipl.c     | 27 ++++++++++++++++++---------
 arch/s390/kernel/setup.c   |  2 +-
 arch/s390/kernel/smp.c     |  6 +++---
 arch/s390/mm/vmem.c        |  2 +-
 6 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 1a6dac8..6766e37 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -11,6 +11,7 @@
 
 #include <linux/sys.h>
 #include <linux/linkage.h>
+#include <linux/init.h>
 #include <asm/cache.h>
 #include <asm/lowcore.h>
 #include <asm/errno.h>
@@ -830,9 +831,7 @@ mcck_return:
  * Restart interruption handler, kick starter for additional CPUs
  */
 #ifdef CONFIG_SMP
-#ifndef CONFIG_HOTPLUG_CPU
-	.section .init.text,"ax"
-#endif
+	__CPUINIT
 	.globl restart_int_handler
 restart_int_handler:
 	l	%r15,__LC_SAVE_AREA+60	# load ksp
@@ -845,9 +844,7 @@ restart_int_handler:
 	br	%r14			# branch to start_secondary
 restart_addr:
 	.long	start_secondary
-#ifndef CONFIG_HOTPLUG_CPU
 	.previous
-#endif
 #else
 /*
  * If we do not run with SMP enabled, let the new CPU crash ...
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index a3e47b8..efde6e1 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -11,6 +11,7 @@
 
 #include <linux/sys.h>
 #include <linux/linkage.h>
+#include <linux/init.h>
 #include <asm/cache.h>
 #include <asm/lowcore.h>
 #include <asm/errno.h>
@@ -801,9 +802,7 @@ mcck_return:
  * Restart interruption handler, kick starter for additional CPUs
  */
 #ifdef CONFIG_SMP
-#ifndef CONFIG_HOTPLUG_CPU
-	.section .init.text,"ax"
-#endif
+	__CPUINIT
 	.globl restart_int_handler
 restart_int_handler:
 	lg	%r15,__LC_SAVE_AREA+120 # load ksp
@@ -814,9 +813,7 @@ restart_int_handler:
 	lmg	%r6,%r15,__SF_GPRS(%r15) # load registers from clone
 	stosm	__SF_EMPTY(%r15),0x04	# now we can turn dat on
 	jg	start_secondary
-#ifndef CONFIG_HOTPLUG_CPU
 	.previous
-#endif
 #else
 /*
  * If we do not run with SMP enabled, let the new CPU crash ...
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index db28cca..60acdc2 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -439,7 +439,7 @@ static void ipl_run(struct shutdown_trigger *trigger)
 		reipl_ccw_dev(&ipl_info.data.ccw.dev_id);
 }
 
-static int ipl_init(void)
+static int __init ipl_init(void)
 {
 	int rc;
 
@@ -471,8 +471,11 @@ out:
 	return 0;
 }
 
-static struct shutdown_action ipl_action = {SHUTDOWN_ACTION_IPL_STR, ipl_run,
-					    ipl_init};
+static struct shutdown_action __refdata ipl_action = {
+	.name	= SHUTDOWN_ACTION_IPL_STR,
+	.fn	= ipl_run,
+	.init	= ipl_init,
+};
 
 /*
  * reipl shutdown action: Reboot Linux on shutdown.
@@ -792,7 +795,7 @@ static int __init reipl_fcp_init(void)
 	return 0;
 }
 
-static int reipl_init(void)
+static int __init reipl_init(void)
 {
 	int rc;
 
@@ -819,8 +822,11 @@ static int reipl_init(void)
 	return 0;
 }
 
-static struct shutdown_action reipl_action = {SHUTDOWN_ACTION_REIPL_STR,
-					      reipl_run, reipl_init};
+static struct shutdown_action __refdata reipl_action = {
+	.name	= SHUTDOWN_ACTION_REIPL_STR,
+	.fn	= reipl_run,
+	.init	= reipl_init,
+};
 
 /*
  * dump shutdown action: Dump Linux on shutdown.
@@ -998,7 +1004,7 @@ static int __init dump_fcp_init(void)
 	return 0;
 }
 
-static int dump_init(void)
+static int __init dump_init(void)
 {
 	int rc;
 
@@ -1020,8 +1026,11 @@ static int dump_init(void)
 	return 0;
 }
 
-static struct shutdown_action dump_action = {SHUTDOWN_ACTION_DUMP_STR,
-					     dump_run, dump_init};
+static struct shutdown_action __refdata dump_action = {
+	.name	= SHUTDOWN_ACTION_DUMP_STR,
+	.fn	= dump_run,
+	.init	= dump_init,
+};
 
 /*
  * vmcmd shutdown action: Trigger vm command on shutdown.
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 766c783..2d266f4 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -77,7 +77,7 @@ unsigned long machine_flags = 0;
 unsigned long elf_hwcap = 0;
 char elf_platform[ELF_PLATFORM_SIZE];
 
-struct mem_chunk __initdata memory_chunk[MEMORY_CHUNKS];
+struct mem_chunk __meminitdata memory_chunk[MEMORY_CHUNKS];
 volatile int __cpu_logical_map[NR_CPUS]; /* logical cpu to cpu address */
 static unsigned long __initdata memory_end;
 
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index f5046fd..8506065 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -1007,7 +1007,7 @@ static struct notifier_block __cpuinitdata smp_cpu_nb = {
 	.notifier_call = smp_cpu_notify,
 };
 
-static int smp_add_present_cpu(int cpu)
+static int __devinit smp_add_present_cpu(int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
 	struct sys_device *s = &c->sysdev;
@@ -1035,8 +1035,8 @@ out:
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t rescan_store(struct sys_device *dev, const char *buf,
-			    size_t count)
+static ssize_t __ref rescan_store(struct sys_device *dev,
+				  const char *buf, size_t count)
 {
 	cpumask_t newcpus;
 	int cpu;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 79d13a1..07e0467 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -62,7 +62,7 @@ void __meminit memmap_init(unsigned long size, int nid, unsigned long zone,
 	}
 }
 
-static void __init_refok *vmem_alloc_pages(unsigned int order)
+static void __ref *vmem_alloc_pages(unsigned int order)
 {
 	if (slab_is_available())
 		return (void *)__get_free_pages(GFP_KERNEL, order);
-- 
cgit v1.1


From 8c0933eeb701eb8f526d88b1915af7bb35748e7b Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:41 +0100
Subject: [S390] console: allow vt220 console to be the only console

Fix console detection logic to support configurations in which the
vt220 console is the only available Linux console.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/setup.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 2d266f4..72d20f7 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -145,7 +145,7 @@ __setup("condev=", condev_setup);
 
 static int __init conmode_setup(char *str)
 {
-#if defined(CONFIG_SCLP_CONSOLE)
+#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
 	if (strncmp(str, "hwc", 4) == 0 || strncmp(str, "sclp", 5) == 0)
                 SET_CONSOLE_SCLP;
 #endif
@@ -183,7 +183,7 @@ static void __init conmode_default(void)
 		 */
 		cpcmd("TERM CONMODE 3215", NULL, 0, NULL);
 		if (ptr == NULL) {
-#if defined(CONFIG_SCLP_CONSOLE)
+#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
 			SET_CONSOLE_SCLP;
 #endif
 			return;
@@ -193,7 +193,7 @@ static void __init conmode_default(void)
 			SET_CONSOLE_3270;
 #elif defined(CONFIG_TN3215_CONSOLE)
 			SET_CONSOLE_3215;
-#elif defined(CONFIG_SCLP_CONSOLE)
+#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
 			SET_CONSOLE_SCLP;
 #endif
 		} else if (strncmp(ptr + 8, "3215", 4) == 0) {
@@ -201,7 +201,7 @@ static void __init conmode_default(void)
 			SET_CONSOLE_3215;
 #elif defined(CONFIG_TN3270_CONSOLE)
 			SET_CONSOLE_3270;
-#elif defined(CONFIG_SCLP_CONSOLE)
+#elif defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
 			SET_CONSOLE_SCLP;
 #endif
 		}
@@ -212,7 +212,7 @@ static void __init conmode_default(void)
 		SET_CONSOLE_3270;
 #endif
 	} else {
-#if defined(CONFIG_SCLP_CONSOLE)
+#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
 		SET_CONSOLE_SCLP;
 #endif
 	}
-- 
cgit v1.1


From b6b40c532a36f91df9c21caf9baba3b635e99d4e Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:42 +0100
Subject: [S390] Define GENERIC_LOCKBREAK.

Fix compile error:

  CC      arch/s390/kernel/asm-offsets.s
In file included from
arch/s390/kernel/asm-offsets.c:7:
include/linux/sched.h: In function 'spin_needbreak':
include/linux/sched.h:1931: error: implicit declaration of function '__raw_spin_is_contended'
make[2]: *** [arch/s390/kernel/asm-offsets.s] Error 1

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 82cbffd..f0e7ccf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -47,6 +47,11 @@ config NO_IOMEM
 config NO_DMA
 	def_bool y
 
+config GENERIC_LOCKBREAK
+	bool
+	default y
+	depends on SMP && PREEMPT
+
 mainmenu "Linux Kernel Configuration"
 
 config S390
-- 
cgit v1.1


From 0abbf05cdd69d74f92628bf444cd210ba046f6eb Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:43 +0100
Subject: [S390] Cleanup & optimize bitops.

The bitops header is now a bit shorter and easier to understand since
it uses less inline assembly. It requires some tricks to persuade the
compiler to generate decent code. The ffz/ffs functions now use the
_zb_findmap/_sb_findmap table as well.
With this cleanup the new bitops for ext4 can be implemented with a
few lines, instead of another large inline assembly.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 include/asm-s390/bitops.h | 515 ++++++++++++++++++++--------------------------
 1 file changed, 219 insertions(+), 296 deletions(-)

diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index dba6fec..95d9395 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -440,242 +440,256 @@ __constant_test_bit(unsigned long nr, const volatile unsigned long *addr) {
  __test_bit((nr),(addr)) )
 
 /*
- * ffz = Find First Zero in word. Undefined if no zero exists,
- * so code should check against ~0UL first..
+ * Optimized find bit helper functions.
  */
-static inline unsigned long ffz(unsigned long word)
+
+/**
+ * __ffz_word_loop - find byte offset of first long != -1UL
+ * @addr: pointer to array of unsigned long
+ * @size: size of the array in bits
+ */
+static inline unsigned long __ffz_word_loop(const unsigned long *addr,
+					    unsigned long size)
+{
+	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
+	unsigned long bytes = 0;
+
+	asm volatile(
+#ifndef __s390x__
+		"	ahi	%1,31\n"
+		"	srl	%1,5\n"
+		"0:	c	%2,0(%0,%3)\n"
+		"	jne	1f\n"
+		"	la	%0,4(%0)\n"
+		"	brct	%1,0b\n"
+		"1:\n"
+#else
+		"	aghi	%1,63\n"
+		"	srlg	%1,%1,6\n"
+		"0:	cg	%2,0(%0,%3)\n"
+		"	jne	1f\n"
+		"	la	%0,8(%0)\n"
+		"	brct	%1,0b\n"
+		"1:\n"
+#endif
+		: "+a" (bytes), "+d" (size)
+		: "d" (-1UL), "a" (addr), "m" (*(addrtype *) addr)
+		: "cc" );
+	return bytes;
+}
+
+/**
+ * __ffs_word_loop - find byte offset of first long != 0UL
+ * @addr: pointer to array of unsigned long
+ * @size: size of the array in bits
+ */
+static inline unsigned long __ffs_word_loop(const unsigned long *addr,
+					    unsigned long size)
 {
-        unsigned long bit = 0;
+	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
+	unsigned long bytes = 0;
 
+	asm volatile(
+#ifndef __s390x__
+		"	ahi	%1,31\n"
+		"	srl	%1,5\n"
+		"0:	c	%2,0(%0,%3)\n"
+		"	jne	1f\n"
+		"	la	%0,4(%0)\n"
+		"	brct	%1,0b\n"
+		"1:\n"
+#else
+		"	aghi	%1,63\n"
+		"	srlg	%1,%1,6\n"
+		"0:	cg	%2,0(%0,%3)\n"
+		"	jne	1f\n"
+		"	la	%0,8(%0)\n"
+		"	brct	%1,0b\n"
+		"1:\n"
+#endif
+		: "+a" (bytes), "+a" (size)
+		: "d" (0UL), "a" (addr), "m" (*(addrtype *) addr)
+		: "cc" );
+	return bytes;
+}
+
+/**
+ * __ffz_word - add number of the first unset bit
+ * @nr: base value the bit number is added to
+ * @word: the word that is searched for unset bits
+ */
+static inline unsigned long __ffz_word(unsigned long nr, unsigned long word)
+{
 #ifdef __s390x__
 	if (likely((word & 0xffffffff) == 0xffffffff)) {
 		word >>= 32;
-		bit += 32;
+		nr += 32;
 	}
 #endif
 	if (likely((word & 0xffff) == 0xffff)) {
 		word >>= 16;
-		bit += 16;
+		nr += 16;
 	}
 	if (likely((word & 0xff) == 0xff)) {
 		word >>= 8;
-		bit += 8;
+		nr += 8;
 	}
-	return bit + _zb_findmap[word & 0xff];
+	return nr + _zb_findmap[(unsigned char) word];
 }
 
-/*
- * __ffs = find first bit in word. Undefined if no bit exists,
- * so code should check against 0UL first..
+/**
+ * __ffs_word - add number of the first set bit
+ * @nr: base value the bit number is added to
+ * @word: the word that is searched for set bits
  */
-static inline unsigned long __ffs (unsigned long word)
+static inline unsigned long __ffs_word(unsigned long nr, unsigned long word)
 {
-	unsigned long bit = 0;
-
 #ifdef __s390x__
 	if (likely((word & 0xffffffff) == 0)) {
 		word >>= 32;
-		bit += 32;
+		nr += 32;
 	}
 #endif
 	if (likely((word & 0xffff) == 0)) {
 		word >>= 16;
-		bit += 16;
+		nr += 16;
 	}
 	if (likely((word & 0xff) == 0)) {
 		word >>= 8;
-		bit += 8;
+		nr += 8;
 	}
-	return bit + _sb_findmap[word & 0xff];
+	return nr + _sb_findmap[(unsigned char) word];
 }
 
-/*
- * Find-bit routines..
- */
 
-#ifndef __s390x__
+/**
+ * __load_ulong_be - load big endian unsigned long
+ * @p: pointer to array of unsigned long
+ * @offset: byte offset of source value in the array
+ */
+static inline unsigned long __load_ulong_be(const unsigned long *p,
+					    unsigned long offset)
+{
+	p = (unsigned long *)((unsigned long) p + offset);
+	return *p;
+}
 
-static inline int
-find_first_zero_bit(const unsigned long * addr, unsigned long size)
+/**
+ * __load_ulong_le - load little endian unsigned long
+ * @p: pointer to array of unsigned long
+ * @offset: byte offset of source value in the array
+ */
+static inline unsigned long __load_ulong_le(const unsigned long *p,
+					    unsigned long offset)
 {
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-	unsigned long cmp, count;
-        unsigned int res;
+	unsigned long word;
 
-        if (!size)
-                return 0;
+	p = (unsigned long *)((unsigned long) p + offset);
+#ifndef __s390x__
 	asm volatile(
-		"	lhi	%1,-1\n"
-		"	lr	%2,%3\n"
-		"	slr	%0,%0\n"
-		"	ahi	%2,31\n"
-		"	srl	%2,5\n"
-		"0:	c	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	la	%0,4(%0)\n"
-		"	brct	%2,0b\n"
-		"	lr	%0,%3\n"
-		"	j	4f\n"
-		"1:	l	%2,0(%0,%4)\n"
-		"	sll	%0,3\n"
-		"	lhi	%1,0xff\n"
-		"	tml	%2,0xffff\n"
-		"	jno	2f\n"
-		"	ahi	%0,16\n"
-		"	srl	%2,16\n"
-		"2:	tml	%2,0x00ff\n"
-		"	jno	3f\n"
-		"	ahi	%0,8\n"
-		"	srl	%2,8\n"
-		"3:	nr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	alr	%0,%2\n"
-		"4:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (addr), "a" (&_zb_findmap),
-		  "m" (*(addrtype *) addr) : "cc");
-        return (res < size) ? res : size;
+		"	ic	%0,0(%1)\n"
+		"	icm	%0,2,1(%1)\n"
+		"	icm	%0,4,2(%1)\n"
+		"	icm	%0,8,3(%1)"
+		: "=&d" (word) : "a" (p), "m" (*p) : "cc");
+#else
+	asm volatile(
+		"	lrvg	%0,%1"
+		: "=d" (word) : "m" (*p) );
+#endif
+	return word;
 }
 
-static inline int
-find_first_bit(const unsigned long * addr, unsigned long size)
+/*
+ * The various find bit functions.
+ */
+
+/*
+ * ffz - find first zero in word.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
 {
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-	unsigned long cmp, count;
-        unsigned int res;
+	return __ffz_word(0, word);
+}
 
-        if (!size)
-                return 0;
-	asm volatile(
-		"	slr	%1,%1\n"
-		"	lr	%2,%3\n"
-		"	slr	%0,%0\n"
-		"	ahi	%2,31\n"
-		"	srl	%2,5\n"
-		"0:	c	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	la	%0,4(%0)\n"
-		"	brct	%2,0b\n"
-		"	lr	%0,%3\n"
-		"	j	4f\n"
-		"1:	l	%2,0(%0,%4)\n"
-		"	sll	%0,3\n"
-		"	lhi	%1,0xff\n"
-		"	tml	%2,0xffff\n"
-		"	jnz	2f\n"
-		"	ahi	%0,16\n"
-		"	srl	%2,16\n"
-		"2:	tml	%2,0x00ff\n"
-		"	jnz	3f\n"
-		"	ahi	%0,8\n"
-		"	srl	%2,8\n"
-		"3:	nr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	alr	%0,%2\n"
-		"4:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (addr), "a" (&_sb_findmap),
-		  "m" (*(addrtype *) addr) : "cc");
-        return (res < size) ? res : size;
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs (unsigned long word)
+{
+	return __ffs_word(0, word);
 }
 
-#else /* __s390x__ */
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as
+ * the libc and compiler builtin ffs routines, therefore
+ * differs in spirit from the above ffz (man ffs).
+ */
+static inline int ffs(int x)
+{
+	if (!x)
+		return 0;
+	return __ffs_word(1, x);
+}
 
-static inline unsigned long
-find_first_zero_bit(const unsigned long * addr, unsigned long size)
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+static inline unsigned long find_first_zero_bit(const unsigned long *addr,
+						unsigned long size)
 {
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-        unsigned long res, cmp, count;
+	unsigned long bytes, bits;
 
         if (!size)
                 return 0;
-	asm volatile(
-		"	lghi	%1,-1\n"
-		"	lgr	%2,%3\n"
-		"	slgr	%0,%0\n"
-		"	aghi	%2,63\n"
-		"	srlg	%2,%2,6\n"
-		"0:	cg	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	la	%0,8(%0)\n"
-		"	brct	%2,0b\n"
-		"	lgr	%0,%3\n"
-		"	j	5f\n"
-		"1:	lg	%2,0(%0,%4)\n"
-		"	sllg	%0,%0,3\n"
-		"	clr	%2,%1\n"
-		"	jne	2f\n"
-		"	aghi	%0,32\n"
-		"	srlg	%2,%2,32\n"
-		"2:	lghi	%1,0xff\n"
-		"	tmll	%2,0xffff\n"
-		"	jno	3f\n"
-		"	aghi	%0,16\n"
-		"	srl	%2,16\n"
-		"3:	tmll	%2,0x00ff\n"
-		"	jno	4f\n"
-		"	aghi	%0,8\n"
-		"	srl	%2,8\n"
-		"4:	ngr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	algr	%0,%2\n"
-		"5:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (addr), "a" (&_zb_findmap),
-		  "m" (*(addrtype *) addr) : "cc");
-        return (res < size) ? res : size;
-}
-
-static inline unsigned long
-find_first_bit(const unsigned long * addr, unsigned long size)
+	bytes = __ffz_word_loop(addr, size);
+	bits = __ffz_word(bytes*8, __load_ulong_be(addr, bytes));
+	return (bits < size) ? bits : size;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+static inline unsigned long find_first_bit(const unsigned long * addr,
+					   unsigned long size)
 {
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-        unsigned long res, cmp, count;
+	unsigned long bytes, bits;
 
         if (!size)
                 return 0;
-	asm volatile(
-		"	slgr	%1,%1\n"
-		"	lgr	%2,%3\n"
-		"	slgr	%0,%0\n"
-		"	aghi	%2,63\n"
-		"	srlg	%2,%2,6\n"
-		"0:	cg	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	aghi	%0,8\n"
-		"	brct	%2,0b\n"
-		"	lgr	%0,%3\n"
-		"	j	5f\n"
-		"1:	lg	%2,0(%0,%4)\n"
-		"	sllg	%0,%0,3\n"
-		"	clr	%2,%1\n"
-		"	jne	2f\n"
-		"	aghi	%0,32\n"
-		"	srlg	%2,%2,32\n"
-		"2:	lghi	%1,0xff\n"
-		"	tmll	%2,0xffff\n"
-		"	jnz	3f\n"
-		"	aghi	%0,16\n"
-		"	srl	%2,16\n"
-		"3:	tmll	%2,0x00ff\n"
-		"	jnz	4f\n"
-		"	aghi	%0,8\n"
-		"	srl	%2,8\n"
-		"4:	ngr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	algr	%0,%2\n"
-		"5:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (addr), "a" (&_sb_findmap),
-		  "m" (*(addrtype *) addr) : "cc");
-        return (res < size) ? res : size;
+	bytes = __ffs_word_loop(addr, size);
+	bits = __ffs_word(bytes*8, __load_ulong_be(addr, bytes));
+	return (bits < size) ? bits : size;
 }
 
-#endif /* __s390x__ */
-
-static inline int
-find_next_zero_bit (const unsigned long * addr, unsigned long size,
-		    unsigned long offset)
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+static inline int find_next_zero_bit (const unsigned long * addr,
+				      unsigned long size,
+				      unsigned long offset)
 {
         const unsigned long *p;
 	unsigned long bit, set;
@@ -688,10 +702,10 @@ find_next_zero_bit (const unsigned long * addr, unsigned long size,
 	p = addr + offset / __BITOPS_WORDSIZE;
 	if (bit) {
 		/*
-		 * s390 version of ffz returns __BITOPS_WORDSIZE
+		 * __ffz_word returns __BITOPS_WORDSIZE
 		 * if no zero bit is present in the word.
 		 */
-		set = ffz(*p >> bit) + bit;
+		set = __ffz_word(0, *p >> bit) + bit;
 		if (set >= size)
 			return size + offset;
 		if (set < __BITOPS_WORDSIZE)
@@ -703,9 +717,15 @@ find_next_zero_bit (const unsigned long * addr, unsigned long size,
 	return offset + find_first_zero_bit(p, size);
 }
 
-static inline int
-find_next_bit (const unsigned long * addr, unsigned long size,
-	       unsigned long offset)
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+static inline int find_next_bit (const unsigned long * addr,
+				 unsigned long size,
+				 unsigned long offset)
 {
         const unsigned long *p;
 	unsigned long bit, set;
@@ -718,10 +738,10 @@ find_next_bit (const unsigned long * addr, unsigned long size,
 	p = addr + offset / __BITOPS_WORDSIZE;
 	if (bit) {
 		/*
-		 * s390 version of __ffs returns __BITOPS_WORDSIZE
+		 * __ffs_word returns __BITOPS_WORDSIZE
 		 * if no one bit is present in the word.
 		 */
-		set = __ffs(*p & (~0UL << bit));
+		set = __ffs_word(0, *p & (~0UL << bit));
 		if (set >= size)
 			return size + offset;
 		if (set < __BITOPS_WORDSIZE)
@@ -744,8 +764,6 @@ static inline int sched_find_first_bit(unsigned long *b)
 	return find_first_bit(b, 140);
 }
 
-#include <asm-generic/bitops/ffs.h>
-
 #include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/fls64.h>
 
@@ -775,105 +793,22 @@ static inline int sched_find_first_bit(unsigned long *b)
 #define ext2_find_next_bit(addr, size, off) \
 	generic_find_next_le_bit((unsigned long *)(addr), (size), (off))
 
-#ifndef __s390x__
-
-static inline int 
-ext2_find_first_zero_bit(void *vaddr, unsigned int size)
+static inline int ext2_find_first_zero_bit(void *vaddr, unsigned int size)
 {
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-	unsigned long cmp, count;
-        unsigned int res;
+	unsigned long bytes, bits;
 
         if (!size)
                 return 0;
-	asm volatile(
-		"	lhi	%1,-1\n"
-		"	lr	%2,%3\n"
-		"	ahi	%2,31\n"
-		"	srl	%2,5\n"
-		"	slr	%0,%0\n"
-		"0:	cl	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	ahi	%0,4\n"
-		"	brct	%2,0b\n"
-		"	lr	%0,%3\n"
-		"	j	4f\n"
-		"1:	l	%2,0(%0,%4)\n"
-		"	sll	%0,3\n"
-		"	ahi	%0,24\n"
-		"	lhi	%1,0xff\n"
-		"	tmh	%2,0xffff\n"
-		"	jo	2f\n"
-		"	ahi	%0,-16\n"
-		"	srl	%2,16\n"
-		"2:	tml	%2,0xff00\n"
-		"	jo	3f\n"
-		"	ahi	%0,-8\n"
-		"	srl	%2,8\n"
-		"3:	nr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	alr	%0,%2\n"
-		"4:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (vaddr), "a" (&_zb_findmap),
-		  "m" (*(addrtype *) vaddr) : "cc");
-        return (res < size) ? res : size;
+	bytes = __ffz_word_loop(vaddr, size);
+	bits = __ffz_word(bytes*8, __load_ulong_le(vaddr, bytes));
+	return (bits < size) ? bits : size;
 }
 
-#else /* __s390x__ */
-
-static inline unsigned long
-ext2_find_first_zero_bit(void *vaddr, unsigned long size)
-{
-	typedef struct { long _[__BITOPS_WORDS(size)]; } addrtype;
-        unsigned long res, cmp, count;
-
-        if (!size)
-                return 0;
-	asm volatile(
-		"	lghi	%1,-1\n"
-		"	lgr	%2,%3\n"
-		"	aghi	%2,63\n"
-		"	srlg	%2,%2,6\n"
-		"	slgr	%0,%0\n"
-		"0:	clg	%1,0(%0,%4)\n"
-		"	jne	1f\n"
-		"	aghi	%0,8\n"
-		"	brct	%2,0b\n"
-		"	lgr	%0,%3\n"
-		"	j	5f\n"
-		"1:	cl	%1,0(%0,%4)\n"
-		"	jne	2f\n"
-		"	aghi	%0,4\n"
-		"2:	l	%2,0(%0,%4)\n"
-		"	sllg	%0,%0,3\n"
-		"	aghi	%0,24\n"
-		"	lghi	%1,0xff\n"
-		"	tmlh	%2,0xffff\n"
-		"	jo	3f\n"
-		"	aghi	%0,-16\n"
-		"	srl	%2,16\n"
-		"3:	tmll	%2,0xff00\n"
-		"	jo	4f\n"
-		"	aghi	%0,-8\n"
-		"	srl	%2,8\n"
-		"4:	ngr	%2,%1\n"
-		"	ic	%2,0(%2,%5)\n"
-		"	algr	%0,%2\n"
-		"5:"
-		: "=&a" (res), "=&d" (cmp), "=&a" (count)
-		: "a" (size), "a" (vaddr), "a" (&_zb_findmap),
-		  "m" (*(addrtype *) vaddr) : "cc");
-        return (res < size) ? res : size;
-}
-
-#endif /* __s390x__ */
-
-static inline int
-ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset)
+static inline int ext2_find_next_zero_bit(void *vaddr, unsigned long size,
+					  unsigned long offset)
 {
         unsigned long *addr = vaddr, *p;
-	unsigned long word, bit, set;
+	unsigned long bit, set;
 
         if (offset >= size)
                 return size;
@@ -882,23 +817,11 @@ ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset)
 	size -= offset;
 	p = addr + offset / __BITOPS_WORDSIZE;
         if (bit) {
-#ifndef __s390x__
-		asm volatile(
-			"	ic	%0,0(%1)\n"
-			"	icm	%0,2,1(%1)\n"
-			"	icm	%0,4,2(%1)\n"
-			"	icm	%0,8,3(%1)"
-			: "=&a" (word) : "a" (p), "m" (*p) : "cc");
-#else
-		asm volatile(
-			"	lrvg	%0,%1"
-			: "=a" (word) : "m" (*p) );
-#endif
 		/*
 		 * s390 version of ffz returns __BITOPS_WORDSIZE
 		 * if no zero bit is present in the word.
 		 */
-		set = ffz(word >> bit) + bit;
+		set = ffz(__load_ulong_le(p, 0) >> bit) + bit;
 		if (set >= size)
 			return size + offset;
 		if (set < __BITOPS_WORDSIZE)
-- 
cgit v1.1


From 67fe9251bba510572feb6c3357636148bbd17e30 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:44 +0100
Subject: [S390] Implement ext2_find_next_bit.

Fixes this compile error:

fs/ext4/mballoc.c:
	In function 'ext4_mb_generate_buddy':
fs/ext4/mballoc.c:954:
	error: implicit declaration of function 'generic_find_next_le_bit'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 include/asm-s390/bitops.h | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index 95d9395..882db05 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -790,8 +790,6 @@ static inline int sched_find_first_bit(unsigned long *b)
 	test_and_clear_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr)
-#define ext2_find_next_bit(addr, size, off) \
-	generic_find_next_le_bit((unsigned long *)(addr), (size), (off))
 
 static inline int ext2_find_first_zero_bit(void *vaddr, unsigned int size)
 {
@@ -833,6 +831,47 @@ static inline int ext2_find_next_zero_bit(void *vaddr, unsigned long size,
 	return offset + ext2_find_first_zero_bit(p, size);
 }
 
+static inline unsigned long ext2_find_first_bit(void *vaddr,
+						unsigned long size)
+{
+	unsigned long bytes, bits;
+
+	if (!size)
+		return 0;
+	bytes = __ffs_word_loop(vaddr, size);
+	bits = __ffs_word(bytes*8, __load_ulong_le(vaddr, bytes));
+	return (bits < size) ? bits : size;
+}
+
+static inline int ext2_find_next_bit(void *vaddr, unsigned long size,
+				     unsigned long offset)
+{
+	unsigned long *addr = vaddr, *p;
+	unsigned long bit, set;
+
+	if (offset >= size)
+		return size;
+	bit = offset & (__BITOPS_WORDSIZE - 1);
+	offset -= bit;
+	size -= offset;
+	p = addr + offset / __BITOPS_WORDSIZE;
+	if (bit) {
+		/*
+		 * s390 version of ffz returns __BITOPS_WORDSIZE
+		 * if no zero bit is present in the word.
+		 */
+		set = ffs(__load_ulong_le(p, 0) >> bit) + bit;
+		if (set >= size)
+			return size + offset;
+		if (set < __BITOPS_WORDSIZE)
+			return set + offset;
+		offset += __BITOPS_WORDSIZE;
+		size -= __BITOPS_WORDSIZE;
+		p++;
+	}
+	return offset + ext2_find_first_bit(p, size);
+}
+
 #include <asm-generic/bitops/minix.h>
 
 #endif /* __KERNEL__ */
-- 
cgit v1.1


From a3afe70b83fdbbd4d757d2911900d168bc798a31 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:45 +0100
Subject: [S390] latencytop s390 support.

Cc: Holger Wolf <wolf@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig             |  3 +++
 arch/s390/kernel/stacktrace.c | 31 +++++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f0e7ccf..92a4f7b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -16,6 +16,9 @@ config LOCKDEP_SUPPORT
 config STACKTRACE_SUPPORT
 	def_bool y
 
+config HAVE_LATENCYTOP_SUPPORT
+	def_bool y
+
 config RWSEM_GENERIC_SPINLOCK
 	bool
 
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index da69247..85e46a5 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -14,7 +14,8 @@
 static unsigned long save_context_stack(struct stack_trace *trace,
 					unsigned long sp,
 					unsigned long low,
-					unsigned long high)
+					unsigned long high,
+					int savesched)
 {
 	struct stack_frame *sf;
 	struct pt_regs *regs;
@@ -47,10 +48,12 @@ static unsigned long save_context_stack(struct stack_trace *trace,
 			return sp;
 		regs = (struct pt_regs *)sp;
 		addr = regs->psw.addr & PSW_ADDR_INSN;
-		if (!trace->skip)
-			trace->entries[trace->nr_entries++] = addr;
-		else
-			trace->skip--;
+		if (savesched || !in_sched_functions(addr)) {
+			if (!trace->skip)
+				trace->entries[trace->nr_entries++] = addr;
+			else
+				trace->skip--;
+		}
 		if (trace->nr_entries >= trace->max_entries)
 			return sp;
 		low = sp;
@@ -66,15 +69,27 @@ void save_stack_trace(struct stack_trace *trace)
 	orig_sp = sp & PSW_ADDR_INSN;
 	new_sp = save_context_stack(trace, orig_sp,
 				    S390_lowcore.panic_stack - PAGE_SIZE,
-				    S390_lowcore.panic_stack);
+				    S390_lowcore.panic_stack, 1);
 	if (new_sp != orig_sp)
 		return;
 	new_sp = save_context_stack(trace, new_sp,
 				    S390_lowcore.async_stack - ASYNC_SIZE,
-				    S390_lowcore.async_stack);
+				    S390_lowcore.async_stack, 1);
 	if (new_sp != orig_sp)
 		return;
 	save_context_stack(trace, new_sp,
 			   S390_lowcore.thread_info,
-			   S390_lowcore.thread_info + THREAD_SIZE);
+			   S390_lowcore.thread_info + THREAD_SIZE, 1);
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	unsigned long sp, low, high;
+
+	sp = tsk->thread.ksp & PSW_ADDR_INSN;
+	low = (unsigned long) task_stack_page(tsk);
+	high = (unsigned long) task_pt_regs(tsk);
+	save_context_stack(trace, sp, low, high, 0);
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
-- 
cgit v1.1


From 6c5f57c7884a7e0806ae9af86de243321cab4953 Mon Sep 17 00:00:00 2001
From: Stefan Haberland <stefan.haberland@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:46 +0100
Subject: [S390] dasd: add ifcc handling

Adding interface control check (ifcc) handling in error recovery.
First retry up to 255 times and if all retries fail try an alternate
path if possible.

Signed-off-by: Stefan Haberland <stefan.haberland@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd.c          | 17 ++++-------
 drivers/s390/block/dasd_3990_erp.c | 62 ++++++++++++++++++++++++++++----------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index d640427..ab4f64c 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1057,12 +1057,11 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
 		if (device->features & DASD_FEATURE_ERPLOG) {
 			dasd_log_sense(cqr, irb);
 		}
-		/* If we have no sense data, or we just don't want complex ERP
-		 * for this request, but if we have retries left, then just
-		 * reset this request and retry it in the fastpath
+		/*
+		 * If we don't want complex ERP for this request, then just
+		 * reset this and retry it in the fastpath
 		 */
-		if (!(cqr->irb.esw.esw0.erw.cons &&
-		      test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) &&
+		if (!test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags) &&
 		    cqr->retries > 0) {
 			DEV_MESSAGE(KERN_DEBUG, device,
 				    "default ERP in fastpath (%i retries left)",
@@ -1742,12 +1741,8 @@ restart:
 
 		/*  Process requests that may be recovered */
 		if (cqr->status == DASD_CQR_NEED_ERP) {
-			if (cqr->irb.esw.esw0.erw.cons &&
-			    test_bit(DASD_CQR_FLAGS_USE_ERP,
-				     &cqr->flags)) {
-				erp_fn = base->discipline->erp_action(cqr);
-				erp_fn(cqr);
-			}
+			erp_fn = base->discipline->erp_action(cqr);
+			erp_fn(cqr);
 			goto restart;
 		}
 
diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index c361ab6..f69714a 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -164,7 +164,7 @@ dasd_3990_erp_alternate_path(struct dasd_ccw_req * erp)
 
 		/* reset status to submit the request again... */
 		erp->status = DASD_CQR_FILLED;
-		erp->retries = 1;
+		erp->retries = 10;
 	} else {
 		DEV_MESSAGE(KERN_ERR, device,
 			    "No alternate channel path left (lpum=%x / "
@@ -301,8 +301,7 @@ dasd_3990_erp_action_4(struct dasd_ccw_req * erp, char *sense)
 		erp->function = dasd_3990_erp_action_4;
 
 	} else {
-
-		if (sense[25] == 0x1D) {	/* state change pending */
+		if (sense && (sense[25] == 0x1D)) { /* state change pending */
 
 			DEV_MESSAGE(KERN_INFO, device,
 				    "waiting for state change pending "
@@ -311,7 +310,7 @@ dasd_3990_erp_action_4(struct dasd_ccw_req * erp, char *sense)
 
 			dasd_3990_erp_block_queue(erp, 30*HZ);
 
-                } else if (sense[25] == 0x1E) {	/* busy */
+		} else if (sense && (sense[25] == 0x1E)) {	/* busy */
 			DEV_MESSAGE(KERN_INFO, device,
 				    "busy - redriving request later, "
 				    "%d retries left",
@@ -2120,6 +2119,34 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense)
  */
 
 /*
+ * DASD_3990_ERP_CONTROL_CHECK
+ *
+ * DESCRIPTION
+ *   Does a generic inspection if a control check occured and sets up
+ *   the related error recovery procedure
+ *
+ * PARAMETER
+ *   erp		pointer to the currently created default ERP
+ *
+ * RETURN VALUES
+ *   erp_filled		pointer to the erp
+ */
+
+static struct dasd_ccw_req *
+dasd_3990_erp_control_check(struct dasd_ccw_req *erp)
+{
+	struct dasd_device *device = erp->startdev;
+
+	if (erp->refers->irb.scsw.cstat & (SCHN_STAT_INTF_CTRL_CHK
+					   | SCHN_STAT_CHN_CTRL_CHK)) {
+		DEV_MESSAGE(KERN_DEBUG, device, "%s",
+			    "channel or interface control check");
+		erp = dasd_3990_erp_action_4(erp, NULL);
+	}
+	return erp;
+}
+
+/*
  * DASD_3990_ERP_INSPECT
  *
  * DESCRIPTION
@@ -2145,8 +2172,11 @@ dasd_3990_erp_inspect(struct dasd_ccw_req * erp)
 	if (erp_new)
 		return erp_new;
 
+	/* check if no concurrent sens is available */
+	if (!erp->refers->irb.esw.esw0.erw.cons)
+		erp_new = dasd_3990_erp_control_check(erp);
 	/* distinguish between 24 and 32 byte sense data */
-	if (sense[27] & DASD_SENSE_BIT_0) {
+	else if (sense[27] & DASD_SENSE_BIT_0) {
 
 		/* inspect the 24 byte sense data */
 		erp_new = dasd_3990_erp_inspect_24(erp, sense);
@@ -2285,6 +2315,17 @@ dasd_3990_erp_error_match(struct dasd_ccw_req *cqr1, struct dasd_ccw_req *cqr2)
 		//	return 0;	/* CCW doesn't match */
 	}
 
+	if (cqr1->irb.esw.esw0.erw.cons != cqr2->irb.esw.esw0.erw.cons)
+		return 0;
+
+	if ((cqr1->irb.esw.esw0.erw.cons == 0) &&
+	    (cqr2->irb.esw.esw0.erw.cons == 0))	{
+		if ((cqr1->irb.scsw.cstat & (SCHN_STAT_INTF_CTRL_CHK |
+					     SCHN_STAT_CHN_CTRL_CHK)) ==
+		    (cqr2->irb.scsw.cstat & (SCHN_STAT_INTF_CTRL_CHK |
+					     SCHN_STAT_CHN_CTRL_CHK)))
+			return 1; /* match with ifcc*/
+	}
 	/* check sense data; byte 0-2,25,27 */
 	if (!((memcmp (cqr1->irb.ecw, cqr2->irb.ecw, 3) == 0) &&
 	      (cqr1->irb.ecw[27] == cqr2->irb.ecw[27]) &&
@@ -2560,17 +2601,6 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
 
 		return cqr;
 	}
-	/* check if sense data are available */
-	if (!cqr->irb.ecw) {
-		DEV_MESSAGE(KERN_DEBUG, device,
-			    "ERP called witout sense data avail ..."
-			    "request %p - NO ERP possible", cqr);
-
-		cqr->status = DASD_CQR_FAILED;
-
-		return cqr;
-
-	}
 
 	/* check if error happened before */
 	erp = dasd_3990_erp_in_erp(cqr);
-- 
cgit v1.1


From fe6b8e76d920b93fd445382aff7ff24082af8874 Mon Sep 17 00:00:00 2001
From: Stefan Weinhuber <wein@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:47 +0100
Subject: [S390] dasd: fix panic caused by alias device offline

When an alias device is set offline while it is in use this may
result in a panic in the cleanup part of the dasd_block_tasklet.
The problem here is that there may exist some ccw requests that were
originally created for the alias device and transferred to the base
device when the alias was set offline. When these request are
cleaned up later, the discipline pointer in the alias device may not
be valid anymore. To fix this use the base device discipline to find
the cleanup function.

Signed-off-by: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index ab4f64c..d984e0f 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1706,7 +1706,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
 
 	req = (struct request *) cqr->callback_data;
 	dasd_profile_end(cqr->block, cqr, req);
-	status = cqr->memdev->discipline->free_cp(cqr, req);
+	status = cqr->block->base->discipline->free_cp(cqr, req);
 	if (status <= 0)
 		error = status ? status : -EIO;
 	dasd_end_request(req, error);
-- 
cgit v1.1


From e35e1fadb4585e3143fab34dd4f5070698b3305b Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:48 +0100
Subject: [S390] sclp_tty/sclp_vt220: Fix scheduling while atomic

Under load the following bug message appeared while using sysrq-t:

BUG: scheduling while atomic: bash/3662/0x00000004
0000000000105b74 000000003ba17740 0000000000000002 0000000000000000
       000000003ba177e0 000000003ba17758 000000003ba17758 0000000000105bfe
       0000000000817ba8 000000003f2a5350 0000000000000000 0000000000000000
       000000003ba17740 000000000000000c 000000003ba17740 000000003ba177b0
       0000000000568630 0000000000105bfe 000000003ba17740 000000003ba17790
Call Trace:
([<0000000000105b74>] show_trace+0x13c/0x158)
 [<0000000000105c58>] show_stack+0xc8/0xfc
 [<0000000000105cbc>] dump_stack+0x30/0x40
 [<000000000012a0c8>] __schedule_bug+0x84/0x94
 [<000000000056234e>] schedule+0x5ea/0x970
 [<0000000000477cd2>] __sclp_vt220_write+0x1f6/0x3ec
 [<0000000000477f00>] sclp_vt220_con_write+0x38/0x48
 [<0000000000130b4a>] __call_console_drivers+0xbe/0xd8
 [<0000000000130bf0>] _call_console_drivers+0x8c/0xd0
 [<0000000000130eea>] release_console_sem+0x1a6/0x2fc
 [<0000000000131786>] vprintk+0x262/0x480
 [<00000000001319fa>] printk+0x56/0x68
 [<0000000000125aaa>] print_cfs_rq+0x45e/0x4a4
 [<000000000012614e>] sched_debug_show+0x65e/0xee8
 [<000000000012a8fc>] show_state_filter+0x1cc/0x1f0
 [<000000000044d39c>] sysrq_handle_showstate+0x2c/0x3c
 [<000000000044d1fe>] __handle_sysrq+0xae/0x18c
 [<00000000002001f2>] write_sysrq_trigger+0x8a/0x90
 [<00000000001f7862>] proc_reg_write+0x9a/0xc4
 [<00000000001a83d4>] vfs_write+0xb8/0x174
 [<00000000001a8b88>] sys_write+0x58/0x8c
 [<0000000000112e7c>] sysc_noemu+0x10/0x16
 [<0000020000116f68>] 0x20000116f68

The problem seems to be, that with a full console buffer, release_console_sem
disables interrupts with spin_lock_irqsave and then calls the console function
without enabling interrupts. __sclp_vt220_write checks for in_interrupt, to
decide if it can schedule. It should check for in_atomic instead.

The same is true for sclp_tty.c.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/char/sclp_tty.c   | 2 +-
 drivers/s390/char/sclp_vt220.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c
index e3b3d39..2e616e3 100644
--- a/drivers/s390/char/sclp_tty.c
+++ b/drivers/s390/char/sclp_tty.c
@@ -332,7 +332,7 @@ sclp_tty_write_string(const unsigned char *str, int count)
 		if (sclp_ttybuf == NULL) {
 			while (list_empty(&sclp_tty_pages)) {
 				spin_unlock_irqrestore(&sclp_tty_lock, flags);
-				if (in_interrupt())
+				if (in_atomic())
 					sclp_sync_wait();
 				else
 					wait_event(sclp_tty_waitq,
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 40cd21b..6807162 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -400,7 +400,7 @@ __sclp_vt220_write(const unsigned char *buf, int count, int do_schedule,
 			while (list_empty(&sclp_vt220_empty)) {
 				spin_unlock_irqrestore(&sclp_vt220_lock,
 						       flags);
-				if (in_interrupt())
+				if (in_atomic())
 					sclp_sync_wait();
 				else
 					wait_event(sclp_vt220_waitq,
-- 
cgit v1.1


From 0189103c69f47712a0c542a8bc28ff46ebe53a8a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:49 +0100
Subject: [S390] Remove BUILD_BUG_ON() in vmem code.

Remove BUILD_BUG_ON() in vmem code since it causes build failures if
the size of struct page increases. Instead calculate at compile time
the address of the highest physical address that can be added to the
1:1 mapping.
This supposed to fix a build failure with the page owner tracking leak
detector patches as reported by akpm.

page-owner-tracking-leak-detector-broken-on-s390.patch can be removed
from -mm again when this is merged.

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/setup.c   |  2 +-
 arch/s390/mm/vmem.c        |  3 +--
 include/asm-s390/pgtable.h | 12 +++++++++---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 72d20f7..29ae165 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -528,7 +528,7 @@ static void __init setup_memory_end(void)
 	memory_size = 0;
 	memory_end &= PAGE_MASK;
 
-	max_mem = memory_end ? min(VMALLOC_START, memory_end) : VMALLOC_START;
+	max_mem = memory_end ? min(VMEM_MAX_PHYS, memory_end) : VMEM_MAX_PHYS;
 	memory_end = min(max_mem, memory_end);
 
 	/*
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 07e0467..7c1287c 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -250,7 +250,7 @@ static int insert_memory_segment(struct memory_segment *seg)
 {
 	struct memory_segment *tmp;
 
-	if (seg->start + seg->size >= VMALLOC_START ||
+	if (seg->start + seg->size >= VMEM_MAX_PHYS ||
 	    seg->start + seg->size < seg->start)
 		return -ERANGE;
 
@@ -360,7 +360,6 @@ void __init vmem_map_init(void)
 {
 	int i;
 
-	BUILD_BUG_ON((unsigned long)VMEM_MAP + VMEM_MAP_SIZE > VMEM_MAP_MAX);
 	NODE_DATA(0)->node_mem_map = VMEM_MAP;
 	for (i = 0; i < MEMORY_CHUNKS && memory_chunk[i].size > 0; i++)
 		vmem_add_mem(memory_chunk[i].addr, memory_chunk[i].size);
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index 79b9eab..3f52075 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -115,15 +115,21 @@ extern char empty_zero_page[PAGE_SIZE];
 #ifndef __s390x__
 #define VMALLOC_START	0x78000000UL
 #define VMALLOC_END	0x7e000000UL
-#define VMEM_MAP_MAX	0x80000000UL
+#define VMEM_MAP_END	0x80000000UL
 #else /* __s390x__ */
 #define VMALLOC_START	0x3e000000000UL
 #define VMALLOC_END	0x3e040000000UL
-#define VMEM_MAP_MAX	0x40000000000UL
+#define VMEM_MAP_END	0x40000000000UL
 #endif /* __s390x__ */
 
+/*
+ * VMEM_MAX_PHYS is the highest physical address that can be added to the 1:1
+ * mapping. This needs to be calculated at compile time since the size of the
+ * VMEM_MAP is static but the size of struct page can change.
+ */
+#define VMEM_MAX_PHYS	min(VMALLOC_START, ((VMEM_MAP_END - VMALLOC_END) / \
+			  sizeof(struct page) * PAGE_SIZE) & ~((16 << 20) - 1))
 #define VMEM_MAP	((struct page *) VMALLOC_END)
-#define VMEM_MAP_SIZE	((VMALLOC_START / PAGE_SIZE) * sizeof(struct page))
 
 /*
  * A 31 bit pagetable entry of S390 has following format:
-- 
cgit v1.1


From c5411dba58c28736d25cffef65da1e01ed7d1423 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 5 Feb 2008 16:50:50 +0100
Subject: [S390] dcss: Initialize workqueue before using it.

In case a dcss segment cannot be loaded blk_cleanup_queue
will be called before blk_queue_make_request, leaving the
struct work unplug_work of the request queue uninitialized
before it is used.
That leads also to the lockdep message below.
To avoid that call blk_queue_make_request right after the
request_queue has been allocated.
This makes sure that the struct work is always initialized
before it is used.

INFO: trying to register non-static key.
the code is fine but needs lockdep annotation.
turning off the locking correctness validator.
CPU: 2 Not tainted 2.6.24 #6
Process swapper (pid: 1, task: 000000000f854038, ksp: 000000000f85f980)
040000000f85f860 000000000f85f880 0000000000000002 0000000000000000
       000000000f85f920 000000000f85f898 000000000f85f898 000000000001622e
       0000000000000000 000000000f85f980 0000000000000000 0000000000000000
       000000000f85f880 000000000000000c 000000000f85f880 000000000f85f8f0
       0000000000342908 000000000001622e 000000000f85f880 000000000f85f8d0
Call Trace:
([<000000000001619e>] show_trace+0xda/0x104)
 [<0000000000016288>] show_stack+0xc0/0xf8
 [<00000000000163d0>] dump_stack+0xb0/0xc0
 [<000000000006e4ea>] __lock_acquire+0x47e/0x1160
 [<000000000006f27c>] lock_acquire+0xb0/0xd8
 [<000000000005a522>] __cancel_work_timer+0x9e/0x240
 [<000000000005a72e>] cancel_work_sync+0x2a/0x3c
 [<0000000000165c46>] kblockd_flush_work+0x26/0x34
 [<0000000000169034>] blk_sync_queue+0x38/0x48
 [<0000000000169080>] blk_release_queue+0x3c/0xa8
 [<000000000017bce8>] kobject_cleanup+0x58/0xac
 [<000000000017bd66>] kobject_release+0x2a/0x38
 [<000000000017d28e>] kref_put+0x6e/0x94
 [<000000000017bc80>] kobject_put+0x38/0x48
 [<00000000001653be>] blk_put_queue+0x2a/0x38
 [<0000000000168fee>] blk_cleanup_queue+0x82/0x90
 [<0000000000213e7e>] dcssblk_add_store+0x34e/0x700
 [<00000000005243b8>] dcssblk_init+0x1a0/0x308
 [<000000000050a3c2>] kernel_init+0x1b2/0x3a4
 [<000000000001ac82>] kernel_thread_starter+0x6/0xc
 [<000000000001ac7c>] kernel_thread_starter+0x0/0xc

INFO: lockdep is turned off.

Cc: Gerald Schaefer <geraldsc@de.ibm.com>
Cc: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dcssblk.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 7779bfc..3faf053 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -415,6 +415,8 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->gd->queue = dev_info->dcssblk_queue;
 	dev_info->gd->private_data = dev_info;
 	dev_info->gd->driverfs_dev = &dev_info->dev;
+	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
+	blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096);
 	/*
 	 * load the segment
 	 */
@@ -472,9 +474,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	if (rc)
 		goto unregister_dev;
 
-	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
-	blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096);
-
 	add_disk(dev_info->gd);
 
 	switch (dev_info->segment_type) {
-- 
cgit v1.1


From ef3c4cb936d854d1564172f2dcce9c20d1b08761 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 4 Feb 2008 23:43:02 -0800
Subject: [IA64] remove dead code: __cpu_{down,die} from !HOTPLUG_CPU

Neither __cpu_down() nor __cpu_die() are being referenced without
CONFIG_HOTPLUG_CPU.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/smpboot.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index f0fc4d8..480b1a5 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -767,17 +767,6 @@ void __cpu_die(unsigned int cpu)
 	}
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
-#else /* !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
-{
-	return -ENOSYS;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-	/* We said "no" in __cpu_disable */
-	BUG();
-}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void
-- 
cgit v1.1


From 620de2f5dc697f906408743b1139fe5fb7b0b7f8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Mon, 4 Feb 2008 23:43:03 -0800
Subject: [IA64] honor notify_die() returning NOTIFY_STOP

This requires making die() and die_if_kernel() return a value, and their
callers to honor this (and be prepared that it returns).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/ia32/ia32_support.c |  5 +++--
 arch/ia64/kernel/traps.c      | 35 +++++++++++++++++++++++------------
 arch/ia64/kernel/unaligned.c  | 13 ++++++++-----
 arch/ia64/mm/fault.c          |  8 +++++---
 4 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/ia32/ia32_support.c b/arch/ia64/ia32/ia32_support.c
index d1d50cd..896b1eb 100644
--- a/arch/ia64/ia32/ia32_support.c
+++ b/arch/ia64/ia32/ia32_support.c
@@ -27,7 +27,7 @@
 
 #include "ia32priv.h"
 
-extern void die_if_kernel (char *str, struct pt_regs *regs, long err);
+extern int die_if_kernel (char *str, struct pt_regs *regs, long err);
 
 struct exec_domain ia32_exec_domain;
 struct page *ia32_shared_page[NR_CPUS];
@@ -217,7 +217,8 @@ ia32_bad_interrupt (unsigned long int_num, struct pt_regs *regs)
 {
 	siginfo_t siginfo;
 
-	die_if_kernel("Bad IA-32 interrupt", regs, int_num);
+	if (die_if_kernel("Bad IA-32 interrupt", regs, int_num))
+		return;
 
 	siginfo.si_signo = SIGTRAP;
 	siginfo.si_errno = int_num;	/* XXX is it OK to abuse si_errno like this? */
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 78d65cb..f0cda76 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -35,7 +35,7 @@ trap_init (void)
 		fpswa_interface = __va(ia64_boot_param->fpswa);
 }
 
-void
+int
 die (const char *str, struct pt_regs *regs, long err)
 {
 	static struct {
@@ -62,8 +62,11 @@ die (const char *str, struct pt_regs *regs, long err)
 	if (++die.lock_owner_depth < 3) {
 		printk("%s[%d]: %s %ld [%d]\n",
 		current->comm, task_pid_nr(current), str, err, ++die_counter);
-		(void) notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
-		show_regs(regs);
+		if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV)
+	            != NOTIFY_STOP)
+			show_regs(regs);
+		else
+			regs = NULL;
   	} else
 		printk(KERN_ERR "Recursive die() failure, output suppressed\n");
 
@@ -72,17 +75,22 @@ die (const char *str, struct pt_regs *regs, long err)
 	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die.lock);
 
+	if (!regs)
+		return 1;
+
 	if (panic_on_oops)
 		panic("Fatal exception");
 
   	do_exit(SIGSEGV);
+	return 0;
 }
 
-void
+int
 die_if_kernel (char *str, struct pt_regs *regs, long err)
 {
 	if (!user_mode(regs))
-		die(str, regs, err);
+		return die(str, regs, err);
+	return 0;
 }
 
 void
@@ -102,7 +110,8 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
 			       	== NOTIFY_STOP)
 			return;
-		die_if_kernel("bugcheck!", regs, break_num);
+		if (die_if_kernel("bugcheck!", regs, break_num))
+			return;
 		sig = SIGILL; code = ILL_ILLOPC;
 		break;
 
@@ -155,8 +164,9 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		break;
 
 	      default:
-		if (break_num < 0x40000 || break_num > 0x100000)
-			die_if_kernel("Bad break", regs, break_num);
+		if ((break_num < 0x40000 || break_num > 0x100000)
+		    && die_if_kernel("Bad break", regs, break_num))
+			return;
 
 		if (break_num < 0x80000) {
 			sig = SIGILL; code = __ILL_BREAK;
@@ -402,14 +412,15 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
 #endif
 
 	sprintf(buf, "IA-64 Illegal operation fault");
-	die_if_kernel(buf, &regs, 0);
+	rv.fkt = 0;
+	if (die_if_kernel(buf, &regs, 0))
+		return rv;
 
 	memset(&si, 0, sizeof(si));
 	si.si_signo = SIGILL;
 	si.si_code = ILL_ILLOPC;
 	si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri);
 	force_sig_info(SIGILL, &si, current);
-	rv.fkt = 0;
 	return rv;
 }
 
@@ -644,6 +655,6 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 		sprintf(buf, "Fault %lu", vector);
 		break;
 	}
-	die_if_kernel(buf, &regs, error);
-	force_sig(SIGILL, current);
+	if (!die_if_kernel(buf, &regs, error))
+		force_sig(SIGILL, current);
 }
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index f6a1aeb..52f70bb 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -23,7 +23,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
-extern void die_if_kernel(char *str, struct pt_regs *regs, long err);
+extern int die_if_kernel(char *str, struct pt_regs *regs, long err);
 
 #undef DEBUG_UNALIGNED_TRAP
 
@@ -675,8 +675,9 @@ emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsi
 	 */
 	if (ld.x6_op == 1 || ld.x6_op == 3) {
 		printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__);
-		die_if_kernel("unaligned reference on speculative load with register update\n",
-			      regs, 30);
+		if (die_if_kernel("unaligned reference on speculative load with register update\n",
+				  regs, 30))
+			return;
 	}
 
 
@@ -1317,7 +1318,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 
 	if (ia64_psr(regs)->be) {
 		/* we don't support big-endian accesses */
-		die_if_kernel("big-endian unaligned accesses are not supported", regs, 0);
+		if (die_if_kernel("big-endian unaligned accesses are not supported", regs, 0))
+			return;
 		goto force_sigbus;
 	}
 
@@ -1534,7 +1536,8 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			ia64_handle_exception(regs, eh);
 			goto done;
 		}
-		die_if_kernel("error during unaligned kernel access\n", regs, ret);
+		if (die_if_kernel("error during unaligned kernel access\n", regs, ret))
+			return;
 		/* NOT_REACHED */
 	}
   force_sigbus:
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 7571076..3e69881 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -16,7 +16,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-extern void die (char *, struct pt_regs *, long);
+extern int die(char *, struct pt_regs *, long);
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -267,9 +267,11 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	else
 		printk(KERN_ALERT "Unable to handle kernel paging request at "
 		       "virtual address %016lx\n", address);
-	die("Oops", regs, isr);
+	if (die("Oops", regs, isr))
+		regs = NULL;
 	bust_spinlocks(0);
-	do_exit(SIGKILL);
+	if (regs)
+		do_exit(SIGKILL);
 	return;
 
   out_of_memory:
-- 
cgit v1.1


From e1b0d4ba46b42909d11ea152a6b56ee76f062ca3 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 4 Feb 2008 23:43:03 -0800
Subject: [IA64] make pfm_get_task work with virtual pids

This pid comes from user space, so treat it accordingly.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/perfmon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 48e5609..78acd9f 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2654,11 +2654,11 @@ pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task)
 	/* XXX: need to add more checks here */
 	if (pid < 2) return -EPERM;
 
-	if (pid != current->pid) {
+	if (pid != task_pid_vnr(current)) {
 
 		read_lock(&tasklist_lock);
 
-		p = find_task_by_pid(pid);
+		p = find_task_by_vpid(pid);
 
 		/* make sure task cannot go away while we operate on it */
 		if (p) get_task_struct(p);
-- 
cgit v1.1


From c0b49b0d164c4902e53c17d90e2c5e5a2ac9e132 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:27:18 -0800
Subject: kvm: i386 fix

arch/x86/kvm/x86.c: In function 'emulator_cmpxchg_emulated':
arch/x86/kvm/x86.c:1746: warning: passing argument 2 of 'vcpu->arch.mmu.gva_to_gpa' makes integer from pointer without a cast
arch/x86/kvm/x86.c:1746: warning: 'addr' is used uninitialized in this function

Is true.  Local variable `addr' shadows incoming arg `addr'.  Avi is on
vacation for a while, so...

Cc: Avi Kivity <avi@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kvm/x86.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f94a0b..cf53081 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1739,7 +1739,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 	if (bytes == 8) {
 		gpa_t gpa;
 		struct page *page;
-		char *addr;
+		char *kaddr;
 		u64 val;
 
 		down_read(&current->mm->mmap_sem);
@@ -1754,9 +1754,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 
 		val = *(u64 *)new;
 		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-		addr = kmap_atomic(page, KM_USER0);
-		set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
-		kunmap_atomic(addr, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER0);
+		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+		kunmap_atomic(kaddr, KM_USER0);
 		kvm_release_page_dirty(page);
 	emul_write:
 		up_read(&current->mm->mmap_sem);
-- 
cgit v1.1


From 8a459e44ad837018ea5c34a9efe8eb4ad27ded26 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 4 Feb 2008 22:27:18 -0800
Subject: sys_remap_file_pages: fix ->vm_file accounting

Fix ->vm_file accounting, mmap_region() may do do_munmap().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/fremap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf..69a37c2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		 */
 		if (mapping_cap_account_dirty(mapping)) {
 			unsigned long addr;
+			struct file *file = vma->vm_file;
 
 			flags &= MAP_NONBLOCK;
-			addr = mmap_region(vma->vm_file, start, size,
+			get_file(file);
+			addr = mmap_region(file, start, size,
 					flags, vma->vm_flags, pgoff, 1);
+			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
 			} else {
-- 
cgit v1.1


From 96cf49a2c13e8dcf442abaadf6645f6a1fb3ae92 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:27:19 -0800
Subject: drivers/net/wireless/b43/main.c needs io.h

m68k:

drivers/net/wireless/b43/main.c:251: error: implicit declaration of function 'mmiowb'

Cc: "John W. Linville" <linville@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/wireless/b43/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c
index 64c154d..6faa57a 100644
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -38,6 +38,7 @@
 #include <linux/wireless.h>
 #include <linux/workqueue.h>
 #include <linux/skbuff.h>
+#include <linux/io.h>
 #include <linux/dma-mapping.h>
 #include <asm/unaligned.h>
 
-- 
cgit v1.1


From 0ccf831cbee94df9c5006dd46248c0f07847dd7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 4 Feb 2008 22:27:20 -0800
Subject: lockdep: annotate epoll

On Sat, 2008-01-05 at 13:35 -0800, Davide Libenzi wrote:

> I remember I talked with Arjan about this time ago. Basically, since 1)
> you can drop an epoll fd inside another epoll fd 2) callback-based wakeups
> are used, you can see a wake_up() from inside another wake_up(), but they
> will never refer to the same lock instance.
> Think about:
>
> 	dfd = socket(...);
> 	efd1 = epoll_create();
> 	efd2 = epoll_create();
> 	epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
> 	epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
>
> When a packet arrives to the device underneath "dfd", the net code will
> issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
> callback wakeup entry on that queue, and the wake_up() performed by the
> "dfd" net code will end up in ep_poll_callback(). At this point epoll
> (efd1) notices that it may have some event ready, so it needs to wake up
> the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
> that ends up in another wake_up(), after having checked about the
> recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
> avoid stack blasting. Never hit the same queue, to avoid loops like:
>
> 	epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
> 	epoll_ctl(efd3, EPOLL_CTL_ADD, efd2, ...);
> 	epoll_ctl(efd4, EPOLL_CTL_ADD, efd3, ...);
> 	epoll_ctl(efd1, EPOLL_CTL_ADD, efd4, ...);
>
> The code "if (tncur->wq == wq || ..." prevents re-entering the same
> queue/lock.

Since the epoll code is very careful to not nest same instance locks
allow the recursion.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c       |  2 +-
 include/linux/wait.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 81c04ab..a415f42 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -353,7 +353,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 	spin_unlock_irqrestore(&psw->lock, flags);
 
 	/* Do really wake up now */
-	wake_up(wq);
+	wake_up_nested(wq, 1 + wake_nests);
 
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&psw->lock, flags);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1f4fb0a..33a2aa9 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -162,6 +162,22 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
 #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
 #define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * macro to avoid include hell
+ */
+#define wake_up_nested(x, s)						\
+do {									\
+	unsigned long flags;						\
+									\
+	spin_lock_irqsave_nested(&(x)->lock, flags, (s));		\
+	wake_up_locked(x); 						\
+	spin_unlock_irqrestore(&(x)->lock, flags);			\
+} while (0)
+#else
+#define wake_up_nested(x, s)		wake_up(x)
+#endif
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	DEFINE_WAIT(__wait);						\
-- 
cgit v1.1


From 59714d65dfbc86d5cb93adc5bac57a921cc2fa84 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:27:21 -0800
Subject: get_task_comm(): return the result

It was dumb to make get_task_comm() return void.  Change it to return a
pointer to the resulting output for caller convenience.

Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c             | 3 ++-
 include/linux/sched.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 282240a..966c5c5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -947,12 +947,13 @@ static void flush_old_files(struct files_struct * files)
 	spin_unlock(&files->file_lock);
 }
 
-void get_task_comm(char *buf, struct task_struct *tsk)
+char *get_task_comm(char *buf, struct task_struct *tsk)
 {
 	/* buf must be at least sizeof(tsk->comm) in size */
 	task_lock(tsk);
 	strncpy(buf, tsk->comm, sizeof(tsk->comm));
 	task_unlock(tsk);
+	return buf;
 }
 
 void set_task_comm(struct task_struct *tsk, char *buf)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af6947e..680bb03a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1770,7 +1770,7 @@ extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned lon
 struct task_struct *fork_idle(int);
 
 extern void set_task_comm(struct task_struct *tsk, char *from);
-extern void get_task_comm(char *to, struct task_struct *tsk);
+extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
 extern void wait_task_inactive(struct task_struct * p);
-- 
cgit v1.1


From bdff746a3915f109bd13730b6847e33e17e91ed3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:27:22 -0800
Subject: clone: prepare to recycle CLONE_STOPPED

Ulrich says that we never used this clone flags and that nothing should be
using it.

As we're down to only a single bit left in clone's flags argument, let's add a
warning to check that no userspace is actually using it.  Hopefully we will
be able to recycle it.

Roland said:

  CLONE_STOPPED was previously used by some NTPL versions when under
  thread_db (i.e.  only when being actively debugged by gdb), but not for a
  long time now, and it never worked reliably when it was used.  Removing it
  seems fine to me.

[akpm@linux-foundation.org: it looks like CLONE_DETACHED is being used]
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 05e0b6f..6caf4f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1450,6 +1450,23 @@ long do_fork(unsigned long clone_flags,
 	int trace = 0;
 	long nr;
 
+	/*
+	 * We hope to recycle these flags after 2.6.26
+	 */
+	if (unlikely(clone_flags & CLONE_STOPPED)) {
+		static int __read_mostly count = 100;
+
+		if (count > 0 && printk_ratelimit()) {
+			char comm[TASK_COMM_LEN];
+
+			count--;
+			printk(KERN_INFO "fork(): process `%s' used deprecated "
+					"clone flags 0x%lx\n",
+				get_task_comm(comm, current),
+				clone_flags & CLONE_STOPPED);
+		}
+	}
+
 	if (unlikely(current->ptrace)) {
 		trace = fork_traceflag (clone_flags);
 		if (trace)
-- 
cgit v1.1


From 198466b41d11dd062fb26ee0376080458d7bfcaf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 4 Feb 2008 22:27:23 -0800
Subject: __group_complete_signal(): fix coredump with group stop race

When __group_complete_signal() sees sig_kernel_coredump() signal, it starts
the group stop, but sets ->group_exit_task = t in a hope that "t" will
actually dequeue this signal and invoke do_coredump().  However, by the
time "t" enters get_signal_to_deliver() it is possible that the signal was
blocked/ignored or we have another pending !SIG_KERNEL_COREDUMP_MASK signal
which will be dequeued first.  This means the task could be stopped but not
killed.

Remove this code from __group_complete_signal().  Note also this patch
removes the bogus signal_wake_up(t, 1).  This thread can't be
STOPPED/TRACED, note the corresponding check in wants_signal().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Holt <holt@sgi.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 4333b6d..ea90c34 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -911,27 +911,6 @@ __group_complete_signal(int sig, struct task_struct *p)
 			} while_each_thread(p, t);
 			return;
 		}
-
-		/*
-		 * There will be a core dump.  We make all threads other
-		 * than the chosen one go into a group stop so that nothing
-		 * happens until it gets scheduled, takes the signal off
-		 * the shared queue, and does the core dump.  This is a
-		 * little more complicated than strictly necessary, but it
-		 * keeps the signal state that winds up in the core dump
-		 * unchanged from the death state, e.g. which thread had
-		 * the core-dump signal unblocked.
-		 */
-		rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
-		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
-		p->signal->group_stop_count = 0;
-		p->signal->group_exit_task = t;
-		p = t;
-		do {
-			p->signal->group_stop_count++;
-			signal_wake_up(t, t == p);
-		} while_each_thread(p, t);
-		return;
 	}
 
 	/*
@@ -1762,15 +1741,6 @@ static int handle_group_stop(void)
 {
 	int stop_count;
 
-	if (current->signal->group_exit_task == current) {
-		/*
-		 * Group stop is so we can do a core dump,
-		 * We are the initiating thread, so get on with it.
-		 */
-		current->signal->group_exit_task = NULL;
-		return 0;
-	}
-
 	if (current->signal->flags & SIGNAL_GROUP_EXIT)
 		/*
 		 * Group stop is so another thread can do a core dump,
-- 
cgit v1.1


From f558b7e408026eb3c6afcd0e8fc1f7fe31195a6a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 4 Feb 2008 22:27:24 -0800
Subject: remove handle_group_stop() in favor of do_signal_stop()

Every time we set SIGNAL_GROUP_EXIT or clear SIGNAL_STOP_DEQUEUED we also
reset ->group_stop_count.

This means that the SIGNAL_GROUP_EXIT check in handle_group_stop() is not
needed, and do_signal_stop() should check SIGNAL_STOP_DEQUEUED only when
->group_stop_count == 0. With these changes handle_group_stop() becomes the
subset of do_signal_stop(), we can kill it and use do_signal_stop() instead.

Also, a preparation for the next patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Holt <holt@sgi.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 43 +++++--------------------------------------
 1 file changed, 5 insertions(+), 38 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index ea90c34..1117b28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1688,9 +1688,6 @@ static int do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;
 	int stop_count;
 
-	if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
-		return 0;
-
 	if (sig->group_stop_count > 0) {
 		/*
 		 * There is a group stop in progress.  We don't need to
@@ -1698,12 +1695,14 @@ static int do_signal_stop(int signr)
 		 */
 		stop_count = --sig->group_stop_count;
 	} else {
+		struct task_struct *t;
+
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
+			return 0;
 		/*
 		 * There is no group stop already in progress.
 		 * We must initiate one now.
 		 */
-		struct task_struct *t;
-
 		sig->group_exit_code = signr;
 
 		stop_count = 0;
@@ -1731,38 +1730,6 @@ static int do_signal_stop(int signr)
 	return 1;
 }
 
-/*
- * Do appropriate magic when group_stop_count > 0.
- * We return nonzero if we stopped, after releasing the siglock.
- * We return zero if we still hold the siglock and should look
- * for another signal without checking group_stop_count again.
- */
-static int handle_group_stop(void)
-{
-	int stop_count;
-
-	if (current->signal->flags & SIGNAL_GROUP_EXIT)
-		/*
-		 * Group stop is so another thread can do a core dump,
-		 * or else we are racing against a death signal.
-		 * Just punt the stop so we can get the next signal.
-		 */
-		return 0;
-
-	/*
-	 * There is a group stop in progress.  We stop
-	 * without any associated signal being in our queue.
-	 */
-	stop_count = --current->signal->group_stop_count;
-	if (stop_count == 0)
-		current->signal->flags = SIGNAL_STOP_STOPPED;
-	current->exit_code = current->signal->group_exit_code;
-	set_current_state(TASK_STOPPED);
-	spin_unlock_irq(&current->sighand->siglock);
-	finish_stop(stop_count);
-	return 1;
-}
-
 int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 			  struct pt_regs *regs, void *cookie)
 {
@@ -1777,7 +1744,7 @@ relock:
 		struct k_sigaction *ka;
 
 		if (unlikely(current->signal->group_stop_count > 0) &&
-		    handle_group_stop())
+		    do_signal_stop(0))
 			goto relock;
 
 		signr = dequeue_signal(current, mask, info);
-- 
cgit v1.1


From ed5d2cac114202fe2978a9cbcab8f5032796d538 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 4 Feb 2008 22:27:24 -0800
Subject: exec: rework the group exit and fix the race with kill

As Roland pointed out, we have the very old problem with exec.  de_thread()
sets SIGNAL_GROUP_EXIT, kills other threads, changes ->group_leader and then
clears signal->flags.  All signals (even fatal ones) sent in this window
(which is not too small) will be lost.

With this patch exec doesn't abuse SIGNAL_GROUP_EXIT.  signal_group_exit(),
the new helper, should be used to detect exit_group() or exec() in progress.
It can have more users, but this patch does only strictly necessary changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Holt <holt@sgi.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c             | 13 ++++---------
 include/linux/sched.h |  7 +++++++
 kernel/exit.c         |  3 ++-
 kernel/signal.c       |  4 ++--
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 966c5c5..be923e4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -760,7 +760,7 @@ static int de_thread(struct task_struct *tsk)
 	 */
 	read_lock(&tasklist_lock);
 	spin_lock_irq(lock);
-	if (sig->flags & SIGNAL_GROUP_EXIT) {
+	if (signal_group_exit(sig)) {
 		/*
 		 * Another group action in progress, just
 		 * return so that the signal is processed.
@@ -778,6 +778,7 @@ static int de_thread(struct task_struct *tsk)
 	if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
 		task_active_pid_ns(tsk)->child_reaper = tsk;
 
+	sig->group_exit_task = tsk;
 	zap_other_threads(tsk);
 	read_unlock(&tasklist_lock);
 
@@ -802,7 +803,6 @@ static int de_thread(struct task_struct *tsk)
 	}
 
 	sig->notify_count = count;
-	sig->group_exit_task = tsk;
 	while (atomic_read(&sig->count) > count) {
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(lock);
@@ -871,15 +871,10 @@ static int de_thread(struct task_struct *tsk)
 		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
-        }
+	}
 
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-	/*
-	 * There may be one thread left which is just exiting,
-	 * but it's safe to stop telling the group to kill themselves.
-	 */
-	sig->flags = 0;
 
 no_thread_group:
 	exit_itimers(sig);
@@ -1549,7 +1544,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	int err = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
-	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+	if (!signal_group_exit(tsk->signal)) {
 		tsk->signal->group_exit_code = exit_code;
 		zap_process(tsk);
 		err = 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 680bb03a..483ea4e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -555,6 +555,13 @@ struct signal_struct {
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
 
+/* If true, all threads except ->group_exit_task have pending SIGKILL */
+static inline int signal_group_exit(const struct signal_struct *sig)
+{
+	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
+		(sig->group_exit_task != NULL);
+}
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e459fe..9d3d0f0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1083,11 +1083,12 @@ do_group_exit(int exit_code)
 		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
 		spin_lock_irq(&sighand->siglock);
-		if (sig->flags & SIGNAL_GROUP_EXIT)
+		if (signal_group_exit(sig))
 			/* Another thread got here before we took the lock.  */
 			exit_code = sig->group_exit_code;
 		else {
 			sig->group_exit_code = exit_code;
+			sig->flags = SIGNAL_GROUP_EXIT;
 			zap_other_threads(current);
 		}
 		spin_unlock_irq(&sighand->siglock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1117b28..6a5f97c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -957,7 +957,6 @@ void zap_other_threads(struct task_struct *p)
 {
 	struct task_struct *t;
 
-	p->signal->flags = SIGNAL_GROUP_EXIT;
 	p->signal->group_stop_count = 0;
 
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
@@ -1697,7 +1696,8 @@ static int do_signal_stop(int signr)
 	} else {
 		struct task_struct *t;
 
-		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+		    unlikely(sig->group_exit_task))
 			return 0;
 		/*
 		 * There is no group stop already in progress.
-- 
cgit v1.1


From 5e05ad7d4e3b11f935998882b5d9c3b257137f1b Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Mon, 4 Feb 2008 22:27:25 -0800
Subject: timerfd: introduce a new hrtimer_forward_now() function

I think that advancing the timer against the timer's current "now" can be a
pretty common usage, so, w/out exposing hrtimer's internals, we add a new
hrtimer_forward_now() function.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hrtimer.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f79dcba..3fed27c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -304,6 +304,13 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
 extern unsigned long
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
+/* Forward a hrtimer so it expires after the hrtimer's current now */
+static inline unsigned long hrtimer_forward_now(struct hrtimer *timer,
+						ktime_t interval)
+{
+	return hrtimer_forward(timer, timer->base->get_time(), interval);
+}
+
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec *rqtp,
 			      struct timespec *rmtp,
-- 
cgit v1.1


From 4d672e7ac79b5ec5cdc90e450823441e20464691 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Mon, 4 Feb 2008 22:27:26 -0800
Subject: timerfd: new timerfd API

This is the new timerfd API as it is implemented by the following patch:

int timerfd_create(int clockid, int flags);
int timerfd_settime(int ufd, int flags,
		    const struct itimerspec *utmr,
		    struct itimerspec *otmr);
int timerfd_gettime(int ufd, struct itimerspec *otmr);

The timerfd_create() API creates an un-programmed timerfd fd.  The "clockid"
parameter can be either CLOCK_MONOTONIC or CLOCK_REALTIME.

The timerfd_settime() API give new settings by the timerfd fd, by optionally
retrieving the previous expiration time (in case the "otmr" parameter is not
NULL).

The time value specified in "utmr" is absolute, if the TFD_TIMER_ABSTIME bit
is set in the "flags" parameter.  Otherwise it's a relative time.

The timerfd_gettime() API returns the next expiration time of the timer, or
{0, 0} if the timerfd has not been set yet.

Like the previous timerfd API implementation, read(2) and poll(2) are
supported (with the same interface).  Here's a simple test program I used to
exercise the new timerfd APIs:

http://www.xmailserver.org/timerfd-test2.c

[akpm@linux-foundation.org: coding-style cleanups]
[akpm@linux-foundation.org: fix ia64 build]
[akpm@linux-foundation.org: fix m68k build]
[akpm@linux-foundation.org: fix mips build]
[akpm@linux-foundation.org: fix alpha, arm, blackfin, cris, m68k, s390, sparc and sparc64 builds]
[heiko.carstens@de.ibm.com: fix s390]
[akpm@linux-foundation.org: fix powerpc build]
[akpm@linux-foundation.org: fix sparc64 more]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/systbls.S          |   2 +-
 arch/arm/kernel/calls.S              |   2 +-
 arch/blackfin/mach-common/entry.S    |   2 +-
 arch/cris/arch-v10/kernel/entry.S    |   2 +-
 arch/ia64/kernel/entry.S             |   2 +-
 arch/m68k/kernel/entry.S             |   2 +-
 arch/m68knommu/kernel/syscalltable.S |   2 +-
 arch/mips/kernel/scall32-o32.S       |   2 +-
 arch/mips/kernel/scall64-64.S        |   2 +-
 arch/mips/kernel/scall64-n32.S       |   2 +-
 arch/mips/kernel/scall64-o32.S       |   2 +-
 arch/s390/kernel/compat_wrapper.S    |   8 --
 arch/s390/kernel/syscalls.S          |   2 +-
 arch/sparc/kernel/systbls.S          |   2 +-
 arch/sparc64/kernel/systbls.S        |   4 +-
 fs/compat.c                          |  32 +++++-
 fs/timerfd.c                         | 207 +++++++++++++++++++++++------------
 include/asm-powerpc/systbl.h         |   2 +-
 include/linux/compat.h               |   7 +-
 include/linux/hrtimer.h              |  10 +-
 include/linux/syscalls.h             |   7 +-
 kernel/hrtimer.c                     |   9 +-
 kernel/posix-timers.c                |   9 +-
 kernel/sys_ni.c                      |   7 +-
 24 files changed, 210 insertions(+), 118 deletions(-)

diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 79de99e3..ba914af 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -495,7 +495,7 @@ sys_call_table:
 	.quad sys_epoll_pwait
 	.quad sys_utimensat			/* 475 */
 	.quad sys_signalfd
-	.quad sys_timerfd
+	.quad sys_ni_syscall
 	.quad sys_eventfd
 
 	.size sys_call_table, . - sys_call_table
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index cecf658..283e14f 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -359,7 +359,7 @@
 		CALL(sys_kexec_load)
 		CALL(sys_utimensat)
 		CALL(sys_signalfd)
-/* 350 */	CALL(sys_timerfd)
+/* 350 */	CALL(sys_ni_syscall)
 		CALL(sys_eventfd)
 		CALL(sys_fallocate)
 #ifndef syscalls_counted
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 56ff51b..fdd9bf43 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1373,7 +1373,7 @@ ENTRY(_sys_call_table)
 	.long _sys_epoll_pwait
 	.long _sys_utimensat
 	.long _sys_signalfd
-	.long _sys_timerfd
+	.long _sys_ni_syscall
 	.long _sys_eventfd	/* 350 */
 	.long _sys_pread64
 	.long _sys_pwrite64
diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S
index ec62c95..d1361dc 100644
--- a/arch/cris/arch-v10/kernel/entry.S
+++ b/arch/cris/arch-v10/kernel/entry.S
@@ -1167,7 +1167,7 @@ sys_call_table:
 	.long sys_epoll_pwait
 	.long sys_utimensat		/* 320 */
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate
 
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index c36f43c..f5d3efb 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1586,7 +1586,7 @@ sys_call_table:
 	data8 sys_epoll_pwait			// 1305
 	data8 sys_utimensat
 	data8 sys_signalfd
-	data8 sys_timerfd
+	data8 sys_ni_syscall
 	data8 sys_eventfd
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 918f5db..6dfa3b3 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -742,7 +742,7 @@ sys_call_table:
 	.long sys_epoll_pwait		/* 315 */
 	.long sys_utimensat
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate		/* 320 */
 
diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S
index 9620093..1b02b88 100644
--- a/arch/m68knommu/kernel/syscalltable.S
+++ b/arch/m68knommu/kernel/syscalltable.S
@@ -336,7 +336,7 @@ ENTRY(sys_call_table)
 	.long sys_epoll_pwait		/* 315 */
 	.long sys_utimensat
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate		/* 320 */
 
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 82480a1..f798139 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -660,7 +660,7 @@ einval:	li	v0, -EINVAL
 	sys	sys_ioprio_get		2	/* 4315 */
 	sys	sys_utimensat		4
 	sys	sys_signalfd		3
-	sys	sys_timerfd		4
+	sys	sys_ni_syscall		0
 	sys	sys_eventfd		1
 	sys	sys_fallocate		6	/* 4320 */
 	.endm
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index c2c1087..a626be6 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -475,7 +475,7 @@ sys_call_table:
 	PTR	sys_ioprio_get
 	PTR	sys_utimensat			/* 5275 */
 	PTR	sys_signalfd
-	PTR	sys_timerfd
+	PTR	sys_ni_syscall
 	PTR	sys_eventfd
 	PTR	sys_fallocate
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 01993ec..9d5bcaf 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -401,7 +401,7 @@ EXPORT(sysn32_call_table)
 	PTR	sys_ioprio_get
 	PTR	compat_sys_utimensat
 	PTR	compat_sys_signalfd		/* 5280 */
-	PTR	compat_sys_timerfd
+	PTR	sys_ni_syscall
 	PTR	sys_eventfd
 	PTR	sys_fallocate
 	.size	sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index dd68afc..fd2019c 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -523,7 +523,7 @@ sys_call_table:
 	PTR	sys_ioprio_get			/* 4315 */
 	PTR	compat_sys_utimensat
 	PTR	compat_sys_signalfd
-	PTR	compat_sys_timerfd
+	PTR	sys_ni_syscall
 	PTR	sys_eventfd
 	PTR	sys32_fallocate			/* 4320 */
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 6ee1bed..062c3d4 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1698,14 +1698,6 @@ compat_sys_signalfd_wrapper:
 	llgfr	%r4,%r4			# compat_size_t
 	jg	compat_sys_signalfd
 
-	.globl	compat_sys_timerfd_wrapper
-compat_sys_timerfd_wrapper:
-	lgfr	%r2,%r2			# int
-	lgfr	%r3,%r3			# int
-	lgfr	%r4,%r4			# int
-	llgtr	%r5,%r5			# struct compat_itimerspec *
-	jg	compat_sys_timerfd
-
 	.globl	sys_eventfd_wrapper
 sys_eventfd_wrapper:
 	llgfr	%r2,%r2			# unsigned int
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 9e26ed9..25eac78 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -325,5 +325,5 @@ SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
 SYSCALL(s390_fallocate,sys_fallocate,sys_fallocate_wrapper)
 SYSCALL(sys_utimensat,sys_utimensat,compat_sys_utimensat_wrapper)	/* 315 */
 SYSCALL(sys_signalfd,sys_signalfd,compat_sys_signalfd_wrapper)
-SYSCALL(sys_timerfd,sys_timerfd,compat_sys_timerfd_wrapper)
+NI_SYSCALL						/* 317 old sys_timer_fd */
 SYSCALL(sys_eventfd,sys_eventfd,sys_eventfd_wrapper)
diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S
index 5572284..ee010f4 100644
--- a/arch/sparc/kernel/systbls.S
+++ b/arch/sparc/kernel/systbls.S
@@ -79,7 +79,7 @@ sys_call_table:
 /*295*/	.long sys_fchmodat, sys_faccessat, sys_pselect6, sys_ppoll, sys_unshare
 /*300*/	.long sys_set_robust_list, sys_get_robust_list, sys_migrate_pages, sys_mbind, sys_get_mempolicy
 /*305*/	.long sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
-/*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate
+/*310*/	.long sys_utimensat, sys_signalfd, sys_ni_syscall, sys_eventfd, sys_fallocate
 
 #ifdef CONFIG_SUNOS_EMUL
 	/* Now the SunOS syscall table. */
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index 06d1090..b805890 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -80,7 +80,7 @@ sys_call_table32:
 	.word sys_fchmodat, sys_faccessat, compat_sys_pselect6, compat_sys_ppoll, sys_unshare
 /*300*/	.word compat_sys_set_robust_list, compat_sys_get_robust_list, compat_sys_migrate_pages, compat_sys_mbind, compat_sys_get_mempolicy
 	.word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait
-/*310*/	.word compat_sys_utimensat, compat_sys_signalfd, compat_sys_timerfd, sys_eventfd, compat_sys_fallocate
+/*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_ni_syscall, sys_eventfd, compat_sys_fallocate
 
 #endif /* CONFIG_COMPAT */
 
@@ -152,7 +152,7 @@ sys_call_table:
 	.word sys_fchmodat, sys_faccessat, sys_pselect6, sys_ppoll, sys_unshare
 /*300*/	.word sys_set_robust_list, sys_get_robust_list, sys_migrate_pages, sys_mbind, sys_get_mempolicy
 	.word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
-/*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate
+/*310*/	.word sys_utimensat, sys_signalfd, sys_ni_syscall, sys_eventfd, sys_fallocate
 
 #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \
     defined(CONFIG_SOLARIS_EMUL_MODULE)
diff --git a/fs/compat.c b/fs/compat.c
index 5216c3f..69baca5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2206,19 +2206,41 @@ asmlinkage long compat_sys_signalfd(int ufd,
 
 #ifdef CONFIG_TIMERFD
 
-asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags,
-				   const struct compat_itimerspec __user *utmr)
+asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
+				   const struct compat_itimerspec __user *utmr,
+				   struct compat_itimerspec __user *otmr)
 {
+	int error;
 	struct itimerspec t;
 	struct itimerspec __user *ut;
 
 	if (get_compat_itimerspec(&t, utmr))
 		return -EFAULT;
-	ut = compat_alloc_user_space(sizeof(*ut));
-	if (copy_to_user(ut, &t, sizeof(t)))
+	ut = compat_alloc_user_space(2 * sizeof(struct itimerspec));
+	if (copy_to_user(&ut[0], &t, sizeof(t)))
 		return -EFAULT;
+	error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]);
+	if (!error && otmr)
+		error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) ||
+			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
+
+	return error;
+}
+
+asmlinkage long compat_sys_timerfd_gettime(int ufd,
+				   struct compat_itimerspec __user *otmr)
+{
+	int error;
+	struct itimerspec t;
+	struct itimerspec __user *ut;
 
-	return sys_timerfd(ufd, clockid, flags, ut);
+	ut = compat_alloc_user_space(sizeof(struct itimerspec));
+	error = sys_timerfd_gettime(ufd, ut);
+	if (!error)
+		error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) ||
+			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
+
+	return error;
 }
 
 #endif /* CONFIG_TIMERFD */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 61983f3..10c80b5 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -25,13 +25,15 @@ struct timerfd_ctx {
 	struct hrtimer tmr;
 	ktime_t tintv;
 	wait_queue_head_t wqh;
+	u64 ticks;
 	int expired;
+	int clockid;
 };
 
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
- * tintv.tv64 != 0) until the timer is read.
+ * tintv.tv64 != 0) until the timer is accessed.
  */
 static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 {
@@ -40,13 +42,24 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	ctx->expired = 1;
+	ctx->ticks++;
 	wake_up_locked(&ctx->wqh);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return HRTIMER_NORESTART;
 }
 
-static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags,
+static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
+{
+	ktime_t now, remaining;
+
+	now = ctx->tmr.base->get_time();
+	remaining = ktime_sub(ctx->tmr.expires, now);
+
+	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
+}
+
+static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 			  const struct itimerspec *ktmr)
 {
 	enum hrtimer_mode htmode;
@@ -57,8 +70,9 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags,
 
 	texp = timespec_to_ktime(ktmr->it_value);
 	ctx->expired = 0;
+	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, clockid, htmode);
+	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
 	ctx->tmr.expires = texp;
 	ctx->tmr.function = timerfd_tmrproc;
 	if (texp.tv64 != 0)
@@ -83,7 +97,7 @@ static unsigned int timerfd_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &ctx->wqh, wait);
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->expired)
+	if (ctx->ticks)
 		events |= POLLIN;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
@@ -102,11 +116,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
-	if (!ctx->expired && !(file->f_flags & O_NONBLOCK)) {
+	if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) {
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (res = 0;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->expired) {
+			if (ctx->ticks) {
 				res = 0;
 				break;
 			}
@@ -121,22 +135,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (ctx->expired) {
-		ctx->expired = 0;
-		if (ctx->tintv.tv64 != 0) {
+	if (ctx->ticks) {
+		ticks = ctx->ticks;
+		if (ctx->expired && ctx->tintv.tv64) {
 			/*
 			 * If tintv.tv64 != 0, this is a periodic timer that
 			 * needs to be re-armed. We avoid doing it in the timer
 			 * callback to avoid DoS attacks specifying a very
 			 * short timer period.
 			 */
-			ticks = (u64)
-				hrtimer_forward(&ctx->tmr,
-						hrtimer_cb_get_time(&ctx->tmr),
-						ctx->tintv);
+			ticks += hrtimer_forward_now(&ctx->tmr,
+						     ctx->tintv) - 1;
 			hrtimer_restart(&ctx->tmr);
-		} else
-			ticks = 1;
+		}
+		ctx->expired = 0;
+		ctx->ticks = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 	if (ticks)
@@ -150,76 +163,132 @@ static const struct file_operations timerfd_fops = {
 	.read		= timerfd_read,
 };
 
-asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
-			    const struct itimerspec __user *utmr)
+static struct file *timerfd_fget(int fd)
+{
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return ERR_PTR(-EBADF);
+	if (file->f_op != &timerfd_fops) {
+		fput(file);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return file;
+}
+
+asmlinkage long sys_timerfd_create(int clockid, int flags)
 {
-	int error;
+	int error, ufd;
 	struct timerfd_ctx *ctx;
 	struct file *file;
 	struct inode *inode;
-	struct itimerspec ktmr;
-
-	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
-		return -EFAULT;
 
+	if (flags)
+		return -EINVAL;
 	if (clockid != CLOCK_MONOTONIC &&
 	    clockid != CLOCK_REALTIME)
 		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	init_waitqueue_head(&ctx->wqh);
+	ctx->clockid = clockid;
+	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+
+	error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
+				 &timerfd_fops, ctx);
+	if (error) {
+		kfree(ctx);
+		return error;
+	}
+
+	return ufd;
+}
+
+asmlinkage long sys_timerfd_settime(int ufd, int flags,
+				    const struct itimerspec __user *utmr,
+				    struct itimerspec __user *otmr)
+{
+	struct file *file;
+	struct timerfd_ctx *ctx;
+	struct itimerspec ktmr, kotmr;
+
+	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
+		return -EFAULT;
+
 	if (!timespec_valid(&ktmr.it_value) ||
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
-	if (ufd == -1) {
-		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
-		if (!ctx)
-			return -ENOMEM;
-
-		init_waitqueue_head(&ctx->wqh);
-
-		timerfd_setup(ctx, clockid, flags, &ktmr);
-
-		/*
-		 * When we call this, the initialization must be complete, since
-		 * anon_inode_getfd() will install the fd.
-		 */
-		error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
-					 &timerfd_fops, ctx);
-		if (error)
-			goto err_tmrcancel;
-	} else {
-		file = fget(ufd);
-		if (!file)
-			return -EBADF;
-		ctx = file->private_data;
-		if (file->f_op != &timerfd_fops) {
-			fput(file);
-			return -EINVAL;
-		}
-		/*
-		 * We need to stop the existing timer before reprogramming
-		 * it to the new values.
-		 */
-		for (;;) {
-			spin_lock_irq(&ctx->wqh.lock);
-			if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
-				break;
-			spin_unlock_irq(&ctx->wqh.lock);
-			cpu_relax();
-		}
-		/*
-		 * Re-program the timer to the new value ...
-		 */
-		timerfd_setup(ctx, clockid, flags, &ktmr);
+	file = timerfd_fget(ufd);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	ctx = file->private_data;
 
+	/*
+	 * We need to stop the existing timer before reprogramming
+	 * it to the new values.
+	 */
+	for (;;) {
+		spin_lock_irq(&ctx->wqh.lock);
+		if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
+			break;
 		spin_unlock_irq(&ctx->wqh.lock);
-		fput(file);
+		cpu_relax();
 	}
 
-	return ufd;
+	/*
+	 * If the timer is expired and it's periodic, we need to advance it
+	 * because the caller may want to know the previous expiration time.
+	 * We do not update "ticks" and "expired" since the timer will be
+	 * re-programmed again in the following timerfd_setup() call.
+	 */
+	if (ctx->expired && ctx->tintv.tv64)
+		hrtimer_forward_now(&ctx->tmr, ctx->tintv);
 
-err_tmrcancel:
-	hrtimer_cancel(&ctx->tmr);
-	kfree(ctx);
-	return error;
+	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+
+	/*
+	 * Re-program the timer to the new value ...
+	 */
+	timerfd_setup(ctx, flags, &ktmr);
+
+	spin_unlock_irq(&ctx->wqh.lock);
+	fput(file);
+	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
+		return -EFAULT;
+
+	return 0;
+}
+
+asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr)
+{
+	struct file *file;
+	struct timerfd_ctx *ctx;
+	struct itimerspec kotmr;
+
+	file = timerfd_fget(ufd);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	ctx = file->private_data;
+
+	spin_lock_irq(&ctx->wqh.lock);
+	if (ctx->expired && ctx->tintv.tv64) {
+		ctx->expired = 0;
+		ctx->ticks +=
+			hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
+		hrtimer_restart(&ctx->tmr);
+	}
+	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+	spin_unlock_irq(&ctx->wqh.lock);
+	fput(file);
+
+	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
 }
 
diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h
index 0c8b0d6..e996521 100644
--- a/include/asm-powerpc/systbl.h
+++ b/include/asm-powerpc/systbl.h
@@ -309,7 +309,7 @@ SYSCALL_SPU(getcpu)
 COMPAT_SYS(epoll_pwait)
 COMPAT_SYS_SPU(utimensat)
 COMPAT_SYS_SPU(signalfd)
-COMPAT_SYS_SPU(timerfd)
+SYSCALL(ni_syscall)
 SYSCALL_SPU(eventfd)
 COMPAT_SYS_SPU(sync_file_range2)
 COMPAT_SYS(fallocate)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index d38655f..ae0a483 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -279,8 +279,11 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
 asmlinkage long compat_sys_signalfd(int ufd,
 				const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize);
-asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags,
-				const struct compat_itimerspec __user *utmr);
+asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
+				   const struct compat_itimerspec __user *utmr,
+				   struct compat_itimerspec __user *otmr);
+asmlinkage long compat_sys_timerfd_gettime(int ufd,
+				   struct compat_itimerspec __user *otmr);
 
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3fed27c..8371b66 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -301,12 +301,12 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
 }
 
 /* Forward a hrtimer so it expires after now: */
-extern unsigned long
+extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
 /* Forward a hrtimer so it expires after the hrtimer's current now */
-static inline unsigned long hrtimer_forward_now(struct hrtimer *timer,
-						ktime_t interval)
+static inline u64 hrtimer_forward_now(struct hrtimer *timer,
+				      ktime_t interval)
 {
 	return hrtimer_forward(timer, timer->base->get_time(), interval);
 }
@@ -329,9 +329,9 @@ extern void hrtimer_run_pending(void);
 extern void __init hrtimers_init(void);
 
 #if BITS_PER_LONG < 64
-extern unsigned long ktime_divns(const ktime_t kt, s64 div);
+extern u64 ktime_divns(const ktime_t kt, s64 div);
 #else /* BITS_PER_LONG < 64 */
-# define ktime_divns(kt, div)		(unsigned long)((kt).tv64 / (div))
+# define ktime_divns(kt, div)		(u64)((kt).tv64 / (div))
 #endif
 
 /* Show pending timers: */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 61def7c8..4c2577b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -607,8 +607,11 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
-asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
-			    const struct itimerspec __user *utmr);
+asmlinkage long sys_timerfd_create(int clockid, int flags);
+asmlinkage long sys_timerfd_settime(int ufd, int flags,
+				    const struct itimerspec __user *utmr,
+				    struct itimerspec __user *otmr);
+asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1069998..668f396 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
 /*
  * Divide a ktime value by a nanosecond value
  */
-unsigned long ktime_divns(const ktime_t kt, s64 div)
+u64 ktime_divns(const ktime_t kt, s64 div)
 {
 	u64 dclc, inc, dns;
 	int sft = 0;
@@ -321,7 +321,7 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
 	dclc >>= sft;
 	do_div(dclc, (unsigned long) div);
 
-	return (unsigned long) dclc;
+	return dclc;
 }
 #endif /* BITS_PER_LONG >= 64 */
 
@@ -656,10 +656,9 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
  */
-unsigned long
-hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
+u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
-	unsigned long orun = 1;
+	u64 orun = 1;
 	ktime_t delta;
 
 	delta = ktime_sub(now, timer->expires);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 36d563f..122d5c7 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,8 +256,9 @@ static void schedule_next_timer(struct k_itimer *timr)
 	if (timr->it.real.interval.tv64 == 0)
 		return;
 
-	timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
-					    timr->it.real.interval);
+	timr->it_overrun += (unsigned int) hrtimer_forward(timer,
+						timer->base->get_time(),
+						timr->it.real.interval);
 
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1;
@@ -386,7 +387,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 					now = ktime_add(now, kj);
 			}
 #endif
-			timr->it_overrun +=
+			timr->it_overrun += (unsigned int)
 				hrtimer_forward(timer, now,
 						timr->it.real.interval);
 			ret = HRTIMER_RESTART;
@@ -662,7 +663,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 	 */
 	if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
 	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
-		timr->it_overrun += hrtimer_forward(timer, now, iv);
+		timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
 
 	remaining = ktime_sub(timer->expires, now);
 	/* Return 0 only, when the timer is expired and not pending */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index beee5b3..5b9b467 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -154,7 +154,10 @@ cond_syscall(sys_ioprio_get);
 
 /* New file descriptors */
 cond_syscall(sys_signalfd);
-cond_syscall(sys_timerfd);
 cond_syscall(compat_sys_signalfd);
-cond_syscall(compat_sys_timerfd);
+cond_syscall(sys_timerfd_create);
+cond_syscall(sys_timerfd_settime);
+cond_syscall(sys_timerfd_gettime);
+cond_syscall(compat_sys_timerfd_settime);
+cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
-- 
cgit v1.1


From cb9282ee589e57360ab701a7e450e03e7890401d Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Mon, 4 Feb 2008 22:27:28 -0800
Subject: timerfd: wire the new timerfd API to the x86 family

Wires up the new timerfd API to the x86 family.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32entry.S          | 4 +++-
 arch/x86/kernel/syscall_table_32.S | 4 +++-
 include/asm-x86/unistd_32.h        | 4 +++-
 include/asm-x86/unistd_64.h        | 9 +++++++--
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0db0a62..8022d3c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -722,7 +722,9 @@ ia32_sys_call_table:
 	.quad sys_epoll_pwait
 	.quad compat_sys_utimensat	/* 320 */
 	.quad compat_sys_signalfd
-	.quad compat_sys_timerfd
+	.quad sys_timerfd_create
 	.quad sys_eventfd
 	.quad sys32_fallocate
+	.quad compat_sys_timerfd_settime	/* 325 */
+	.quad compat_sys_timerfd_gettime
 ia32_syscall_end:
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8344c70..adff556 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -321,6 +321,8 @@ ENTRY(sys_call_table)
 	.long sys_epoll_pwait
 	.long sys_utimensat		/* 320 */
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_timerfd_create
 	.long sys_eventfd
 	.long sys_fallocate
+	.long sys_timerfd_settime	/* 325 */
+	.long sys_timerfd_gettime
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 8d8f9b5..984123a 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -327,9 +327,11 @@
 #define __NR_epoll_pwait	319
 #define __NR_utimensat		320
 #define __NR_signalfd		321
-#define __NR_timerfd		322
+#define __NR_timerfd_create	322
 #define __NR_eventfd		323
 #define __NR_fallocate		324
+#define __NR_timerfd_settime	325
+#define __NR_timerfd_gettime	326
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h
index 5ff4d3e..3883ceb 100644
--- a/include/asm-x86/unistd_64.h
+++ b/include/asm-x86/unistd_64.h
@@ -629,12 +629,17 @@ __SYSCALL(__NR_utimensat, sys_utimensat)
 __SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
 #define __NR_signalfd				282
 __SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_timerfd				283
-__SYSCALL(__NR_timerfd, sys_timerfd)
+#define __NR_timerfd_create			283
+__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
 #define __NR_eventfd				284
 __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate				285
 __SYSCALL(__NR_fallocate, sys_fallocate)
+#define __NR_timerfd_settime			286
+__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
+#define __NR_timerfd_gettime			287
+__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
+
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
-- 
cgit v1.1


From f79c343e2e5ba82b9661e7287a42fac596bf367a Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Mon, 4 Feb 2008 22:27:29 -0800
Subject: timerfd: un-break CONFIG_TIMERFD

Remove the broken status to CONFIG_TIMERFD.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index b2acdeb..dc28836 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -582,7 +582,6 @@ config SIGNALFD
 config TIMERFD
 	bool "Enable timerfd() system call" if EMBEDDED
 	select ANON_INODES
-	depends on BROKEN
 	default y
 	help
 	  Enable the timerfd() system call that allows to receive timer
-- 
cgit v1.1


From 7492d4a416d68ab4bd254b36ffcc4e0138daa8ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Mon, 4 Feb 2008 22:27:29 -0800
Subject: sdio: fix module device table definition for m68k

FATAL: drivers/bluetooth/btsdio: sizeof(struct sdio_device_id)=12 is not a modulo of the size of section __mod_sdio_device_table=30.
Fix definition of struct sdio_device_id in mod_devicetable.h

m68k has 16bit alignment for unsigned long.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Pierre Ossman <drzeus@drzeus.cx>
CC: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mod_devicetable.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index e9fddb4..139d49d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -343,7 +343,8 @@ struct sdio_device_id {
 	__u8	class;			/* Standard interface or SDIO_ANY_ID */
 	__u16	vendor;			/* Vendor or SDIO_ANY_ID */
 	__u16	device;			/* Device ID or SDIO_ANY_ID */
-	kernel_ulong_t driver_data;	/* Data private to the driver */
+	kernel_ulong_t driver_data	/* Data private to the driver */
+		__attribute__((aligned(sizeof(kernel_ulong_t))));
 };
 
 /* SSB core, see drivers/ssb/ */
-- 
cgit v1.1


From 134d495656d674511b7ea074f819d92fcc2024fa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:27:31 -0800
Subject: include/asm-powerpc/nvram.h needs list.h

CC [M]  sound/ppc/awacs.o
In file included from sound/ppc/awacs.c:24:
include/asm/nvram.h:62: error: field 'partition' has incomplete type

Reported-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-powerpc/nvram.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/asm-powerpc/nvram.h b/include/asm-powerpc/nvram.h
index 4e7059c..efde5ac 100644
--- a/include/asm-powerpc/nvram.h
+++ b/include/asm-powerpc/nvram.h
@@ -58,6 +58,9 @@ struct nvram_header {
 };
 
 #ifdef __KERNEL__
+
+#include <linux/list.h>
+
 struct nvram_partition {
 	struct list_head partition;
 	struct nvram_header header;
-- 
cgit v1.1


From 7852375bbbfc7fb9c1117d73914aeb3baf917539 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Mon, 4 Feb 2008 22:27:33 -0800
Subject: m32r: remove dead config symbols from M32R code

remove dead config symbols from M32R code

Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m32r/boot/compressed/m32r_sio.c     | 4 ++--
 include/asm-m32r/irq.h                   | 2 +-
 include/asm-m32r/m32700ut/m32700ut_pld.h | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/m32r/boot/compressed/m32r_sio.c b/arch/m32r/boot/compressed/m32r_sio.c
index ee3c8be..01d877c 100644
--- a/arch/m32r/boot/compressed/m32r_sio.c
+++ b/arch/m32r/boot/compressed/m32r_sio.c
@@ -17,7 +17,7 @@ static int puts(const char *s)
 	return 0;
 }
 
-#if defined(CONFIG_PLAT_M32700UT_Alpha) || defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_OPSPUT)
+#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_OPSPUT)
 #include <asm/m32r.h>
 #include <asm/io.h>
 
@@ -52,7 +52,7 @@ static void putc(char c)
 	}
 	*BOOT_SIO0TXB = c;
 }
-#else /* !(CONFIG_PLAT_M32700UT_Alpha) && !(CONFIG_PLAT_M32700UT) */
+#else /* !(CONFIG_PLAT_M32700UT) */
 #if defined(CONFIG_PLAT_MAPPI2)
 #define SIO0STS	(volatile unsigned short *)(0xa0efd000 + 14)
 #define SIO0TXB	(volatile unsigned short *)(0xa0efd000 + 30)
diff --git a/include/asm-m32r/irq.h b/include/asm-m32r/irq.h
index 2f93f47..242028b 100644
--- a/include/asm-m32r/irq.h
+++ b/include/asm-m32r/irq.h
@@ -3,7 +3,7 @@
 #define _ASM_M32R_IRQ_H
 
 
-#if defined(CONFIG_PLAT_M32700UT_Alpha) || defined(CONFIG_PLAT_USRV)
+#if defined(CONFIG_PLAT_USRV)
 /*
  * IRQ definitions for M32700UT
  *  M32700 Chip: 64 interrupts
diff --git a/include/asm-m32r/m32700ut/m32700ut_pld.h b/include/asm-m32r/m32700ut/m32700ut_pld.h
index d391212..57623be 100644
--- a/include/asm-m32r/m32700ut/m32700ut_pld.h
+++ b/include/asm-m32r/m32700ut/m32700ut_pld.h
@@ -13,9 +13,7 @@
  * this archive for more details.
  */
 
-#if defined(CONFIG_PLAT_M32700UT_Alpha)
-#define PLD_PLAT_BASE		0x08c00000
-#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_USRV)
+#if defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_USRV)
 #define PLD_PLAT_BASE		0x04c00000
 #else
 #error "no platform configuration"
-- 
cgit v1.1


From ecb8a8472f6d314096f20885722f2033d2071719 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Mon, 4 Feb 2008 22:27:34 -0800
Subject: pcmcia: convert some internal-only ioaddr_t to unsigned int

Convert the io_req_t members to unsigned int, to allow use on machines with
more than 16 bits worth of IO ports (i.e.  secondary busses on ppc64, etc).

There was only a couple of places in drivers where a change was needed.  I
left printk formats alone (there are lots of %04x-style formats in there),
mostly to not change the format on the platforms that only have 16-bit io
addresses, but also because the padding doesn't really add all that much value
most of the time.

I found only one sprintf of an address, and upsized the string accordingly (I
doubt anyone will have anywhere near INT_MAX as irq value, but at least
there's room for it now).

Signed-off-by: Olof Johansson <olof@lixom.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/pcmcia/cm4000_cs.c    | 17 +++++++++--------
 drivers/pcmcia/pcmcia_resource.c   | 14 +++++++-------
 drivers/scsi/pcmcia/fdomain_stub.c |  2 +-
 include/pcmcia/cs.h                |  8 ++++----
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 02518da..454d732 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -308,7 +308,8 @@ static unsigned int calc_baudv(unsigned char fidi)
 	return (wcrcf / wbrcf);
 }
 
-static unsigned short io_read_num_rec_bytes(ioaddr_t iobase, unsigned short *s)
+static unsigned short io_read_num_rec_bytes(unsigned int iobase,
+					    unsigned short *s)
 {
 	unsigned short tmp;
 
@@ -426,7 +427,7 @@ static struct card_fixup card_fixups[] = {
 static void set_cardparameter(struct cm4000_dev *dev)
 {
 	int i;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 	u_int8_t stopbits = 0x02; /* ISO default */
 
 	DEBUGP(3, dev, "-> set_cardparameter\n");
@@ -459,7 +460,7 @@ static int set_protocol(struct cm4000_dev *dev, struct ptsreq *ptsreq)
 	unsigned short num_bytes_read;
 	unsigned char pts_reply[4];
 	ssize_t rc;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 
 	rc = 0;
 
@@ -610,7 +611,7 @@ exit_setprotocol:
 	return rc;
 }
 
-static int io_detect_cm4000(ioaddr_t iobase, struct cm4000_dev *dev)
+static int io_detect_cm4000(unsigned int iobase, struct cm4000_dev *dev)
 {
 
 	/* note: statemachine is assumed to be reset */
@@ -671,7 +672,7 @@ static void terminate_monitor(struct cm4000_dev *dev)
 static void monitor_card(unsigned long p)
 {
 	struct cm4000_dev *dev = (struct cm4000_dev *) p;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 	unsigned short s;
 	struct ptsreq ptsreq;
 	int i, atrc;
@@ -933,7 +934,7 @@ static ssize_t cmm_read(struct file *filp, __user char *buf, size_t count,
 			loff_t *ppos)
 {
 	struct cm4000_dev *dev = filp->private_data;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 	ssize_t rc;
 	int i, j, k;
 
@@ -1054,7 +1055,7 @@ static ssize_t cmm_write(struct file *filp, const char __user *buf,
 			 size_t count, loff_t *ppos)
 {
 	struct cm4000_dev *dev = (struct cm4000_dev *) filp->private_data;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 	unsigned short s;
 	unsigned char tmp;
 	unsigned char infolen;
@@ -1408,7 +1409,7 @@ static int cmm_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		     unsigned long arg)
 {
 	struct cm4000_dev *dev = filp->private_data;
-	ioaddr_t iobase = dev->p_dev->io.BasePort1;
+	unsigned int iobase = dev->p_dev->io.BasePort1;
 	struct pcmcia_device *link;
 	int size;
 	int rc;
diff --git a/drivers/pcmcia/pcmcia_resource.c b/drivers/pcmcia/pcmcia_resource.c
index 0ce39de..1d128fb 100644
--- a/drivers/pcmcia/pcmcia_resource.c
+++ b/drivers/pcmcia/pcmcia_resource.c
@@ -65,23 +65,23 @@ extern int ds_pc_debug;
  * Special stuff for managing IO windows, because they are scarce
  */
 
-static int alloc_io_space(struct pcmcia_socket *s, u_int attr, ioaddr_t *base,
-			  ioaddr_t num, u_int lines)
+static int alloc_io_space(struct pcmcia_socket *s, u_int attr,
+			  unsigned int *base, unsigned int num, u_int lines)
 {
 	int i;
-	kio_addr_t try, align;
+	unsigned int try, align;
 
 	align = (*base) ? (lines ? 1<<lines : 0) : 1;
 	if (align && (align < num)) {
 		if (*base) {
-			ds_dbg(s, 0, "odd IO request: num %#x align %#lx\n",
+			ds_dbg(s, 0, "odd IO request: num %#x align %#x\n",
 			       num, align);
 			align = 0;
 		} else
 			while (align && (align < num)) align <<= 1;
 	}
 	if (*base & ~(align-1)) {
-		ds_dbg(s, 0, "odd IO request: base %#x align %#lx\n",
+		ds_dbg(s, 0, "odd IO request: base %#x align %#x\n",
 		       *base, align);
 		align = 0;
 	}
@@ -132,8 +132,8 @@ static int alloc_io_space(struct pcmcia_socket *s, u_int attr, ioaddr_t *base,
 } /* alloc_io_space */
 
 
-static void release_io_space(struct pcmcia_socket *s, ioaddr_t base,
-			     ioaddr_t num)
+static void release_io_space(struct pcmcia_socket *s, unsigned int base,
+			     unsigned int num)
 {
 	int i;
 
diff --git a/drivers/scsi/pcmcia/fdomain_stub.c b/drivers/scsi/pcmcia/fdomain_stub.c
index 4b82b20..d8b9935 100644
--- a/drivers/scsi/pcmcia/fdomain_stub.c
+++ b/drivers/scsi/pcmcia/fdomain_stub.c
@@ -130,7 +130,7 @@ static int fdomain_config(struct pcmcia_device *link)
     cisparse_t parse;
     int i, last_ret, last_fn;
     u_char tuple_data[64];
-    char str[16];
+    char str[22];
     struct Scsi_Host *host;
 
     DEBUG(0, "fdomain_config(0x%p)\n", link);
diff --git a/include/pcmcia/cs.h b/include/pcmcia/cs.h
index d5838c3..87a260e 100644
--- a/include/pcmcia/cs.h
+++ b/include/pcmcia/cs.h
@@ -147,11 +147,11 @@ typedef struct config_req_t {
 
 /* For RequestIO and ReleaseIO */
 typedef struct io_req_t {
-    ioaddr_t	BasePort1;
-    ioaddr_t	NumPorts1;
+    u_int	BasePort1;
+    u_int	NumPorts1;
     u_int	Attributes1;
-    ioaddr_t	BasePort2;
-    ioaddr_t	NumPorts2;
+    u_int	BasePort2;
+    u_int	NumPorts2;
     u_int	Attributes2;
     u_int	IOAddrLines;
 } io_req_t;
-- 
cgit v1.1


From 906da809c5be30b4c7f32bb6a489fb25ad794878 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Mon, 4 Feb 2008 22:27:35 -0800
Subject: pcmcia: replace kio_addr_t with unsigned int everywhere

Remove kio_addr_t, and replace it with unsigned int.  No known architecture
needs more than 32 bits for IO addresses and ports and having a separate type
for it is just messy.

Signed-off-by: Olof Johansson <olof@lixom.net>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/pcmcia/driver-changes.txt |  4 +--
 drivers/bluetooth/bt3c_cs.c             |  2 +-
 drivers/bluetooth/btuart_cs.c           |  2 +-
 drivers/net/pcmcia/3c574_cs.c           | 49 +++++++++++++-------------
 drivers/net/pcmcia/3c589_cs.c           | 30 ++++++++--------
 drivers/net/pcmcia/axnet_cs.c           | 26 +++++++-------
 drivers/net/pcmcia/fmvj18x_cs.c         | 23 ++++++------
 drivers/net/pcmcia/nmclan_cs.c          | 25 ++++++-------
 drivers/net/pcmcia/pcnet_cs.c           | 36 +++++++++----------
 drivers/net/pcmcia/smc91c92_cs.c        | 62 ++++++++++++++++-----------------
 drivers/net/pcmcia/xirc2ps_cs.c         | 51 ++++++++++++++-------------
 drivers/net/wireless/netwave_cs.c       | 24 ++++++-------
 drivers/net/wireless/wavelan_cs.c       | 56 ++++++++++++++---------------
 drivers/pcmcia/i82092.c                 |  2 +-
 drivers/pcmcia/i82365.c                 | 18 +++++-----
 drivers/pcmcia/m32r_cfc.c               |  7 ++--
 drivers/pcmcia/m32r_pcc.c               |  7 ++--
 drivers/pcmcia/rsrc_nonstatic.c         | 11 +++---
 drivers/pcmcia/tcic.c                   |  2 +-
 drivers/serial/serial_cs.c              |  6 ++--
 include/pcmcia/cs_types.h               |  1 -
 include/pcmcia/ss.h                     |  6 ++--
 22 files changed, 228 insertions(+), 222 deletions(-)

diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 4739c5c..96f155e 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -33,8 +33,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors:
    and can be used (e.g. for SET_NETDEV_DEV) by using
    handle_to_dev(client_handle_t * handle).
 
-* Convert internal I/O port addresses to unsigned long (as of 2.6.11)
-   ioaddr_t should be replaced by kio_addr_t in PCMCIA card drivers.
+* Convert internal I/O port addresses to unsigned int (as of 2.6.11)
+   ioaddr_t should be replaced by unsigned int in PCMCIA card drivers.
 
 * irq_mask and irq_list parameters (as of 2.6.11)
    The irq_mask and irq_list parameters should no longer be used in
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index a18f9b8..7703d6e 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -704,7 +704,7 @@ static int next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t *
 
 static int bt3c_config(struct pcmcia_device *link)
 {
-	static kio_addr_t base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
+	static unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
 	bt3c_info_t *info = link->priv;
 	tuple_t tuple;
 	u_short buf[256];
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index dade162..68d1d25 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -634,7 +634,7 @@ static int next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t *
 
 static int btuart_config(struct pcmcia_device *link)
 {
-	static kio_addr_t base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
+	static unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
 	btuart_info_t *info = link->priv;
 	tuple_t tuple;
 	u_short buf[256];
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 36a7ba3..dafbbdb 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -230,10 +230,11 @@ static char mii_preamble_required = 0;
 static int tc574_config(struct pcmcia_device *link);
 static void tc574_release(struct pcmcia_device *link);
 
-static void mdio_sync(kio_addr_t ioaddr, int bits);
-static int mdio_read(kio_addr_t ioaddr, int phy_id, int location);
-static void mdio_write(kio_addr_t ioaddr, int phy_id, int location, int value);
-static unsigned short read_eeprom(kio_addr_t ioaddr, int index);
+static void mdio_sync(unsigned int ioaddr, int bits);
+static int mdio_read(unsigned int ioaddr, int phy_id, int location);
+static void mdio_write(unsigned int ioaddr, int phy_id, int location,
+		       int value);
+static unsigned short read_eeprom(unsigned int ioaddr, int index);
 static void tc574_wait_for_completion(struct net_device *dev, int cmd);
 
 static void tc574_reset(struct net_device *dev);
@@ -341,7 +342,7 @@ static int tc574_config(struct pcmcia_device *link)
 	tuple_t tuple;
 	__le16 buf[32];
 	int last_fn, last_ret, i, j;
-	kio_addr_t ioaddr;
+	unsigned int ioaddr;
 	__be16 *phys_addr;
 	char *cardname;
 	__u32 config;
@@ -515,7 +516,7 @@ static int tc574_resume(struct pcmcia_device *link)
 
 static void dump_status(struct net_device *dev)
 {
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	EL3WINDOW(1);
 	printk(KERN_INFO "  irq status %04x, rx status %04x, tx status "
 		   "%02x, tx free %04x\n", inw(ioaddr+EL3_STATUS),
@@ -544,7 +545,7 @@ static void tc574_wait_for_completion(struct net_device *dev, int cmd)
 /* Read a word from the EEPROM using the regular EEPROM access register.
    Assume that we are in register window zero.
  */
-static unsigned short read_eeprom(kio_addr_t ioaddr, int index)
+static unsigned short read_eeprom(unsigned int ioaddr, int index)
 {
 	int timer;
 	outw(EEPROM_Read + index, ioaddr + Wn0EepromCmd);
@@ -572,9 +573,9 @@ static unsigned short read_eeprom(kio_addr_t ioaddr, int index)
 
 /* Generate the preamble required for initial synchronization and
    a few older transceivers. */
-static void mdio_sync(kio_addr_t ioaddr, int bits)
+static void mdio_sync(unsigned int ioaddr, int bits)
 {
-	kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+	unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 
 	/* Establish sync by sending at least 32 logic ones. */
 	while (-- bits >= 0) {
@@ -583,12 +584,12 @@ static void mdio_sync(kio_addr_t ioaddr, int bits)
 	}
 }
 
-static int mdio_read(kio_addr_t ioaddr, int phy_id, int location)
+static int mdio_read(unsigned int ioaddr, int phy_id, int location)
 {
 	int i;
 	int read_cmd = (0xf6 << 10) | (phy_id << 5) | location;
 	unsigned int retval = 0;
-	kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+	unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 
 	if (mii_preamble_required)
 		mdio_sync(ioaddr, 32);
@@ -608,10 +609,10 @@ static int mdio_read(kio_addr_t ioaddr, int phy_id, int location)
 	return (retval>>1) & 0xffff;
 }
 
-static void mdio_write(kio_addr_t ioaddr, int phy_id, int location, int value)
+static void mdio_write(unsigned int ioaddr, int phy_id, int location, int value)
 {
 	int write_cmd = 0x50020000 | (phy_id << 23) | (location << 18) | value;
-	kio_addr_t mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+	unsigned int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 	int i;
 
 	if (mii_preamble_required)
@@ -637,7 +638,7 @@ static void tc574_reset(struct net_device *dev)
 {
 	struct el3_private *lp = netdev_priv(dev);
 	int i;
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	unsigned long flags;
 
 	tc574_wait_for_completion(dev, TotalReset|0x10);
@@ -741,7 +742,7 @@ static int el3_open(struct net_device *dev)
 static void el3_tx_timeout(struct net_device *dev)
 {
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	
 	printk(KERN_NOTICE "%s: Transmit timed out!\n", dev->name);
 	dump_status(dev);
@@ -756,7 +757,7 @@ static void el3_tx_timeout(struct net_device *dev)
 static void pop_tx_status(struct net_device *dev)
 {
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	int i;
     
 	/* Clear the Tx status stack. */
@@ -779,7 +780,7 @@ static void pop_tx_status(struct net_device *dev)
 
 static int el3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	struct el3_private *lp = netdev_priv(dev);
 	unsigned long flags;
 
@@ -813,7 +814,7 @@ static irqreturn_t el3_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = (struct net_device *) dev_id;
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr;
+	unsigned int ioaddr;
 	unsigned status;
 	int work_budget = max_interrupt_work;
 	int handled = 0;
@@ -907,7 +908,7 @@ static void media_check(unsigned long arg)
 {
 	struct net_device *dev = (struct net_device *) arg;
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	unsigned long flags;
 	unsigned short /* cable, */ media, partner;
 
@@ -996,7 +997,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev)
 static void update_stats(struct net_device *dev)
 {
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	u8 rx, tx, up;
 
 	DEBUG(2, "%s: updating the statistics.\n", dev->name);
@@ -1033,7 +1034,7 @@ static void update_stats(struct net_device *dev)
 static int el3_rx(struct net_device *dev, int worklimit)
 {
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	short rx_status;
 	
 	DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n",
@@ -1094,7 +1095,7 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct el3_private *lp = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	u16 *data = (u16 *)&rq->ifr_ifru;
 	int phy = lp->phys & 0x1f;
 
@@ -1148,7 +1149,7 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
 static void set_rx_mode(struct net_device *dev)
 {
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 
 	if (dev->flags & IFF_PROMISC)
 		outw(SetRxFilter | RxStation | RxMulticast | RxBroadcast | RxProm,
@@ -1161,7 +1162,7 @@ static void set_rx_mode(struct net_device *dev)
 
 static int el3_close(struct net_device *dev)
 {
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	struct el3_private *lp = netdev_priv(dev);
 	struct pcmcia_device *link = lp->p_dev;
 
diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c
index e862d14e..1b1abb1 100644
--- a/drivers/net/pcmcia/3c589_cs.c
+++ b/drivers/net/pcmcia/3c589_cs.c
@@ -145,7 +145,7 @@ DRV_NAME ".c " DRV_VERSION " 2001/10/13 00:08:50 (David Hinds)";
 static int tc589_config(struct pcmcia_device *link);
 static void tc589_release(struct pcmcia_device *link);
 
-static u16 read_eeprom(kio_addr_t ioaddr, int index);
+static u16 read_eeprom(unsigned int ioaddr, int index);
 static void tc589_reset(struct net_device *dev);
 static void media_check(unsigned long arg);
 static int el3_config(struct net_device *dev, struct ifmap *map);
@@ -254,7 +254,7 @@ static int tc589_config(struct pcmcia_device *link)
     __le16 buf[32];
     __be16 *phys_addr;
     int last_fn, last_ret, i, j, multi = 0, fifo;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     char *ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
     DECLARE_MAC_BUF(mac);
     
@@ -403,7 +403,7 @@ static void tc589_wait_for_completion(struct net_device *dev, int cmd)
   Read a word from the EEPROM using the regular EEPROM access register.
   Assume that we are in register window zero.
 */
-static u16 read_eeprom(kio_addr_t ioaddr, int index)
+static u16 read_eeprom(unsigned int ioaddr, int index)
 {
     int i;
     outw(EEPROM_READ + index, ioaddr + 10);
@@ -421,7 +421,7 @@ static u16 read_eeprom(kio_addr_t ioaddr, int index)
 static void tc589_set_xcvr(struct net_device *dev, int if_port)
 {
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     
     EL3WINDOW(0);
     switch (if_port) {
@@ -443,7 +443,7 @@ static void tc589_set_xcvr(struct net_device *dev, int if_port)
 
 static void dump_status(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     EL3WINDOW(1);
     printk(KERN_INFO "  irq status %04x, rx status %04x, tx status "
 	   "%02x  tx free %04x\n", inw(ioaddr+EL3_STATUS),
@@ -459,7 +459,7 @@ static void dump_status(struct net_device *dev)
 /* Reset and restore all of the 3c589 registers. */
 static void tc589_reset(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i;
     
     EL3WINDOW(0);
@@ -567,7 +567,7 @@ static int el3_open(struct net_device *dev)
 static void el3_tx_timeout(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     
     printk(KERN_WARNING "%s: Transmit timed out!\n", dev->name);
     dump_status(dev);
@@ -582,7 +582,7 @@ static void el3_tx_timeout(struct net_device *dev)
 static void pop_tx_status(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i;
     
     /* Clear the Tx status stack. */
@@ -604,7 +604,7 @@ static void pop_tx_status(struct net_device *dev)
 
 static int el3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     struct el3_private *priv = netdev_priv(dev);
     unsigned long flags;
 
@@ -641,7 +641,7 @@ static irqreturn_t el3_interrupt(int irq, void *dev_id)
 {
     struct net_device *dev = (struct net_device *) dev_id;
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     __u16 status;
     int i = 0, handled = 1;
     
@@ -727,7 +727,7 @@ static void media_check(unsigned long arg)
 {
     struct net_device *dev = (struct net_device *)(arg);
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u16 media, errs;
     unsigned long flags;
 
@@ -828,7 +828,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev)
 static void update_stats(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     DEBUG(2, "%s: updating the statistics.\n", dev->name);
     /* Turn off statistics updates while reading. */
@@ -855,7 +855,7 @@ static void update_stats(struct net_device *dev)
 static int el3_rx(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int worklimit = 32;
     short rx_status;
     
@@ -909,7 +909,7 @@ static void set_multicast_list(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u16 opts = SetRxFilter | RxStation | RxBroadcast;
 
     if (!pcmcia_dev_present(link)) return;
@@ -924,7 +924,7 @@ static int el3_close(struct net_device *dev)
 {
     struct el3_private *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     
     DEBUG(1, "%s: shutting down ethercard.\n", dev->name);
 
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 6d342f6..e115881 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -96,8 +96,8 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id);
 static void ei_watchdog(u_long arg);
 static void axnet_reset_8390(struct net_device *dev);
 
-static int mdio_read(kio_addr_t addr, int phy_id, int loc);
-static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value);
+static int mdio_read(unsigned int addr, int phy_id, int loc);
+static void mdio_write(unsigned int addr, int phy_id, int loc, int value);
 
 static void get_8390_hdr(struct net_device *,
 			 struct e8390_pkt_hdr *, int);
@@ -203,7 +203,7 @@ static void axnet_detach(struct pcmcia_device *link)
 static int get_prom(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i, j;
 
     /* This is based on drivers/net/ne.c */
@@ -473,7 +473,7 @@ static int axnet_resume(struct pcmcia_device *link)
 #define MDIO_MASK		0x0f
 #define MDIO_ENB_IN		0x02
 
-static void mdio_sync(kio_addr_t addr)
+static void mdio_sync(unsigned int addr)
 {
     int bits;
     for (bits = 0; bits < 32; bits++) {
@@ -482,7 +482,7 @@ static void mdio_sync(kio_addr_t addr)
     }
 }
 
-static int mdio_read(kio_addr_t addr, int phy_id, int loc)
+static int mdio_read(unsigned int addr, int phy_id, int loc)
 {
     u_int cmd = (0xf6<<10)|(phy_id<<5)|loc;
     int i, retval = 0;
@@ -501,7 +501,7 @@ static int mdio_read(kio_addr_t addr, int phy_id, int loc)
     return (retval>>1) & 0xffff;
 }
 
-static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value)
+static void mdio_write(unsigned int addr, int phy_id, int loc, int value)
 {
     u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value;
     int i;
@@ -575,7 +575,7 @@ static int axnet_close(struct net_device *dev)
 
 static void axnet_reset_8390(struct net_device *dev)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     int i;
 
     ei_status.txing = ei_status.dmaing = 0;
@@ -610,8 +610,8 @@ static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)(arg);
     axnet_dev_t *info = PRIV(dev);
-    kio_addr_t nic_base = dev->base_addr;
-    kio_addr_t mii_addr = nic_base + AXNET_MII_EEP;
+    unsigned int nic_base = dev->base_addr;
+    unsigned int mii_addr = nic_base + AXNET_MII_EEP;
     u_short link;
 
     if (!netif_device_present(dev)) goto reschedule;
@@ -681,7 +681,7 @@ static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     axnet_dev_t *info = PRIV(dev);
     u16 *data = (u16 *)&rq->ifr_ifru;
-    kio_addr_t mii_addr = dev->base_addr + AXNET_MII_EEP;
+    unsigned int mii_addr = dev->base_addr + AXNET_MII_EEP;
     switch (cmd) {
     case SIOCGMIIPHY:
 	data[0] = info->phy_id;
@@ -703,7 +703,7 @@ static void get_8390_hdr(struct net_device *dev,
 			 struct e8390_pkt_hdr *hdr,
 			 int ring_page)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
 
     outb_p(0, nic_base + EN0_RSARLO);		/* On page boundary */
     outb_p(ring_page, nic_base + EN0_RSARHI);
@@ -721,7 +721,7 @@ static void get_8390_hdr(struct net_device *dev,
 static void block_input(struct net_device *dev, int count,
 			struct sk_buff *skb, int ring_offset)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     int xfer_count = count;
     char *buf = skb->data;
 
@@ -744,7 +744,7 @@ static void block_input(struct net_device *dev, int count,
 static void block_output(struct net_device *dev, int count,
 			 const u_char *buf, const int start_page)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
 
 #ifdef PCMCIA_DEBUG
     if (ei_debug > 4)
diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index 949c6df..7cb2253 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -298,7 +298,8 @@ do { last_fn = (fn); if ((last_ret = (ret)) != 0) goto cs_failed; } while (0)
 static int mfc_try_io_port(struct pcmcia_device *link)
 {
     int i, ret;
-    static const kio_addr_t serial_base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
+    static const unsigned int serial_base[5] =
+	{ 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
 
     for (i = 0; i < 5; i++) {
 	link->io.BasePort2 = serial_base[i];
@@ -316,7 +317,7 @@ static int mfc_try_io_port(struct pcmcia_device *link)
 static int ungermann_try_io_port(struct pcmcia_device *link)
 {
     int ret;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     /*
 	Ungermann-Bass Access/CARD accepts 0x300,0x320,0x340,0x360
 	0x380,0x3c0 only for ioport.
@@ -342,7 +343,7 @@ static int fmvj18x_config(struct pcmcia_device *link)
     cisparse_t parse;
     u_short buf[32];
     int i, last_fn = 0, last_ret = 0, ret;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     cardtype_t cardtype;
     char *card_name = "unknown";
     u_char *node_id;
@@ -610,7 +611,7 @@ static int fmvj18x_setup_mfc(struct pcmcia_device *link)
     u_char __iomem *base;
     int i, j;
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
 
     /* Allocate a small memory window */
     req.Attributes = WIN_DATA_WIDTH_8|WIN_MEMORY_TYPE_AM|WIN_ENABLE;
@@ -735,7 +736,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 {
     struct net_device *dev = dev_id;
     local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     unsigned short tx_stat, rx_stat;
 
     ioaddr = dev->base_addr;
@@ -789,7 +790,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 static void fjn_tx_timeout(struct net_device *dev)
 {
     struct local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     printk(KERN_NOTICE "%s: transmit timed out with status %04x, %s?\n",
 	   dev->name, htons(inw(ioaddr + TX_STATUS)),
@@ -819,7 +820,7 @@ static void fjn_tx_timeout(struct net_device *dev)
 static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
     struct local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     short length = skb->len;
     
     if (length < ETH_ZLEN)
@@ -892,7 +893,7 @@ static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev)
 static void fjn_reset(struct net_device *dev)
 {
     struct local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i;
 
     DEBUG(4, "fjn_reset(%s) called.\n",dev->name);
@@ -971,7 +972,7 @@ static void fjn_reset(struct net_device *dev)
 static void fjn_rx(struct net_device *dev)
 {
     struct local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int boguscount = 10;	/* 5 -> 10: by agy 19940922 */
 
     DEBUG(4, "%s: in rx_packet(), rx_status %02x.\n",
@@ -1125,7 +1126,7 @@ static int fjn_close(struct net_device *dev)
 {
     struct local_info_t *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     DEBUG(4, "fjn_close('%s').\n", dev->name);
 
@@ -1168,7 +1169,7 @@ static struct net_device_stats *fjn_get_stats(struct net_device *dev)
 
 static void set_rx_mode(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_char mc_filter[8];		 /* Multicast hash filter */
     u_long flags;
     int i;
diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c
index a355a93..cfcbea9 100644
--- a/drivers/net/pcmcia/nmclan_cs.c
+++ b/drivers/net/pcmcia/nmclan_cs.c
@@ -518,7 +518,7 @@ mace_read
 	assuming that during normal operation, the MACE is always in
 	bank 0.
 ---------------------------------------------------------------------------- */
-static int mace_read(mace_private *lp, kio_addr_t ioaddr, int reg)
+static int mace_read(mace_private *lp, unsigned int ioaddr, int reg)
 {
   int data = 0xFF;
   unsigned long flags;
@@ -545,7 +545,8 @@ mace_write
 	are assuming that during normal operation, the MACE is always in
 	bank 0.
 ---------------------------------------------------------------------------- */
-static void mace_write(mace_private *lp, kio_addr_t ioaddr, int reg, int data)
+static void mace_write(mace_private *lp, unsigned int ioaddr, int reg,
+		       int data)
 {
   unsigned long flags;
 
@@ -567,7 +568,7 @@ static void mace_write(mace_private *lp, kio_addr_t ioaddr, int reg, int data)
 mace_init
 	Resets the MACE chip.
 ---------------------------------------------------------------------------- */
-static int mace_init(mace_private *lp, kio_addr_t ioaddr, char *enet_addr)
+static int mace_init(mace_private *lp, unsigned int ioaddr, char *enet_addr)
 {
   int i;
   int ct = 0;
@@ -657,7 +658,7 @@ static int nmclan_config(struct pcmcia_device *link)
   tuple_t tuple;
   u_char buf[64];
   int i, last_ret, last_fn;
-  kio_addr_t ioaddr;
+  unsigned int ioaddr;
   DECLARE_MAC_BUF(mac);
 
   DEBUG(0, "nmclan_config(0x%p)\n", link);
@@ -839,7 +840,7 @@ mace_open
 ---------------------------------------------------------------------------- */
 static int mace_open(struct net_device *dev)
 {
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
   mace_private *lp = netdev_priv(dev);
   struct pcmcia_device *link = lp->p_dev;
 
@@ -862,7 +863,7 @@ mace_close
 ---------------------------------------------------------------------------- */
 static int mace_close(struct net_device *dev)
 {
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
   mace_private *lp = netdev_priv(dev);
   struct pcmcia_device *link = lp->p_dev;
 
@@ -935,7 +936,7 @@ static void mace_tx_timeout(struct net_device *dev)
 static int mace_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
   mace_private *lp = netdev_priv(dev);
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
 
   netif_stop_queue(dev);
 
@@ -996,7 +997,7 @@ static irqreturn_t mace_interrupt(int irq, void *dev_id)
 {
   struct net_device *dev = (struct net_device *) dev_id;
   mace_private *lp = netdev_priv(dev);
-  kio_addr_t ioaddr;
+  unsigned int ioaddr;
   int status;
   int IntrCnt = MACE_MAX_IR_ITERATIONS;
 
@@ -1140,7 +1141,7 @@ mace_rx
 static int mace_rx(struct net_device *dev, unsigned char RxCnt)
 {
   mace_private *lp = netdev_priv(dev);
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
   unsigned char rx_framecnt;
   unsigned short rx_status;
 
@@ -1302,7 +1303,7 @@ update_stats
 	card's SRAM fast enough.  If this happens, something is
 	seriously wrong with the hardware.
 ---------------------------------------------------------------------------- */
-static void update_stats(kio_addr_t ioaddr, struct net_device *dev)
+static void update_stats(unsigned int ioaddr, struct net_device *dev)
 {
   mace_private *lp = netdev_priv(dev);
 
@@ -1448,7 +1449,7 @@ static void restore_multicast_list(struct net_device *dev)
   mace_private *lp = netdev_priv(dev);
   int num_addrs = lp->multicast_num_addrs;
   int *ladrf = lp->multicast_ladrf;
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
   int i;
 
   DEBUG(2, "%s: restoring Rx mode to %d addresses.\n",
@@ -1540,7 +1541,7 @@ static void set_multicast_list(struct net_device *dev)
 
 static void restore_multicast_list(struct net_device *dev)
 {
-  kio_addr_t ioaddr = dev->base_addr;
+  unsigned int ioaddr = dev->base_addr;
   mace_private *lp = netdev_priv(dev);
 
   DEBUG(2, "%s: restoring Rx mode to %d addresses.\n", dev->name,
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 9ba56aa..6bc48a0 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -349,7 +349,7 @@ static hw_info_t *get_hwinfo(struct pcmcia_device *link)
 static hw_info_t *get_prom(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_char prom[32];
     int i, j;
 
@@ -425,7 +425,7 @@ static hw_info_t *get_dl10019(struct pcmcia_device *link)
 static hw_info_t *get_ax88190(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i, j;
 
     /* Not much of a test, but the alternatives are messy */
@@ -756,7 +756,7 @@ static int pcnet_resume(struct pcmcia_device *link)
 #define MDIO_DATA_READ		0x10
 #define MDIO_MASK		0x0f
 
-static void mdio_sync(kio_addr_t addr)
+static void mdio_sync(unsigned int addr)
 {
     int bits, mask = inb(addr) & MDIO_MASK;
     for (bits = 0; bits < 32; bits++) {
@@ -765,7 +765,7 @@ static void mdio_sync(kio_addr_t addr)
     }
 }
 
-static int mdio_read(kio_addr_t addr, int phy_id, int loc)
+static int mdio_read(unsigned int addr, int phy_id, int loc)
 {
     u_int cmd = (0x06<<10)|(phy_id<<5)|loc;
     int i, retval = 0, mask = inb(addr) & MDIO_MASK;
@@ -784,7 +784,7 @@ static int mdio_read(kio_addr_t addr, int phy_id, int loc)
     return (retval>>1) & 0xffff;
 }
 
-static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value)
+static void mdio_write(unsigned int addr, int phy_id, int loc, int value)
 {
     u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value;
     int i, mask = inb(addr) & MDIO_MASK;
@@ -818,10 +818,10 @@ static void mdio_write(kio_addr_t addr, int phy_id, int loc, int value)
 
 #define DL19FDUPLX	0x0400	/* DL10019 Full duplex mode */
 
-static int read_eeprom(kio_addr_t ioaddr, int location)
+static int read_eeprom(unsigned int ioaddr, int location)
 {
     int i, retval = 0;
-    kio_addr_t ee_addr = ioaddr + DLINK_EEPROM;
+    unsigned int ee_addr = ioaddr + DLINK_EEPROM;
     int read_cmd = location | (EE_READ_CMD << 8);
 
     outb(0, ee_addr);
@@ -852,10 +852,10 @@ static int read_eeprom(kio_addr_t ioaddr, int location)
     In ASIC mode, EE_ADOT is used to output the data to the ASIC.
 */
 
-static void write_asic(kio_addr_t ioaddr, int location, short asic_data)
+static void write_asic(unsigned int ioaddr, int location, short asic_data)
 {
 	int i;
-	kio_addr_t ee_addr = ioaddr + DLINK_EEPROM;
+	unsigned int ee_addr = ioaddr + DLINK_EEPROM;
 	short dataval;
 	int read_cmd = location | (EE_READ_CMD << 8);
 
@@ -897,7 +897,7 @@ static void write_asic(kio_addr_t ioaddr, int location, short asic_data)
 
 static void set_misc_reg(struct net_device *dev)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     pcnet_dev_t *info = PRIV(dev);
     u_char tmp;
 
@@ -936,7 +936,7 @@ static void set_misc_reg(struct net_device *dev)
 static void mii_phy_probe(struct net_device *dev)
 {
     pcnet_dev_t *info = PRIV(dev);
-    kio_addr_t mii_addr = dev->base_addr + DLINK_GPIO;
+    unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
     int i;
     u_int tmp, phyid;
 
@@ -1014,7 +1014,7 @@ static int pcnet_close(struct net_device *dev)
 
 static void pcnet_reset_8390(struct net_device *dev)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     int i;
 
     ei_status.txing = ei_status.dmaing = 0;
@@ -1074,8 +1074,8 @@ static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)arg;
     pcnet_dev_t *info = PRIV(dev);
-    kio_addr_t nic_base = dev->base_addr;
-    kio_addr_t mii_addr = nic_base + DLINK_GPIO;
+    unsigned int nic_base = dev->base_addr;
+    unsigned int mii_addr = nic_base + DLINK_GPIO;
     u_short link;
 
     if (!netif_device_present(dev)) goto reschedule;
@@ -1177,7 +1177,7 @@ static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     pcnet_dev_t *info = PRIV(dev);
     u16 *data = (u16 *)&rq->ifr_ifru;
-    kio_addr_t mii_addr = dev->base_addr + DLINK_GPIO;
+    unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
     switch (cmd) {
     case SIOCGMIIPHY:
 	data[0] = info->phy_id;
@@ -1199,7 +1199,7 @@ static void dma_get_8390_hdr(struct net_device *dev,
 			     struct e8390_pkt_hdr *hdr,
 			     int ring_page)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
 
     if (ei_status.dmaing) {
 	printk(KERN_NOTICE "%s: DMAing conflict in dma_block_input."
@@ -1230,7 +1230,7 @@ static void dma_get_8390_hdr(struct net_device *dev,
 static void dma_block_input(struct net_device *dev, int count,
 			    struct sk_buff *skb, int ring_offset)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     int xfer_count = count;
     char *buf = skb->data;
 
@@ -1285,7 +1285,7 @@ static void dma_block_input(struct net_device *dev, int count,
 static void dma_block_output(struct net_device *dev, int count,
 			     const u_char *buf, const int start_page)
 {
-    kio_addr_t nic_base = dev->base_addr;
+    unsigned int nic_base = dev->base_addr;
     pcnet_dev_t *info = PRIV(dev);
 #ifdef PCMCIA_DEBUG
     int retries = 0;
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index c9868e9..f18eca9 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -295,7 +295,7 @@ static int s9k_config(struct net_device *dev, struct ifmap *map);
 static void smc_set_xcvr(struct net_device *dev, int if_port);
 static void smc_reset(struct net_device *dev);
 static void media_check(u_long arg);
-static void mdio_sync(kio_addr_t addr);
+static void mdio_sync(unsigned int addr);
 static int mdio_read(struct net_device *dev, int phy_id, int loc);
 static void mdio_write(struct net_device *dev, int phy_id, int loc, int value);
 static int smc_link_ok(struct net_device *dev);
@@ -601,8 +601,8 @@ static void mot_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
-    kio_addr_t iouart = link->io.BasePort2;
+    unsigned int ioaddr = dev->base_addr;
+    unsigned int iouart = link->io.BasePort2;
 
     /* Set UART base address and force map with COR bit 1 */
     writeb(iouart & 0xff,        smc->base + MOT_UART + CISREG_IOBASE_0);
@@ -621,7 +621,7 @@ static void mot_config(struct pcmcia_device *link)
 static int mot_setup(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int i, wait, loop;
     u_int addr;
 
@@ -754,7 +754,7 @@ free_cfg_mem:
 static int osi_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    static const kio_addr_t com[4] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
+    static const unsigned int com[4] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
     int i, j;
 
     link->conf.Attributes |= CONF_ENABLE_SPKR;
@@ -900,7 +900,7 @@ static int smc91c92_resume(struct pcmcia_device *link)
 static int check_sig(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int width;
     u_short s;
 
@@ -960,7 +960,7 @@ static int smc91c92_config(struct pcmcia_device *link)
     struct smc_private *smc = netdev_priv(dev);
     char *name;
     int i, j, rev;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     u_long mir;
     DECLARE_MAC_BUF(mac);
 
@@ -1136,7 +1136,7 @@ static void smc91c92_release(struct pcmcia_device *link)
 #define MDIO_DATA_WRITE1	(MDIO_DIR_WRITE | MDIO_DATA_OUT)
 #define MDIO_DATA_READ		0x02
 
-static void mdio_sync(kio_addr_t addr)
+static void mdio_sync(unsigned int addr)
 {
     int bits;
     for (bits = 0; bits < 32; bits++) {
@@ -1147,7 +1147,7 @@ static void mdio_sync(kio_addr_t addr)
 
 static int mdio_read(struct net_device *dev, int phy_id, int loc)
 {
-    kio_addr_t addr = dev->base_addr + MGMT;
+    unsigned int addr = dev->base_addr + MGMT;
     u_int cmd = (0x06<<10)|(phy_id<<5)|loc;
     int i, retval = 0;
 
@@ -1167,7 +1167,7 @@ static int mdio_read(struct net_device *dev, int phy_id, int loc)
 
 static void mdio_write(struct net_device *dev, int phy_id, int loc, int value)
 {
-    kio_addr_t addr = dev->base_addr + MGMT;
+    unsigned int addr = dev->base_addr + MGMT;
     u_int cmd = (0x05<<28)|(phy_id<<23)|(loc<<18)|(1<<17)|value;
     int i;
 
@@ -1193,7 +1193,7 @@ static void mdio_write(struct net_device *dev, int phy_id, int loc, int value)
 #ifdef PCMCIA_DEBUG
 static void smc_dump(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_short i, w, save;
     save = inw(ioaddr + BANK_SELECT);
     for (w = 0; w < 4; w++) {
@@ -1248,7 +1248,7 @@ static int smc_close(struct net_device *dev)
 {
     struct smc_private *smc = netdev_priv(dev);
     struct pcmcia_device *link = smc->p_dev;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     DEBUG(0, "%s: smc_close(), status %4.4x.\n",
 	  dev->name, inw(ioaddr + BANK_SELECT));
@@ -1285,7 +1285,7 @@ static void smc_hardware_send_packet(struct net_device * dev)
 {
     struct smc_private *smc = netdev_priv(dev);
     struct sk_buff *skb = smc->saved_skb;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_char packet_no;
 
     if (!skb) {
@@ -1349,7 +1349,7 @@ static void smc_hardware_send_packet(struct net_device * dev)
 static void smc_tx_timeout(struct net_device *dev)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     printk(KERN_NOTICE "%s: SMC91c92 transmit timed out, "
 	   "Tx_status %2.2x status %4.4x.\n",
@@ -1364,7 +1364,7 @@ static void smc_tx_timeout(struct net_device *dev)
 static int smc_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_short num_pages;
     short time_out, ir;
     unsigned long flags;
@@ -1434,7 +1434,7 @@ static int smc_start_xmit(struct sk_buff *skb, struct net_device *dev)
 static void smc_tx_err(struct net_device * dev)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int saved_packet = inw(ioaddr + PNR_ARR) & 0xff;
     int packet_no = inw(ioaddr + FIFO_PORTS) & 0x7f;
     int tx_status;
@@ -1478,7 +1478,7 @@ static void smc_tx_err(struct net_device * dev)
 static void smc_eph_irq(struct net_device *dev)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_short card_stats, ephs;
 
     SMC_SELECT_BANK(0);
@@ -1513,7 +1513,7 @@ static irqreturn_t smc_interrupt(int irq, void *dev_id)
 {
     struct net_device *dev = dev_id;
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     u_short saved_bank, saved_pointer, mask, status;
     unsigned int handled = 1;
     char bogus_cnt = INTR_WORK;		/* Work we are willing to do. */
@@ -1633,7 +1633,7 @@ irq_done:
 static void smc_rx(struct net_device *dev)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int rx_status;
     int packet_length;	/* Caution: not frame length, rather words
 			   to transfer from the chip. */
@@ -1738,7 +1738,7 @@ static void fill_multicast_tbl(int count, struct dev_mc_list *addrs,
 
 static void set_rx_mode(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     struct smc_private *smc = netdev_priv(dev);
     u_int multicast_table[ 2 ] = { 0, };
     unsigned long flags;
@@ -1804,7 +1804,7 @@ static int s9k_config(struct net_device *dev, struct ifmap *map)
 static void smc_set_xcvr(struct net_device *dev, int if_port)
 {
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_short saved_bank;
 
     saved_bank = inw(ioaddr + BANK_SELECT);
@@ -1827,7 +1827,7 @@ static void smc_set_xcvr(struct net_device *dev, int if_port)
 
 static void smc_reset(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     struct smc_private *smc = netdev_priv(dev);
     int i;
 
@@ -1904,7 +1904,7 @@ static void media_check(u_long arg)
 {
     struct net_device *dev = (struct net_device *) arg;
     struct smc_private *smc = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u_short i, media, saved_bank;
     u_short link;
     unsigned long flags;
@@ -2021,7 +2021,7 @@ reschedule:
 
 static int smc_link_ok(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     struct smc_private *smc = netdev_priv(dev);
 
     if (smc->cfg & CFG_MII_SELECT) {
@@ -2035,7 +2035,7 @@ static int smc_link_ok(struct net_device *dev)
 static int smc_netdev_get_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd)
 {
     u16 tmp;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     ecmd->supported = (SUPPORTED_TP | SUPPORTED_AUI |
 	SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full);
@@ -2057,7 +2057,7 @@ static int smc_netdev_get_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd)
 static int smc_netdev_set_ecmd(struct net_device *dev, struct ethtool_cmd *ecmd)
 {
     u16 tmp;
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     if (ecmd->speed != SPEED_10)
     	return -EINVAL;
@@ -2100,7 +2100,7 @@ static void smc_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info
 static int smc_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 {
 	struct smc_private *smc = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	u16 saved_bank = inw(ioaddr + BANK_SELECT);
 	int ret;
 
@@ -2118,7 +2118,7 @@ static int smc_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 static int smc_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 {
 	struct smc_private *smc = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	u16 saved_bank = inw(ioaddr + BANK_SELECT);
 	int ret;
 
@@ -2136,7 +2136,7 @@ static int smc_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
 static u32 smc_get_link(struct net_device *dev)
 {
 	struct smc_private *smc = netdev_priv(dev);
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 	u16 saved_bank = inw(ioaddr + BANK_SELECT);
 	u32 ret;
 
@@ -2164,7 +2164,7 @@ static int smc_nway_reset(struct net_device *dev)
 {
 	struct smc_private *smc = netdev_priv(dev);
 	if (smc->cfg & CFG_MII_SELECT) {
-		kio_addr_t ioaddr = dev->base_addr;
+		unsigned int ioaddr = dev->base_addr;
 		u16 saved_bank = inw(ioaddr + BANK_SELECT);
 		int res;
 
@@ -2196,7 +2196,7 @@ static int smc_ioctl (struct net_device *dev, struct ifreq *rq, int cmd)
 	struct mii_ioctl_data *mii = if_mii(rq);
 	int rc = 0;
 	u16 saved_bank;
-	kio_addr_t ioaddr = dev->base_addr;
+	unsigned int ioaddr = dev->base_addr;
 
 	if (!netif_running(dev))
 		return -EINVAL;
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index 1f09bea..d041f83 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -273,12 +273,12 @@ INT_MODULE_PARM(lockup_hack,	0);  /* anti lockup hack */
 static unsigned maxrx_bytes = 22000;
 
 /* MII management prototypes */
-static void mii_idle(kio_addr_t ioaddr);
-static void mii_putbit(kio_addr_t ioaddr, unsigned data);
-static int  mii_getbit(kio_addr_t ioaddr);
-static void mii_wbits(kio_addr_t ioaddr, unsigned data, int len);
-static unsigned mii_rd(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg);
-static void mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg,
+static void mii_idle(unsigned int ioaddr);
+static void mii_putbit(unsigned int ioaddr, unsigned data);
+static int  mii_getbit(unsigned int ioaddr);
+static void mii_wbits(unsigned int ioaddr, unsigned data, int len);
+static unsigned mii_rd(unsigned int ioaddr, u_char phyaddr, u_char phyreg);
+static void mii_wr(unsigned int ioaddr, u_char phyaddr, u_char phyreg,
 		   unsigned data, int len);
 
 /*
@@ -403,7 +403,7 @@ next_tuple(struct pcmcia_device *handle, tuple_t *tuple, cisparse_t *parse)
 static void
 PrintRegisters(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     if (pc_debug > 1) {
 	int i, page;
@@ -439,7 +439,7 @@ PrintRegisters(struct net_device *dev)
  * Turn around for read
  */
 static void
-mii_idle(kio_addr_t ioaddr)
+mii_idle(unsigned int ioaddr)
 {
     PutByte(XIRCREG2_GPR2, 0x04|0); /* drive MDCK low */
     udelay(1);
@@ -451,7 +451,7 @@ mii_idle(kio_addr_t ioaddr)
  * Write a bit to MDI/O
  */
 static void
-mii_putbit(kio_addr_t ioaddr, unsigned data)
+mii_putbit(unsigned int ioaddr, unsigned data)
 {
   #if 1
     if (data) {
@@ -484,7 +484,7 @@ mii_putbit(kio_addr_t ioaddr, unsigned data)
  * Get a bit from MDI/O
  */
 static int
-mii_getbit(kio_addr_t ioaddr)
+mii_getbit(unsigned int ioaddr)
 {
     unsigned d;
 
@@ -497,7 +497,7 @@ mii_getbit(kio_addr_t ioaddr)
 }
 
 static void
-mii_wbits(kio_addr_t ioaddr, unsigned data, int len)
+mii_wbits(unsigned int ioaddr, unsigned data, int len)
 {
     unsigned m = 1 << (len-1);
     for (; m; m >>= 1)
@@ -505,7 +505,7 @@ mii_wbits(kio_addr_t ioaddr, unsigned data, int len)
 }
 
 static unsigned
-mii_rd(kio_addr_t ioaddr,	u_char phyaddr, u_char phyreg)
+mii_rd(unsigned int ioaddr,	u_char phyaddr, u_char phyreg)
 {
     int i;
     unsigned data=0, m;
@@ -527,7 +527,8 @@ mii_rd(kio_addr_t ioaddr,	u_char phyaddr, u_char phyreg)
 }
 
 static void
-mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg, unsigned data, int len)
+mii_wr(unsigned int ioaddr, u_char phyaddr, u_char phyreg, unsigned data,
+       int len)
 {
     int i;
 
@@ -726,7 +727,7 @@ xirc2ps_config(struct pcmcia_device * link)
     local_info_t *local = netdev_priv(dev);
     tuple_t tuple;
     cisparse_t parse;
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     int err, i;
     u_char buf[64];
     cistpl_lan_node_id_t *node_id = (cistpl_lan_node_id_t*)parse.funce.data;
@@ -1104,7 +1105,7 @@ xirc2ps_interrupt(int irq, void *dev_id)
 {
     struct net_device *dev = (struct net_device *)dev_id;
     local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr;
+    unsigned int ioaddr;
     u_char saved_page;
     unsigned bytes_rcvd;
     unsigned int_status, eth_status, rx_status, tx_status;
@@ -1209,7 +1210,7 @@ xirc2ps_interrupt(int irq, void *dev_id)
 		    unsigned i;
 		    u_long *p = skb_put(skb, pktlen);
 		    register u_long a;
-		    kio_addr_t edpreg = ioaddr+XIRCREG_EDP-2;
+		    unsigned int edpreg = ioaddr+XIRCREG_EDP-2;
 		    for (i=0; i < len ; i += 4, p++) {
 			a = inl(edpreg);
 			__asm__("rorl $16,%0\n\t"
@@ -1346,7 +1347,7 @@ static int
 do_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
     local_info_t *lp = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     int okay;
     unsigned freespace;
     unsigned pktlen = skb->len;
@@ -1415,7 +1416,7 @@ do_get_stats(struct net_device *dev)
 static void
 set_addresses(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     local_info_t *lp = netdev_priv(dev);
     struct dev_mc_list *dmi = dev->mc_list;
     unsigned char *addr;
@@ -1459,7 +1460,7 @@ set_addresses(struct net_device *dev)
 static void
 set_multicast_list(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     SelectPage(0x42);
     if (dev->flags & IFF_PROMISC) { /* snoop */
@@ -1543,7 +1544,7 @@ static int
 do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     local_info_t *local = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     u16 *data = (u16 *)&rq->ifr_ifru;
 
     DEBUG(1, "%s: ioctl(%-.6s, %#04x) %04x %04x %04x %04x\n",
@@ -1575,7 +1576,7 @@ static void
 hardreset(struct net_device *dev)
 {
     local_info_t *local = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     SelectPage(4);
     udelay(1);
@@ -1592,7 +1593,7 @@ static void
 do_reset(struct net_device *dev, int full)
 {
     local_info_t *local = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     unsigned value;
 
     DEBUG(0, "%s: do_reset(%p,%d)\n", dev? dev->name:"eth?", dev, full);
@@ -1753,7 +1754,7 @@ static int
 init_mii(struct net_device *dev)
 {
     local_info_t *local = netdev_priv(dev);
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     unsigned control, status, linkpartner;
     int i;
 
@@ -1826,7 +1827,7 @@ static void
 do_powerdown(struct net_device *dev)
 {
 
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
 
     DEBUG(0, "do_powerdown(%p)\n", dev);
 
@@ -1838,7 +1839,7 @@ do_powerdown(struct net_device *dev)
 static int
 do_stop(struct net_device *dev)
 {
-    kio_addr_t ioaddr = dev->base_addr;
+    unsigned int ioaddr = dev->base_addr;
     local_info_t *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
diff --git a/drivers/net/wireless/netwave_cs.c b/drivers/net/wireless/netwave_cs.c
index d2fa079..f479c1a 100644
--- a/drivers/net/wireless/netwave_cs.c
+++ b/drivers/net/wireless/netwave_cs.c
@@ -195,7 +195,7 @@ static int netwave_pcmcia_config(struct pcmcia_device *arg); /* Runs after card
 static void netwave_detach(struct pcmcia_device *p_dev);    /* Destroy instance */
 
 /* Hardware configuration */
-static void netwave_doreset(kio_addr_t iobase, u_char __iomem *ramBase);
+static void netwave_doreset(unsigned int iobase, u_char __iomem *ramBase);
 static void netwave_reset(struct net_device *dev);
 
 /* Misc device stuff */
@@ -309,7 +309,7 @@ static inline void wait_WOC(unsigned int iobase)
 }
 
 static void netwave_snapshot(netwave_private *priv, u_char __iomem *ramBase, 
-			     kio_addr_t iobase) {
+			     unsigned int iobase) {
     u_short resultBuffer;
 
     /* if time since last snapshot is > 1 sec. (100 jiffies?)  then take 
@@ -340,7 +340,7 @@ static void netwave_snapshot(netwave_private *priv, u_char __iomem *ramBase,
 static struct iw_statistics *netwave_get_wireless_stats(struct net_device *dev)
 {	
     unsigned long flags;
-    kio_addr_t iobase = dev->base_addr;
+    unsigned int iobase = dev->base_addr;
     netwave_private *priv = netdev_priv(dev);
     u_char __iomem *ramBase = priv->ramBase;
     struct iw_statistics* wstats;
@@ -471,7 +471,7 @@ static int netwave_set_nwid(struct net_device *dev,
 			    char *extra)
 {
 	unsigned long flags;
-	kio_addr_t iobase = dev->base_addr;
+	unsigned int iobase = dev->base_addr;
 	netwave_private *priv = netdev_priv(dev);
 	u_char __iomem *ramBase = priv->ramBase;
 
@@ -518,7 +518,7 @@ static int netwave_set_scramble(struct net_device *dev,
 				char *key)
 {
 	unsigned long flags;
-	kio_addr_t iobase = dev->base_addr;
+	unsigned int iobase = dev->base_addr;
 	netwave_private *priv = netdev_priv(dev);
 	u_char __iomem *ramBase = priv->ramBase;
 
@@ -621,7 +621,7 @@ static int netwave_get_snap(struct net_device *dev,
 			    char *extra)
 {
 	unsigned long flags;
-	kio_addr_t iobase = dev->base_addr;
+	unsigned int iobase = dev->base_addr;
 	netwave_private *priv = netdev_priv(dev);
 	u_char __iomem *ramBase = priv->ramBase;
 
@@ -874,7 +874,7 @@ static int netwave_resume(struct pcmcia_device *link)
  *
  *    Proper hardware reset of the card.
  */
-static void netwave_doreset(kio_addr_t ioBase, u_char __iomem *ramBase)
+static void netwave_doreset(unsigned int ioBase, u_char __iomem *ramBase)
 {
     /* Reset card */
     wait_WOC(ioBase);
@@ -892,7 +892,7 @@ static void netwave_reset(struct net_device *dev) {
     /* u_char state; */
     netwave_private *priv = netdev_priv(dev);
     u_char __iomem *ramBase = priv->ramBase;
-    kio_addr_t iobase = dev->base_addr;
+    unsigned int iobase = dev->base_addr;
 
     DEBUG(0, "netwave_reset: Done with hardware reset\n");
 
@@ -973,7 +973,7 @@ static int netwave_hw_xmit(unsigned char* data, int len,
 	
     netwave_private *priv = netdev_priv(dev);
     u_char __iomem * ramBase = priv->ramBase;
-    kio_addr_t iobase = dev->base_addr;
+    unsigned int iobase = dev->base_addr;
 
     /* Disable interrupts & save flags */
     spin_lock_irqsave(&priv->spinlock, flags);
@@ -1065,7 +1065,7 @@ static int netwave_start_xmit(struct sk_buff *skb, struct net_device *dev) {
  */
 static irqreturn_t netwave_interrupt(int irq, void* dev_id)
 {
-    kio_addr_t iobase;
+    unsigned int iobase;
     u_char __iomem *ramBase;
     struct net_device *dev = (struct net_device *)dev_id;
     struct netwave_private *priv = netdev_priv(dev);
@@ -1235,7 +1235,7 @@ static int netwave_rx(struct net_device *dev)
 {
     netwave_private *priv = netdev_priv(dev);
     u_char __iomem *ramBase = priv->ramBase;
-    kio_addr_t iobase = dev->base_addr;
+    unsigned int iobase = dev->base_addr;
     u_char rxStatus;
     struct sk_buff *skb = NULL;
     unsigned int curBuffer,
@@ -1388,7 +1388,7 @@ module_exit(exit_netwave_cs);
  */
 static void set_multicast_list(struct net_device *dev)
 {
-    kio_addr_t iobase = dev->base_addr;
+    unsigned int iobase = dev->base_addr;
     netwave_private *priv = netdev_priv(dev);
     u_char __iomem * ramBase = priv->ramBase;
     u_char  rcvMode = 0;
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index c2037b2..06eea6a 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -149,7 +149,7 @@ psa_write(struct net_device *	dev,
   net_local *lp = netdev_priv(dev);
   u_char __iomem *ptr = lp->mem + PSA_ADDR + (o << 1);
   int		count = 0;
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   /* As there seem to have no flag PSA_BUSY as in the ISA model, we are
    * oblige to verify this address to know when the PSA is ready... */
   volatile u_char __iomem *verify = lp->mem + PSA_ADDR +
@@ -708,7 +708,7 @@ static void wl_update_history(wavepoint_history *wavepoint, unsigned char sigqua
 /* Perform a handover to a new WavePoint */
 static void wv_roam_handover(wavepoint_history *wavepoint, net_local *lp)
 {
-  kio_addr_t		base = lp->dev->base_addr;
+  unsigned int		base = lp->dev->base_addr;
   mm_t                  m;
   unsigned long         flags;
 
@@ -821,7 +821,7 @@ wv_82593_cmd(struct net_device *	dev,
 	     int	cmd,
 	     int	result)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   int		status;
   int		wait_completed;
   long		spin;
@@ -945,7 +945,7 @@ read_ringbuf(struct net_device *	dev,
 	     char *	buf,
 	     int	len)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   int		ring_ptr = addr;
   int		chunk_len;
   char *	buf_ptr = buf;
@@ -1096,7 +1096,7 @@ wv_psa_show(psa_t *	p)
 static void
 wv_mmc_show(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   net_local *	lp = netdev_priv(dev);
   mmr_t		m;
 
@@ -1275,7 +1275,7 @@ wv_packet_info(u_char *		p,		/* Packet to dump */
 static inline void
 wv_init_info(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   psa_t		psa;
   DECLARE_MAC_BUF(mac);
 
@@ -1294,7 +1294,7 @@ wv_init_info(struct net_device *	dev)
 
 #ifdef DEBUG_BASIC_SHOW
   /* Now, let's go for the basic stuff */
-  printk(KERN_NOTICE "%s: WaveLAN: port %#lx, irq %d, "
+  printk(KERN_NOTICE "%s: WaveLAN: port %#x, irq %d, "
 	 "hw_addr %s",
 	 dev->name, base, dev->irq,
 	 print_mac(mac, dev->dev_addr));
@@ -1828,7 +1828,7 @@ static int wavelan_set_nwid(struct net_device *dev,
 			    union iwreq_data *wrqu,
 			    char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	psa_t psa;
 	mm_t m;
@@ -1918,7 +1918,7 @@ static int wavelan_set_freq(struct net_device *dev,
 			    union iwreq_data *wrqu,
 			    char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	unsigned long flags;
 	int ret;
@@ -1948,7 +1948,7 @@ static int wavelan_get_freq(struct net_device *dev,
 			    union iwreq_data *wrqu,
 			    char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	psa_t psa;
 	unsigned long flags;
@@ -1994,7 +1994,7 @@ static int wavelan_set_sens(struct net_device *dev,
 			    union iwreq_data *wrqu,
 			    char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	psa_t psa;
 	unsigned long flags;
@@ -2060,7 +2060,7 @@ static int wavelan_set_encode(struct net_device *dev,
 			      union iwreq_data *wrqu,
 			      char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	unsigned long flags;
 	psa_t psa;
@@ -2130,7 +2130,7 @@ static int wavelan_get_encode(struct net_device *dev,
 			      union iwreq_data *wrqu,
 			      char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	psa_t psa;
 	unsigned long flags;
@@ -2349,7 +2349,7 @@ static int wavelan_get_range(struct net_device *dev,
 			     union iwreq_data *wrqu,
 			     char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	struct iw_range *range = (struct iw_range *) extra;
 	unsigned long flags;
@@ -2425,7 +2425,7 @@ static int wavelan_set_qthr(struct net_device *dev,
 			    union iwreq_data *wrqu,
 			    char *extra)
 {
-	kio_addr_t base = dev->base_addr;
+	unsigned int base = dev->base_addr;
 	net_local *lp = netdev_priv(dev);
 	psa_t psa;
 	unsigned long flags;
@@ -2701,7 +2701,7 @@ static const struct iw_handler_def	wavelan_handler_def =
 static iw_stats *
 wavelan_get_wireless_stats(struct net_device *	dev)
 {
-  kio_addr_t		base = dev->base_addr;
+  unsigned int		base = dev->base_addr;
   net_local *		lp = netdev_priv(dev);
   mmr_t			m;
   iw_stats *		wstats;
@@ -2764,7 +2764,7 @@ wv_start_of_frame(struct net_device *	dev,
 		  int		rfp,	/* end of frame */
 		  int		wrap)	/* start of buffer */
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   int		rp;
   int		len;
 
@@ -2925,7 +2925,7 @@ wv_packet_read(struct net_device *		dev,
 static inline void
 wv_packet_rcv(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   net_local *	lp = netdev_priv(dev);
   int		newrfp;
   int		rp;
@@ -3062,7 +3062,7 @@ wv_packet_write(struct net_device *	dev,
 		short		length)
 {
   net_local *		lp = netdev_priv(dev);
-  kio_addr_t		base = dev->base_addr;
+  unsigned int		base = dev->base_addr;
   unsigned long		flags;
   int			clen = length;
   register u_short	xmtdata_base = TX_BASE;
@@ -3183,7 +3183,7 @@ wavelan_packet_xmit(struct sk_buff *	skb,
 static inline int
 wv_mmc_init(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   psa_t		psa;
   mmw_t		m;
   int		configured;
@@ -3377,7 +3377,7 @@ wv_mmc_init(struct net_device *	dev)
 static int
 wv_ru_stop(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   net_local *	lp = netdev_priv(dev);
   unsigned long	flags;
   int		status;
@@ -3440,7 +3440,7 @@ wv_ru_stop(struct net_device *	dev)
 static int
 wv_ru_start(struct net_device *	dev)
 {
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
   net_local *	lp = netdev_priv(dev);
   unsigned long	flags;
 
@@ -3528,7 +3528,7 @@ wv_ru_start(struct net_device *	dev)
 static int
 wv_82593_config(struct net_device *	dev)
 {
-  kio_addr_t			base = dev->base_addr;
+  unsigned int			base = dev->base_addr;
   net_local *			lp = netdev_priv(dev);
   struct i82593_conf_block	cfblk;
   int				ret = TRUE;
@@ -3765,7 +3765,7 @@ static int
 wv_hw_config(struct net_device *	dev)
 {
   net_local *		lp = netdev_priv(dev);
-  kio_addr_t		base = dev->base_addr;
+  unsigned int		base = dev->base_addr;
   unsigned long		flags;
   int			ret = FALSE;
 
@@ -4047,7 +4047,7 @@ wavelan_interrupt(int		irq,
 {
   struct net_device *	dev = dev_id;
   net_local *	lp;
-  kio_addr_t	base;
+  unsigned int	base;
   int		status0;
   u_int		tx_status;
 
@@ -4306,7 +4306,7 @@ static void
 wavelan_watchdog(struct net_device *	dev)
 {
   net_local *		lp = netdev_priv(dev);
-  kio_addr_t		base = dev->base_addr;
+  unsigned int		base = dev->base_addr;
   unsigned long		flags;
   int			aborted = FALSE;
 
@@ -4382,7 +4382,7 @@ wavelan_open(struct net_device *	dev)
 {
   net_local *	lp = netdev_priv(dev);
   struct pcmcia_device *	link = lp->link;
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
 
 #ifdef DEBUG_CALLBACK_TRACE
   printk(KERN_DEBUG "%s: ->wavelan_open(dev=0x%x)\n", dev->name,
@@ -4436,7 +4436,7 @@ static int
 wavelan_close(struct net_device *	dev)
 {
   struct pcmcia_device *	link = ((net_local *)netdev_priv(dev))->link;
-  kio_addr_t	base = dev->base_addr;
+  unsigned int	base = dev->base_addr;
 
 #ifdef DEBUG_CALLBACK_TRACE
   printk(KERN_DEBUG "%s: ->wavelan_close(dev=0x%x)\n", dev->name,
diff --git a/drivers/pcmcia/i82092.c b/drivers/pcmcia/i82092.c
index df21e2d..7495155 100644
--- a/drivers/pcmcia/i82092.c
+++ b/drivers/pcmcia/i82092.c
@@ -82,7 +82,7 @@ struct socket_info {
 				    1 = empty socket, 
 				    2 = card but not initialized,
 				    3 = operational card */
-	kio_addr_t io_base; 	/* base io address of the socket */
+	unsigned int io_base; 	/* base io address of the socket */
 	
 	struct pcmcia_socket socket;
 	struct pci_dev *dev;	/* The PCI device for the socket */
diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c
index 839bb1c..32a2ab1 100644
--- a/drivers/pcmcia/i82365.c
+++ b/drivers/pcmcia/i82365.c
@@ -164,7 +164,7 @@ struct i82365_socket {
     u_short		type, flags;
     struct pcmcia_socket	socket;
     unsigned int	number;
-    kio_addr_t		ioaddr;
+    unsigned int	ioaddr;
     u_short		psock;
     u_char		cs_irq, intr;
     union {
@@ -238,7 +238,7 @@ static u_char i365_get(u_short sock, u_short reg)
     unsigned long flags;
     spin_lock_irqsave(&bus_lock,flags);
     {
-	kio_addr_t port = socket[sock].ioaddr;
+	unsigned int port = socket[sock].ioaddr;
 	u_char val;
 	reg = I365_REG(socket[sock].psock, reg);
 	outb(reg, port); val = inb(port+1);
@@ -252,7 +252,7 @@ static void i365_set(u_short sock, u_short reg, u_char data)
     unsigned long flags;
     spin_lock_irqsave(&bus_lock,flags);
     {
-	kio_addr_t port = socket[sock].ioaddr;
+	unsigned int port = socket[sock].ioaddr;
 	u_char val = I365_REG(socket[sock].psock, reg);
 	outb(val, port); outb(data, port+1);
 	spin_unlock_irqrestore(&bus_lock,flags);
@@ -588,7 +588,7 @@ static int to_cycles(int ns)
 
 /*====================================================================*/
 
-static int __init identify(kio_addr_t port, u_short sock)
+static int __init identify(unsigned int port, u_short sock)
 {
     u_char val;
     int type = -1;
@@ -659,7 +659,7 @@ static int __init identify(kio_addr_t port, u_short sock)
 static int __init is_alive(u_short sock)
 {
     u_char stat;
-    kio_addr_t start, stop;
+    unsigned int start, stop;
     
     stat = i365_get(sock, I365_STATUS);
     start = i365_get_pair(sock, I365_IO(0)+I365_W_START);
@@ -678,7 +678,7 @@ static int __init is_alive(u_short sock)
 
 /*====================================================================*/
 
-static void __init add_socket(kio_addr_t port, int psock, int type)
+static void __init add_socket(unsigned int port, int psock, int type)
 {
     socket[sockets].ioaddr = port;
     socket[sockets].psock = psock;
@@ -698,7 +698,7 @@ static void __init add_pcic(int ns, int type)
     base = sockets-ns;
     if (base == 0) printk("\n");
     printk(KERN_INFO "  %s", pcic[type].name);
-    printk(" ISA-to-PCMCIA at port %#lx ofs 0x%02x",
+    printk(" ISA-to-PCMCIA at port %#x ofs 0x%02x",
 	       t->ioaddr, t->psock*0x40);
     printk(", %d socket%s\n", ns, ((ns > 1) ? "s" : ""));
 
@@ -772,7 +772,7 @@ static struct pnp_dev *i82365_pnpdev;
 static void __init isa_probe(void)
 {
     int i, j, sock, k, ns, id;
-    kio_addr_t port;
+    unsigned int port;
 #ifdef CONFIG_PNP
     struct isapnp_device_id *devid;
     struct pnp_dev *dev;
@@ -1053,7 +1053,7 @@ static int i365_set_io_map(u_short sock, struct pccard_io_map *io)
     u_char map, ioctl;
     
     debug(1, "SetIOMap(%d, %d, %#2.2x, %d ns, "
-	  "%#lx-%#lx)\n", sock, io->map, io->flags,
+	  "%#x-%#x)\n", sock, io->map, io->flags,
 	  io->speed, io->start, io->stop);
     map = io->map;
     if ((map > 1) || (io->start > 0xffff) || (io->stop > 0xffff) ||
diff --git a/drivers/pcmcia/m32r_cfc.c b/drivers/pcmcia/m32r_cfc.c
index 91da15b..3616da2 100644
--- a/drivers/pcmcia/m32r_cfc.c
+++ b/drivers/pcmcia/m32r_cfc.c
@@ -58,7 +58,7 @@ typedef struct pcc_socket {
 	u_short			type, flags;
 	struct pcmcia_socket	socket;
 	unsigned int		number;
- 	kio_addr_t		ioaddr;
+	unsigned int		ioaddr;
 	u_long			mapaddr;
 	u_long			base;	/* PCC register base */
 	u_char			cs_irq1, cs_irq2, intr;
@@ -298,7 +298,8 @@ static int __init is_alive(u_short sock)
 	return 0;
 }
 
-static void add_pcc_socket(ulong base, int irq, ulong mapaddr, kio_addr_t ioaddr)
+static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
+			   unsigned int ioaddr)
 {
 	pcc_socket_t *t = &socket[pcc_sockets];
 
@@ -738,7 +739,7 @@ static int __init init_m32r_pcc(void)
 #else	/* CONFIG_PLAT_USRV */
 	{
 		ulong base, mapaddr;
-		kio_addr_t ioaddr;
+		unsigned int ioaddr;
 
 		for (i = 0 ; i < M32R_MAX_PCC ; i++) {
 			base = (ulong)PLD_CFRSTCR;
diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c
index ec4c125..2b42b71 100644
--- a/drivers/pcmcia/m32r_pcc.c
+++ b/drivers/pcmcia/m32r_pcc.c
@@ -65,7 +65,7 @@ typedef struct pcc_socket {
 	u_short			type, flags;
 	struct pcmcia_socket	socket;
 	unsigned int		number;
- 	kio_addr_t		ioaddr;
+	unsigned int		ioaddr;
 	u_long			mapaddr;
 	u_long			base;	/* PCC register base */
 	u_char			cs_irq, intr;
@@ -310,7 +310,8 @@ static int __init is_alive(u_short sock)
 	return 0;
 }
 
-static void add_pcc_socket(ulong base, int irq, ulong mapaddr, kio_addr_t ioaddr)
+static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
+			   unsigned int ioaddr)
 {
   	pcc_socket_t *t = &socket[pcc_sockets];
 
@@ -491,7 +492,7 @@ static int _pcc_set_io_map(u_short sock, struct pccard_io_map *io)
 	u_char map;
 
 	debug(3, "m32r-pcc: SetIOMap(%d, %d, %#2.2x, %d ns, "
-		  "%#lx-%#lx)\n", sock, io->map, io->flags,
+		  "%#x-%#x)\n", sock, io->map, io->flags,
 		  io->speed, io->start, io->stop);
 	map = io->map;
 
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index bfcaad6..a8d1007 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -186,15 +186,16 @@ static int sub_interval(struct resource_map *map, u_long base, u_long num)
 ======================================================================*/
 
 #ifdef CONFIG_PCMCIA_PROBE
-static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num)
+static void do_io_probe(struct pcmcia_socket *s, unsigned int base,
+			unsigned int num)
 {
     struct resource *res;
     struct socket_data *s_data = s->resource_data;
-    kio_addr_t i, j, bad;
+    unsigned int i, j, bad;
     int any;
     u_char *b, hole, most;
 
-    printk(KERN_INFO "cs: IO port probe %#lx-%#lx:",
+    printk(KERN_INFO "cs: IO port probe %#x-%#x:",
 	   base, base+num-1);
 
     /* First, what does a floating port look like? */
@@ -233,7 +234,7 @@ static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num
 	} else {
 	    if (bad) {
 		sub_interval(&s_data->io_db, bad, i-bad);
-		printk(" %#lx-%#lx", bad, i-1);
+		printk(" %#x-%#x", bad, i-1);
 		bad = 0;
 	    }
 	}
@@ -244,7 +245,7 @@ static void do_io_probe(struct pcmcia_socket *s, kio_addr_t base, kio_addr_t num
 	    return;
 	} else {
 	    sub_interval(&s_data->io_db, bad, i-bad);
-	    printk(" %#lx-%#lx", bad, i-1);
+	    printk(" %#x-%#x", bad, i-1);
 	}
     }
 
diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c
index 749ac37..5792bd5 100644
--- a/drivers/pcmcia/tcic.c
+++ b/drivers/pcmcia/tcic.c
@@ -719,7 +719,7 @@ static int tcic_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io)
     u_short base, len, ioctl;
     
     debug(1, "SetIOMap(%d, %d, %#2.2x, %d ns, "
-	  "%#lx-%#lx)\n", psock, io->map, io->flags,
+	  "%#x-%#x)\n", psock, io->map, io->flags,
 	  io->speed, io->start, io->stop);
     if ((io->map > 1) || (io->start > 0xffff) || (io->stop > 0xffff) ||
 	(io->stop < io->start)) return -EINVAL;
diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c
index d8b6600..164d2a4 100644
--- a/drivers/serial/serial_cs.c
+++ b/drivers/serial/serial_cs.c
@@ -389,7 +389,7 @@ static void serial_detach(struct pcmcia_device *link)
 /*====================================================================*/
 
 static int setup_serial(struct pcmcia_device *handle, struct serial_info * info,
-			kio_addr_t iobase, int irq)
+			unsigned int iobase, int irq)
 {
 	struct uart_port port;
 	int line;
@@ -456,7 +456,7 @@ next_tuple(struct pcmcia_device *handle, tuple_t * tuple, cisparse_t * parse)
 
 static int simple_config(struct pcmcia_device *link)
 {
-	static const kio_addr_t base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
+	static const unsigned int base[5] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8, 0x0 };
 	static const int size_table[2] = { 8, 16 };
 	struct serial_info *info = link->priv;
 	struct serial_cfg_mem *cfg_mem;
@@ -480,7 +480,7 @@ static int simple_config(struct pcmcia_device *link)
 	/* If the card is already configured, look up the port and irq */
 	i = pcmcia_get_configuration_info(link, &config);
 	if ((i == CS_SUCCESS) && (config.Attributes & CONF_VALID_CLIENT)) {
-		kio_addr_t port = 0;
+		unsigned int port = 0;
 		if ((config.BasePort2 != 0) && (config.NumPorts2 == 8)) {
 			port = config.BasePort2;
 			info->slave = 1;
diff --git a/include/pcmcia/cs_types.h b/include/pcmcia/cs_types.h
index 5f38803..9a6bcc4 100644
--- a/include/pcmcia/cs_types.h
+++ b/include/pcmcia/cs_types.h
@@ -27,7 +27,6 @@ typedef u_int   ioaddr_t;
 #else
 typedef u_short	ioaddr_t;
 #endif
-typedef unsigned long kio_addr_t;
 
 typedef u_short	socket_t;
 typedef u_int	event_t;
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 6e84258..f95dca0 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -92,7 +92,7 @@ typedef struct pccard_io_map {
     u_char	map;
     u_char	flags;
     u_short	speed;
-    kio_addr_t	start, stop;
+    u_int	start, stop;
 } pccard_io_map;
 
 typedef struct pccard_mem_map {
@@ -155,7 +155,7 @@ extern struct pccard_resource_ops pccard_iodyn_ops;
 struct pcmcia_socket;
 
 typedef struct io_window_t {
-	kio_addr_t		InUse, Config;
+	u_int			InUse, Config;
 	struct resource		*res;
 } io_window_t;
 
@@ -208,7 +208,7 @@ struct pcmcia_socket {
 	u_int				features;
 	u_int				irq_mask;
 	u_int				map_size;
-	kio_addr_t			io_offset;
+	u_int				io_offset;
 	u_char				pci_irq;
 	struct pci_dev *		cb_dev;
 
-- 
cgit v1.1


From d6b4fa6d698f5cf331ead08db4ba5e60cd3c83be Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 4 Feb 2008 22:27:37 -0800
Subject: pcmcia: stop updating dev->power.power_state

This stops the pcmcia core from using dev->power.power_state; that field is
deprecated (overdue for removal) and the only reason to update it was to make
the /sys/devices/.../power/state files (now removed) work better.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pcmcia/ds.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 15c18f5..846468c 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -1130,8 +1130,6 @@ static int runtime_suspend(struct device *dev)
 	down(&dev->sem);
 	rc = pcmcia_dev_suspend(dev, PMSG_SUSPEND);
 	up(&dev->sem);
-	if (!rc)
-		dev->power.power_state.event = PM_EVENT_SUSPEND;
 	return rc;
 }
 
@@ -1142,8 +1140,6 @@ static void runtime_resume(struct device *dev)
 	down(&dev->sem);
 	rc = pcmcia_dev_resume(dev);
 	up(&dev->sem);
-	if (!rc)
-		dev->power.power_state.event = PM_EVENT_ON;
 }
 
 /************************ per-device sysfs output ***************************/
@@ -1265,6 +1261,9 @@ static int pcmcia_dev_suspend(struct device * dev, pm_message_t state)
 	struct pcmcia_driver *p_drv = NULL;
 	int ret = 0;
 
+	if (p_dev->suspended)
+		return 0;
+
 	ds_dbg(2, "suspending %s\n", dev->bus_id);
 
 	if (dev->driver)
@@ -1301,6 +1300,9 @@ static int pcmcia_dev_resume(struct device * dev)
         struct pcmcia_driver *p_drv = NULL;
 	int ret = 0;
 
+	if (!p_dev->suspended)
+		return 0;
+
 	ds_dbg(2, "resuming %s\n", dev->bus_id);
 
 	if (dev->driver)
-- 
cgit v1.1


From 52debb06238b8076ec2667359668d4c5e38e8807 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 4 Feb 2008 22:27:38 -0800
Subject: pcmcia: include bad CIS filename in error message

- Print the invalid CIS filename in the invalid filename message.
- Use sizeof() instead of hard-coded constant for buffer size.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pcmcia/ds.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 846468c..5a85871 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -865,11 +865,12 @@ static int pcmcia_load_firmware(struct pcmcia_device *dev, char * filename)
 	ds_dbg(1, "trying to load CIS file %s\n", filename);
 
 	if (strlen(filename) > 14) {
-		printk(KERN_WARNING "pcmcia: CIS filename is too long\n");
+		printk(KERN_WARNING "pcmcia: CIS filename is too long [%s]\n",
+			filename);
 		return -EINVAL;
 	}
 
-	snprintf(path, 20, "%s", filename);
+	snprintf(path, sizeof(path), "%s", filename);
 
 	if (request_firmware(&fw, path, &dev->dev) == 0) {
 		if (fw->size >= CISTPL_MAX_CIS_SIZE) {
-- 
cgit v1.1


From 1569d9e89a77d51750497dc23d303e27d0d9494d Mon Sep 17 00:00:00 2001
From: Richard Knutsson <ricknu-0@student.ltu.se>
Date: Mon, 4 Feb 2008 22:27:39 -0800
Subject: pcmcia/3c574_cs: fix 'shadow variable' warning

Fixing:
  CHECK   drivers/net/pcmcia/3c574_cs.c
drivers/net/pcmcia/3c574_cs.c:695:7: warning: symbol 'i' shadows an earlier one
drivers/net/pcmcia/3c574_cs.c:636:6: originally declared here

Signed-off-by: Richard Knutson <ricknu-0@student.ltu.se>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/pcmcia/3c574_cs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index dafbbdb..3b78a38 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -696,7 +696,7 @@ static void tc574_reset(struct net_device *dev)
 	mdio_write(ioaddr, lp->phys, 4, lp->advertising);
 	if (!auto_polarity) {
 		/* works for TDK 78Q2120 series MII's */
-		int i = mdio_read(ioaddr, lp->phys, 16) | 0x20;
+		i = mdio_read(ioaddr, lp->phys, 16) | 0x20;
 		mdio_write(ioaddr, lp->phys, 16, i);
 	}
 
-- 
cgit v1.1


From 6b2e43861b09bce857d41d47c853003be587a575 Mon Sep 17 00:00:00 2001
From: Richard Knutsson <ricknu-0@student.ltu.se>
Date: Mon, 4 Feb 2008 22:27:40 -0800
Subject: pcmcia/axnet_cs: make functions static

Fixing:
  CHECK   drivers/net/pcmcia/axnet_cs.c
drivers/net/pcmcia/axnet_cs.c:994:5: warning: symbol 'ax_close' was not declared. Should it be static?
drivers/net/pcmcia/axnet_cs.c:1017:6: warning: symbol 'ei_tx_timeout' was not declared. Should it be static?

Signed-off-by: Richard Knutsson <ricknu-0@student.ltu.se>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/pcmcia/axnet_cs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index e115881..9d4d706 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -991,7 +991,7 @@ static int ax_open(struct net_device *dev)
  *
  * Opposite of ax_open(). Only used when "ifconfig <devname> down" is done.
  */
-int ax_close(struct net_device *dev)
+static int ax_close(struct net_device *dev)
 {
 	unsigned long flags;
 
@@ -1014,7 +1014,7 @@ int ax_close(struct net_device *dev)
  * completed (or failed) - i.e. never posted a Tx related interrupt.
  */
 
-void ei_tx_timeout(struct net_device *dev)
+static void ei_tx_timeout(struct net_device *dev)
 {
 	long e8390_base = dev->base_addr;
 	struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev);
-- 
cgit v1.1


From c61f26fa609f11a471a68668b838b7366b2b75e0 Mon Sep 17 00:00:00 2001
From: Richard Knutsson <ricknu-0@student.ltu.se>
Date: Mon, 4 Feb 2008 22:27:41 -0800
Subject: pcmcia/axnet_cs: make use of 'max()' instead of handcrafted one

Use 'max(x,y)' instead of 'x < y ? y : x'.

Signed-off-by: Richard Knutsson <ricknu-0@student.ltu.se>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/pcmcia/axnet_cs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 9d4d706..e8a63e4 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -1087,8 +1087,8 @@ static int ei_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	
 	ei_local->irqlock = 1;
 
-	send_length = ETH_ZLEN < length ? length : ETH_ZLEN;
-	
+	send_length = max(length, ETH_ZLEN);
+
 	/*
 	 * We have two Tx slots available for use. Find the first free
 	 * slot, and then perform some sanity checks. With two Tx bufs,
-- 
cgit v1.1


From cfd4734c1e686164c87b9f31b67401cdf6f34238 Mon Sep 17 00:00:00 2001
From: Richard Knutsson <ricknu-0@student.ltu.se>
Date: Mon, 4 Feb 2008 22:27:41 -0800
Subject: pcmcia/fmvj18x_cs: fix 'shadow variable' warning

Fixing:
  CHECK   drivers/net/pcmcia/fmvj18x_cs.c
drivers/net/pcmcia/fmvj18x_cs.c:1205:6: warning: symbol 'i' shadows an earlier one
drivers/net/pcmcia/fmvj18x_cs.c:1179:9: originally declared here

Signed-off-by: Richard Knutsson <ricknu-0@student.ltu.se>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/pcmcia/fmvj18x_cs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index 7cb2253..8f328a0 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -1198,8 +1198,7 @@ static void set_rx_mode(struct net_device *dev)
 	outb(1, ioaddr + RX_MODE);	/* Ignore almost all multicasts. */
     } else {
 	struct dev_mc_list *mclist;
-	int i;
-	
+
 	memset(mc_filter, 0, sizeof(mc_filter));
 	for (i = 0, mclist = dev->mc_list; mclist && i < dev->mc_count;
 	     i++, mclist = mclist->next) {
-- 
cgit v1.1


From 9ab9898e32971b938f9e8a997b12d0c4dd4832f7 Mon Sep 17 00:00:00 2001
From: Richard Knutsson <ricknu-0@student.ltu.se>
Date: Mon, 4 Feb 2008 22:27:42 -0800
Subject: pcmcia/pcnet_cs: fix 'shadow variable' warning

Fixing:
  CHECK   drivers/net/pcmcia/pcnet_cs.c
drivers/net/pcmcia/pcnet_cs.c:523:15: warning: symbol 'hw_info' shadows an earlier one
drivers/net/pcmcia/pcnet_cs.c:148:18: originally declared here

Signed-off-by: Richard Knutsson <ricknu-0@student.ltu.se>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/pcmcia/pcnet_cs.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 6bc48a0..6323988 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -521,7 +521,7 @@ static int pcnet_config(struct pcmcia_device *link)
     int i, last_ret, last_fn, start_pg, stop_pg, cm_offset;
     int has_shmem = 0;
     u_short buf[64];
-    hw_info_t *hw_info;
+    hw_info_t *local_hw_info;
     DECLARE_MAC_BUF(mac);
 
     DEBUG(0, "pcnet_config(0x%p)\n", link);
@@ -590,23 +590,23 @@ static int pcnet_config(struct pcmcia_device *link)
 	dev->if_port = 0;
     }
 
-    hw_info = get_hwinfo(link);
-    if (hw_info == NULL)
-	hw_info = get_prom(link);
-    if (hw_info == NULL)
-	hw_info = get_dl10019(link);
-    if (hw_info == NULL)
-	hw_info = get_ax88190(link);
-    if (hw_info == NULL)
-	hw_info = get_hwired(link);
-
-    if (hw_info == NULL) {
+    local_hw_info = get_hwinfo(link);
+    if (local_hw_info == NULL)
+	local_hw_info = get_prom(link);
+    if (local_hw_info == NULL)
+	local_hw_info = get_dl10019(link);
+    if (local_hw_info == NULL)
+	local_hw_info = get_ax88190(link);
+    if (local_hw_info == NULL)
+	local_hw_info = get_hwired(link);
+
+    if (local_hw_info == NULL) {
 	printk(KERN_NOTICE "pcnet_cs: unable to read hardware net"
 	       " address for io base %#3lx\n", dev->base_addr);
 	goto failed;
     }
 
-    info->flags = hw_info->flags;
+    info->flags = local_hw_info->flags;
     /* Check for user overrides */
     info->flags |= (delay_output) ? DELAY_OUTPUT : 0;
     if ((link->manf_id == MANFID_SOCKET) &&
-- 
cgit v1.1


From 4c1fc445c29c6208c44e10c0253beea890bf5435 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:27:42 -0800
Subject: at91_cf: use generic gpio calls

Update the AT91 CF driver to use the generic GPIO calls instead of the
AT91-specific ones; and request exclusive use of those signals.

Minor tweaks to cleanup code paths: always in reverse order of how the
resources were allocated, with remove() matching the fault paths of
probe().

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pcmcia/at91_cf.c | 62 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c
index eb6abd3..385e145 100644
--- a/drivers/pcmcia/at91_cf.c
+++ b/drivers/pcmcia/at91_cf.c
@@ -21,9 +21,9 @@
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/sizes.h>
+#include <asm/gpio.h>
 
 #include <asm/arch/board.h>
-#include <asm/arch/gpio.h>
 #include <asm/arch/at91rm9200_mc.h>
 
 
@@ -56,7 +56,7 @@ struct at91_cf_socket {
 
 static inline int at91_cf_present(struct at91_cf_socket *cf)
 {
-	return !at91_get_gpio_value(cf->board->det_pin);
+	return !gpio_get_value(cf->board->det_pin);
 }
 
 /*--------------------------------------------------------------------------*/
@@ -100,9 +100,9 @@ static int at91_cf_get_status(struct pcmcia_socket *s, u_int *sp)
 		int vcc	= cf->board->vcc_pin;
 
 		*sp = SS_DETECT | SS_3VCARD;
-		if (!rdy || at91_get_gpio_value(rdy))
+		if (!rdy || gpio_get_value(rdy))
 			*sp |= SS_READY;
-		if (!vcc || at91_get_gpio_value(vcc))
+		if (!vcc || gpio_get_value(vcc))
 			*sp |= SS_POWERON;
 	} else
 		*sp = 0;
@@ -121,10 +121,10 @@ at91_cf_set_socket(struct pcmcia_socket *sock, struct socket_state_t *s)
 	if (cf->board->vcc_pin) {
 		switch (s->Vcc) {
 			case 0:
-				at91_set_gpio_value(cf->board->vcc_pin, 0);
+				gpio_set_value(cf->board->vcc_pin, 0);
 				break;
 			case 33:
-				at91_set_gpio_value(cf->board->vcc_pin, 1);
+				gpio_set_value(cf->board->vcc_pin, 1);
 				break;
 			default:
 				return -EINVAL;
@@ -132,7 +132,7 @@ at91_cf_set_socket(struct pcmcia_socket *sock, struct socket_state_t *s)
 	}
 
 	/* toggle reset if needed */
-	at91_set_gpio_value(cf->board->rst_pin, s->flags & SS_RESET);
+	gpio_set_value(cf->board->rst_pin, s->flags & SS_RESET);
 
 	pr_debug("%s: Vcc %d, io_irq %d, flags %04x csc %04x\n",
 		driver_name, s->Vcc, s->io_irq, s->flags, s->csc_mask);
@@ -239,11 +239,24 @@ static int __init at91_cf_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, cf);
 
 	/* must be a GPIO; ergo must trigger on both edges */
-	status = request_irq(board->det_pin, at91_cf_irq, 0, driver_name, cf);
+	status = gpio_request(board->det_pin, "cf_det");
 	if (status < 0)
 		goto fail0;
+	status = request_irq(board->det_pin, at91_cf_irq, 0, driver_name, cf);
+	if (status < 0)
+		goto fail00;
 	device_init_wakeup(&pdev->dev, 1);
 
+	status = gpio_request(board->rst_pin, "cf_rst");
+	if (status < 0)
+		goto fail0a;
+
+	if (board->vcc_pin) {
+		status = gpio_request(board->vcc_pin, "cf_vcc");
+		if (status < 0)
+			goto fail0b;
+	}
+
 	/*
 	 * The card driver will request this irq later as needed.
 	 * but it causes lots of "irqNN: nobody cared" messages
@@ -251,16 +264,20 @@ static int __init at91_cf_probe(struct platform_device *pdev)
 	 * (Note:  DK board doesn't wire the IRQ pin...)
 	 */
 	if (board->irq_pin) {
+		status = gpio_request(board->irq_pin, "cf_irq");
+		if (status < 0)
+			goto fail0c;
 		status = request_irq(board->irq_pin, at91_cf_irq,
 				IRQF_SHARED, driver_name, cf);
 		if (status < 0)
-			goto fail0a;
+			goto fail0d;
 		cf->socket.pci_irq = board->irq_pin;
 	} else
 		cf->socket.pci_irq = NR_IRQS + 1;
 
 	/* pcmcia layer only remaps "real" memory not iospace */
-	cf->socket.io_offset = (unsigned long) ioremap(cf->phys_baseaddr + CF_IO_PHYS, SZ_2K);
+	cf->socket.io_offset = (unsigned long)
+			ioremap(cf->phys_baseaddr + CF_IO_PHYS, SZ_2K);
 	if (!cf->socket.io_offset) {
 		status = -ENXIO;
 		goto fail1;
@@ -296,11 +313,21 @@ fail2:
 fail1:
 	if (cf->socket.io_offset)
 		iounmap((void __iomem *) cf->socket.io_offset);
-	if (board->irq_pin)
+	if (board->irq_pin) {
 		free_irq(board->irq_pin, cf);
+fail0d:
+		gpio_free(board->irq_pin);
+	}
+fail0c:
+	if (board->vcc_pin)
+		gpio_free(board->vcc_pin);
+fail0b:
+	gpio_free(board->rst_pin);
 fail0a:
 	device_init_wakeup(&pdev->dev, 0);
 	free_irq(board->det_pin, cf);
+fail00:
+	gpio_free(board->det_pin);
 fail0:
 	kfree(cf);
 	return status;
@@ -313,13 +340,18 @@ static int __exit at91_cf_remove(struct platform_device *pdev)
 	struct resource		*io = cf->socket.io[0].res;
 
 	pcmcia_unregister_socket(&cf->socket);
-	if (board->irq_pin)
+	release_mem_region(io->start, io->end + 1 - io->start);
+	iounmap((void __iomem *) cf->socket.io_offset);
+	if (board->irq_pin) {
 		free_irq(board->irq_pin, cf);
+		gpio_free(board->irq_pin);
+	}
+	if (board->vcc_pin)
+		gpio_free(board->vcc_pin);
+	gpio_free(board->rst_pin);
 	device_init_wakeup(&pdev->dev, 0);
 	free_irq(board->det_pin, cf);
-	iounmap((void __iomem *) cf->socket.io_offset);
-	release_mem_region(io->start, io->end + 1 - io->start);
-
+	gpio_free(board->det_pin);
 	kfree(cf);
 	return 0;
 }
-- 
cgit v1.1


From 5a1c3e1aa977457ded6fd0739e032c9684bf23bd Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 4 Feb 2008 22:27:44 -0800
Subject: drivers/pcmcia: Add missing iounmap

of_iomap calls ioremap, and so should be matched with an iounmap.  At the
two error returns, the result of calling of_iomap is only stored in a local
variable, so these error paths need to call iounmap.  Furthermore, this
function ultimately stores the result of of_iomap in an array that is local
to the file.  These values should be iounmapped at some point.  I have
added a corresponding call to iounmap at the end of the function
m8xx_remove.

The problem was found using the following semantic match.
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@@
type T,T1,T2;
identifier E;
statement S;
expression x1,x2,x3;
int ret;
@@

  T E;
  ...
* E = of_iomap(...);
  if (E == NULL) S
  ... when != iounmap(...,(T1)E,...)
      when != if (E != NULL) { ... iounmap(...,(T1)E,...); ...}
      when != x1 = (T1)E
      when != E = x3;
      when any
  if (...) {
    ... when != iounmap(...,(T2)E,...)
        when != if (E != NULL) { ... iounmap(...,(T2)E,...); ...}
        when != x2 = (T2)E
(
*   return;
|
*   return ret;
)
  }
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Vitaly Bordug <vitb@kernel.crashing.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Olof Johansson <olof@lixom.net>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pcmcia/m8xx_pcmcia.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 4ea426a..ac70d2c 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -1174,8 +1174,10 @@ static int __init m8xx_probe(struct of_device *ofdev,
 
 	pcmcia_schlvl = irq_of_parse_and_map(np, 0);
 	hwirq = irq_map[pcmcia_schlvl].hwirq;
-	if (pcmcia_schlvl < 0)
+	if (pcmcia_schlvl < 0) {
+		iounmap(pcmcia);
 		return -EINVAL;
+	}
 
 	m8xx_pgcrx[0] = &pcmcia->pcmc_pgcra;
 	m8xx_pgcrx[1] = &pcmcia->pcmc_pgcrb;
@@ -1189,6 +1191,7 @@ static int __init m8xx_probe(struct of_device *ofdev,
 			driver_name, socket)) {
 		pcmcia_error("Cannot allocate IRQ %u for SCHLVL!\n",
 			     pcmcia_schlvl);
+		iounmap(pcmcia);
 		return -1;
 	}
 
@@ -1284,6 +1287,7 @@ static int m8xx_remove(struct of_device *ofdev)
 	}
 	for (i = 0; i < PCMCIA_SOCKETS_NO; i++)
 		pcmcia_unregister_socket(&socket[i].socket);
+	iounmap(pcmcia);
 
 	free_irq(pcmcia_schlvl, NULL);
 
-- 
cgit v1.1


From 1523508d6321436b6edfcd99aab04a344f9aed3f Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 4 Feb 2008 22:27:46 -0800
Subject: drivers/pcmcia: add missing pci_dev_get

pci_get_slot does a pci_dev_get, so pci_dev_put needs to be called in an
error case.

An extract of the semantic match used to find the problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@@
type find1.T,T1,T2;
identifier find1.E;
statement find1.S;
expression x1,x2,x3;
expression find1.test;
int ret != 0;
@@

  T E;
  ...
(
* E = pci_get_slot(...);
  if (E == NULL) S
|
* if ((E = pci_get_slot(...)) == NULL)
  S
)
  ... when != pci_dev_put(...,(T1)E,...)
      when != if (E != NULL) { ... pci_dev_put(...,(T1)E,...); ...}
      when != x1 = (T1)E
      when != E = x3;
      when any
  if (test) {
    ... when != pci_dev_put(...,(T2)E,...)
        when != if (E != NULL) { ... pci_dev_put(...,(T2)E,...); ...}
        when != x2 = (T2)E
(
*   return;
|
*   return ret;
)
  }
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pcmcia/cardbus.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c
index a1bd763..714baae 100644
--- a/drivers/pcmcia/cardbus.c
+++ b/drivers/pcmcia/cardbus.c
@@ -143,7 +143,7 @@ int read_cb_mem(struct pcmcia_socket * s, int space, u_int addr, u_int len, void
 	/* Config space? */
 	if (space == 0) {
 		if (addr + len > 0x100)
-			goto fail;
+			goto failput;
 		for (; len; addr++, ptr++, len--)
 			pci_read_config_byte(dev, addr, ptr);
 		return 0;
@@ -171,6 +171,8 @@ int read_cb_mem(struct pcmcia_socket * s, int space, u_int addr, u_int len, void
 	memcpy_fromio(ptr, s->cb_cis_virt + addr, len);
 	return 0;
 
+failput:
+	pci_dev_put(dev);
 fail:
 	memset(ptr, 0xff, len);
 	return -1;
-- 
cgit v1.1


From c3e4642be734ce3d2c7398246d8cbced3a039f54 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <Yinghai.Lu@Sun.COM>
Date: Mon, 4 Feb 2008 22:27:46 -0800
Subject: serial: keep the DTR setting for serial console.

with reverting "x86, serial: convert legacy COM ports to platform devices",
we will have the serial console before the port is probled again.

uart_add_one_port==>uart_configure_port==>set_mcttrl(port, 0) will clear
the DTR setting by uart_set_options().  then I will lose my output from
serial console again.

So try to keep DTR in uart_configure_port()

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Andi Kleen <ak@suse.de>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 3bb5d24..0cf382b 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -2150,10 +2150,11 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
 
 		/*
 		 * Ensure that the modem control lines are de-activated.
+		 * keep the DTR setting that is set in uart_set_options()
 		 * We probably don't need a spinlock around this, but
 		 */
 		spin_lock_irqsave(&port->lock, flags);
-		port->ops->set_mctrl(port, 0);
+		port->ops->set_mctrl(port, port->mctrl & TIOCM_DTR);
 		spin_unlock_irqrestore(&port->lock, flags);
 
 		/*
-- 
cgit v1.1


From 1452750afc923b838a76e23150d5f1b4fc718b11 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Mon, 4 Feb 2008 22:27:48 -0800
Subject: drivers/serial/s3c2410.c: remove dead config symbols

Remove dead config symbol.

Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Dooks <ben@fluff.org>
Cc: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/s3c2410.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index e773c8e..45de193 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -1527,7 +1527,7 @@ static inline void s3c2440_serial_exit(void)
 #define s3c2440_uart_inf_at NULL
 #endif /* CONFIG_CPU_S3C2440 */
 
-#if defined(CONFIG_CPU_S3C2412) || defined(CONFIG_CPU_S3C2413)
+#if defined(CONFIG_CPU_S3C2412)
 
 static int s3c2412_serial_setsource(struct uart_port *port,
 				     struct s3c24xx_uart_clksrc *clk)
-- 
cgit v1.1


From 02c9b5cf9acd8a85313b892dc5196ccf133d4884 Mon Sep 17 00:00:00 2001
From: "Krauth.Julien" <Krauth.Julien@addi-data.com>
Date: Mon, 4 Feb 2008 22:27:49 -0800
Subject: serial: add ADDI-DATA GmbH Communication cardsin8250_pci.c and
 pci_ids.h

Add ADDI-DATA GmbH communication cards to 8250_pci driver.  Supported cards
are:

APCI-7300, APCI-7420, APCI-7500, APCI-7800 APCI-7300-2, APCI-7420-2,
APCI-7500-2 APCI-7300-3, APCI-7420-3, APCI-7500-3, APCI-7800-3

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Krauth J. <krauth.julien@addi-data.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250_pci.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h   |  17 ++++++
 2 files changed, 150 insertions(+)

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index ceb03c9..0a4ac2b 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -106,6 +106,32 @@ setup_port(struct serial_private *priv, struct uart_port *port,
 }
 
 /*
+ * ADDI-DATA GmbH communication cards <info@addi-data.com>
+ */
+static int addidata_apci7800_setup(struct serial_private *priv,
+				struct pciserial_board *board,
+				struct uart_port *port, int idx)
+{
+	unsigned int bar = 0, offset = board->first_offset;
+	bar = FL_GET_BASE(board->flags);
+
+	if (idx < 2) {
+		offset += idx * board->uart_offset;
+	} else if ((idx >= 2) && (idx < 4)) {
+		bar += 1;
+		offset += ((idx - 2) * board->uart_offset);
+	} else if ((idx >= 4) && (idx < 6)) {
+		bar += 2;
+		offset += ((idx - 4) * board->uart_offset);
+	} else if (idx >= 6) {
+		bar += 3;
+		offset += ((idx - 6) * board->uart_offset);
+	}
+
+	return setup_port(priv, port, bar, offset, board->reg_shift);
+}
+
+/*
  * AFAVLAB uses a different mixture of BARs and offsets
  * Not that ugly ;) -- HW
  */
@@ -752,6 +778,16 @@ pci_default_setup(struct serial_private *priv, struct pciserial_board *board,
  */
 static struct pci_serial_quirk pci_serial_quirks[] = {
 	/*
+	* ADDI-DATA GmbH communication cards <info@addi-data.com>
+	*/
+	{
+		.vendor         = PCI_VENDOR_ID_ADDIDATA_OLD,
+		.device         = PCI_DEVICE_ID_ADDIDATA_APCI7800,
+		.subvendor      = PCI_ANY_ID,
+		.subdevice      = PCI_ANY_ID,
+		.setup          = addidata_apci7800_setup,
+	},
+	/*
 	 * AFAVLAB cards - these may be called via parport_serial
 	 *  It is not clear whether this applies to all products.
 	 */
@@ -1179,6 +1215,12 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.base_baud	= 115200,
 		.uart_offset	= 8,
 	},
+	[pbn_b0_8_115200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 8,
+		.base_baud	= 115200,
+		.uart_offset	= 8,
+	},
 
 	[pbn_b0_1_921600] = {
 		.flags		= FL_BASE0,
@@ -2697,6 +2739,97 @@ static struct pci_device_id serial_pci_tbl[] = {
 		pbn_pasemi_1682M },
 
 	/*
+	* ADDI-DATA GmbH communication cards <info@addi-data.com>
+	*/
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7500,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_4_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7420,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_2_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7300,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_1_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA_OLD,
+		PCI_DEVICE_ID_ADDIDATA_APCI7800,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b1_8_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7500_2,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_4_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7420_2,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_2_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7300_2,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_1_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7500_3,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_4_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7420_3,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_2_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7300_3,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_1_115200 },
+
+	{	PCI_VENDOR_ID_ADDIDATA,
+		PCI_DEVICE_ID_ADDIDATA_APCI7800_3,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
+		0,
+		0,
+		pbn_b0_8_115200 },
+
+	/*
 	 * These entries match devices with class COMMUNICATION_SERIAL,
 	 * COMMUNICATION_MODEM or COMMUNICATION_MULTISERIAL
 	 */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 41f6f28..39d3283 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2043,6 +2043,23 @@
 #define PCI_VENDOR_ID_QUICKNET		0x15e2
 #define PCI_DEVICE_ID_QUICKNET_XJ	0x0500
 
+/*
+ * ADDI-DATA GmbH communication cards <info@addi-data.com>
+ */
+#define PCI_VENDOR_ID_ADDIDATA_OLD             0x10E8
+#define PCI_VENDOR_ID_ADDIDATA                 0x15B8
+#define PCI_DEVICE_ID_ADDIDATA_APCI7500        0x7000
+#define PCI_DEVICE_ID_ADDIDATA_APCI7420        0x7001
+#define PCI_DEVICE_ID_ADDIDATA_APCI7300        0x7002
+#define PCI_DEVICE_ID_ADDIDATA_APCI7800        0x818E
+#define PCI_DEVICE_ID_ADDIDATA_APCI7500_2      0x7009
+#define PCI_DEVICE_ID_ADDIDATA_APCI7420_2      0x700A
+#define PCI_DEVICE_ID_ADDIDATA_APCI7300_2      0x700B
+#define PCI_DEVICE_ID_ADDIDATA_APCI7500_3      0x700C
+#define PCI_DEVICE_ID_ADDIDATA_APCI7420_3      0x700D
+#define PCI_DEVICE_ID_ADDIDATA_APCI7300_3      0x700E
+#define PCI_DEVICE_ID_ADDIDATA_APCI7800_3      0x700F
+
 #define PCI_VENDOR_ID_PDC		0x15e9
 
 #define PCI_VENDOR_ID_FARSITE           0x1619
-- 
cgit v1.1


From 74a197417240120d638d67d74f48655fb7f46f16 Mon Sep 17 00:00:00 2001
From: Will Newton <will.newton@gmail.com>
Date: Mon, 4 Feb 2008 22:27:50 -0800
Subject: 8250.c: support specifying DW APB UARTs in device platform_data

Allow the private_data field to be specified in platform_data for the
standard 8250/16550 UART.  This field is used by DW APB type UARTs and
without this patch it's only possible to set this field when registering
the port by hand.  If private_data is not set then the driver will
potentially oops with a NULL pointer dereference.

Signed-off-by: Will Newton <will.newton@gmail.com>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c       | 40 +++++++++++++++++++++-------------------
 include/linux/serial_8250.h |  1 +
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index f94109c..c93ef205 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2662,16 +2662,17 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 	memset(&port, 0, sizeof(struct uart_port));
 
 	for (i = 0; p && p->flags != 0; p++, i++) {
-		port.iobase	= p->iobase;
-		port.membase	= p->membase;
-		port.irq	= p->irq;
-		port.uartclk	= p->uartclk;
-		port.regshift	= p->regshift;
-		port.iotype	= p->iotype;
-		port.flags	= p->flags;
-		port.mapbase	= p->mapbase;
-		port.hub6	= p->hub6;
-		port.dev	= &dev->dev;
+		port.iobase		= p->iobase;
+		port.membase		= p->membase;
+		port.irq		= p->irq;
+		port.uartclk		= p->uartclk;
+		port.regshift		= p->regshift;
+		port.iotype		= p->iotype;
+		port.flags		= p->flags;
+		port.mapbase		= p->mapbase;
+		port.hub6		= p->hub6;
+		port.private_data	= p->private_data;
+		port.dev		= &dev->dev;
 		if (share_irqs)
 			port.flags |= UPF_SHARE_IRQ;
 		ret = serial8250_register_port(&port);
@@ -2812,15 +2813,16 @@ int serial8250_register_port(struct uart_port *port)
 	if (uart) {
 		uart_remove_one_port(&serial8250_reg, &uart->port);
 
-		uart->port.iobase   = port->iobase;
-		uart->port.membase  = port->membase;
-		uart->port.irq      = port->irq;
-		uart->port.uartclk  = port->uartclk;
-		uart->port.fifosize = port->fifosize;
-		uart->port.regshift = port->regshift;
-		uart->port.iotype   = port->iotype;
-		uart->port.flags    = port->flags | UPF_BOOT_AUTOCONF;
-		uart->port.mapbase  = port->mapbase;
+		uart->port.iobase       = port->iobase;
+		uart->port.membase      = port->membase;
+		uart->port.irq          = port->irq;
+		uart->port.uartclk      = port->uartclk;
+		uart->port.fifosize     = port->fifosize;
+		uart->port.regshift     = port->regshift;
+		uart->port.iotype       = port->iotype;
+		uart->port.flags        = port->flags | UPF_BOOT_AUTOCONF;
+		uart->port.mapbase      = port->mapbase;
+		uart->port.private_data = port->private_data;
 		if (port->dev)
 			uart->port.dev = port->dev;
 
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index afe0f6d..00b65c0 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -23,6 +23,7 @@ struct plat_serial8250_port {
 	resource_size_t	mapbase;	/* resource base */
 	unsigned int	irq;		/* interrupt number */
 	unsigned int	uartclk;	/* UART clock rate */
+	void            *private_data;
 	unsigned char	regshift;	/* register shift */
 	unsigned char	iotype;		/* UPIO_* */
 	unsigned char	hub6;
-- 
cgit v1.1


From 9d778a69370cc1b643b13648df971c83ff5654ef Mon Sep 17 00:00:00 2001
From: Russell King <rmk+lkml@arm.linux.org.uk>
Date: Mon, 4 Feb 2008 22:27:51 -0800
Subject: serial: avoid waking up closed serial ports on resume

When we boot, serial ports remain in low power mode until they're used either
by userspace or for the kernel console.

However, if you suspend the system, and then resume, all serial ports will be
taken out of low power mode.  This is bad news for embedded devices where this
can mean higher power consumption.

Only bring a serial port out of low power mode if the port is being used as
the kernel console, or is in use by userspace.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 0cf382b..2554d2f 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -2029,8 +2029,6 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 	}
 	port->suspended = 0;
 
-	uart_change_pm(state, 0);
-
 	/*
 	 * Re-enable the console device after suspending.
 	 */
@@ -2049,6 +2047,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		if (state->info && state->info->tty && termios.c_cflag == 0)
 			termios = *state->info->tty->termios;
 
+		uart_change_pm(state, 0);
 		port->ops->set_termios(port, &termios, NULL);
 		console_start(port->cons);
 	}
@@ -2057,6 +2056,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		const struct uart_ops *ops = port->ops;
 		int ret;
 
+		uart_change_pm(state, 0);
 		ops->set_mctrl(port, 0);
 		ret = ops->startup(port);
 		if (ret == 0) {
-- 
cgit v1.1


From c8c6bfa39d6bd7347f43937c8767ae145b61bcb4 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+lkml@arm.linux.org.uk>
Date: Mon, 4 Feb 2008 22:27:52 -0800
Subject: serial: avoid stalling suspend if serial port won't drain

Some ports seem to be unable to drain their transmitters on shut down.  Such a
problem can occur if the port is programmed for hardware imposed flow control,
characters are in the FIFO but the CTS signal is inactive.

Normally, this isn't a problem because most places where we wait for the
transmitter to drain have a time-out.  However, there is no timeout in the
suspend path.

Give a port 30ms to drain; this is an arbitary value chosen to avoid long
delays if there are many such ports in the system, while giving a reasonable
chance for a single port to drain.  Should a port not drain within this
timeout, issue a warning.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 2554d2f..304fe32 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1977,6 +1977,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 
 	if (state->info && state->info->flags & UIF_INITIALIZED) {
 		const struct uart_ops *ops = port->ops;
+		int tries;
 
 		state->info->flags = (state->info->flags & ~UIF_INITIALIZED)
 				     | UIF_SUSPENDED;
@@ -1990,9 +1991,14 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 		/*
 		 * Wait for the transmitter to empty.
 		 */
-		while (!ops->tx_empty(port)) {
+		for (tries = 3; !ops->tx_empty(port) && tries; tries--) {
 			msleep(10);
 		}
+		if (!tries)
+			printk(KERN_ERR "%s%s%s%d: Unable to drain transmitter\n",
+			       port->dev ? port->dev->bus_id : "",
+			       port->dev ? ": " : "",
+			       drv->dev_name, port->line);
 
 		ops->shutdown(port);
 	}
-- 
cgit v1.1


From 6d4d67beb963de8865499781b8523e5b683819c3 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 4 Feb 2008 22:27:53 -0800
Subject: serial: speed setup failure reporting

Invalid speeds are forced to 9600. Update the code for this to encode new
style baud rates properly.

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/serial_core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 304fe32..276da14 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -371,7 +371,8 @@ uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
 		 */
 		termios->c_cflag &= ~CBAUD;
 		if (old) {
-			termios->c_cflag |= old->c_cflag & CBAUD;
+			baud = tty_termios_baud_rate(old);
+			tty_termios_encode_baud_rate(termios, baud, baud);
 			old = NULL;
 			continue;
 		}
@@ -380,7 +381,7 @@ uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
 		 * As a last resort, if the quotient is zero,
 		 * default to 9600 bps
 		 */
-		termios->c_cflag |= B9600;
+		tty_termios_encode_baud_rate(termios, 9600, 9600);
 	}
 
 	return 0;
-- 
cgit v1.1


From 3e8d4e2075e049664294722b436edfc5ced6ca53 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 4 Feb 2008 22:27:53 -0800
Subject: serial: Coding style

Coding style tweaks and printk levels.

Signed-off-by: Alan Cox <alan@redhat.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c     |  2 +-
 drivers/serial/8250_pnp.c | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index c93ef205..b8a4bd9 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2047,7 +2047,7 @@ serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
 	 * Oxford Semi 952 rev B workaround
 	 */
 	if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0)
-		quot ++;
+		quot++;
 
 	if (up->capabilities & UART_CAP_FIFO && up->port.fifosize > 1) {
 		if (baud < 2400)
diff --git a/drivers/serial/8250_pnp.c b/drivers/serial/8250_pnp.c
index 1de098e..6f09cbd 100644
--- a/drivers/serial/8250_pnp.c
+++ b/drivers/serial/8250_pnp.c
@@ -414,8 +414,9 @@ static int __devinit check_resources(struct pnp_option *option)
  */
 static int __devinit serial_pnp_guess_board(struct pnp_dev *dev, int *flags)
 {
-	if (!(check_name(pnp_dev_name(dev)) || (dev->card && check_name(dev->card->name))))
-		return -ENODEV;
+	if (!(check_name(pnp_dev_name(dev)) ||
+		(dev->card && check_name(dev->card->name))))
+			return -ENODEV;
 
 	if (check_resources(dev->independent))
 		return 0;
@@ -452,8 +453,9 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id)
 		return -ENODEV;
 
 #ifdef SERIAL_DEBUG_PNP
-	printk("Setup PNP port: port %x, mem 0x%lx, irq %d, type %d\n",
-	       port.iobase, port.mapbase, port.irq, port.iotype);
+	printk(KERN_DEBUG
+		"Setup PNP port: port %x, mem 0x%lx, irq %d, type %d\n",
+		       port.iobase, port.mapbase, port.irq, port.iotype);
 #endif
 
 	port.flags |= UPF_SKIP_TEST | UPF_BOOT_AUTOCONF;
-- 
cgit v1.1


From 7bbdc3d51cf793dd81c38f794f4cb73df58d1527 Mon Sep 17 00:00:00 2001
From: "Mark A. Greer" <mgreer@mvista.com>
Date: Mon, 4 Feb 2008 22:27:54 -0800
Subject: serial: MPSC: set baudrate when BRG divider is set.

The clock to generate the desired baudrate with the MPSC is first divided
by the Baud Rate Generator (BRG) and then by the MPSC itself.  So, when the
BRG divider is changed, the MPSC divider must also be changed to generate
the correct baudrate.  During MPSC initialization, the BRG divider is
changed but the MPSC divider isn't changed until much later.  This results
in some printk's coming out garbled.  To fix that, set the MPSC divider at
the same time that the BRG divider is changed.

Signed-off-by: Mark A. Greer <mgreer@mvista.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/mpsc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 4d643c9..cb3a919 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -612,6 +612,7 @@ static void mpsc_hw_init(struct mpsc_port_info *pi)
 
 	/* No preamble, 16x divider, low-latency, */
 	writel(0x04400400, pi->mpsc_base + MPSC_MMCRH);
+	mpsc_set_baudrate(pi, pi->default_baud);
 
 	if (pi->mirror_regs) {
 		pi->MPSC_CHR_1_m = 0;
-- 
cgit v1.1


From 6b7b651055221127304a4e373ee9b762398d54d7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:55 -0800
Subject: iommu sg merging: add device_dma_parameters structure

IOMMUs merges scatter/gather segments without considering a low level
driver's restrictions. The problem is that IOMMUs can't access to the
limitations because they are in request_queue.

This patchset introduces a new structure, device_dma_parameters,
including dma information. A pointer to device_dma_parameters is added
to struct device. The bus specific structures (like pci_dev) includes
device_dma_parameters. Low level drivers can use dma_set_max_seg_size
to tell IOMMUs about the restrictions.

We can move more dma stuff in struct device (like dma_mask) to struct
device_dma_parameters later (needs some cleanups before that).

This includes patches for all the IOMMUs that could merge sg (x86_64,
ppc, IA64, alpha, sparc64, and parisc) though only the ppc patch was
tested. The patches for other IOMMUs are only compile tested.

This patch:

Add a new structure, device_dma_parameters, including dma information.  A
pointer to device_dma_parameters is added to struct device.

- there are only max_segment_size and segment_boundary_mask there but we'll
  move more dma stuff in struct device (like dma_mask) to struct
  device_dma_parameters later.  segment_boundary_mask is not supported yet.

- new accessors for the dma parameters are added.  So we can easily change
  where to place struct device_dma_parameters in the future.

- dma_get_max_seg_size returns 64K if dma_parms in struct device isn't set
  up properly.  64K is the default max_segment_size in the block layer.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/device.h      | 11 +++++++++++
 include/linux/dma-mapping.h | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/device.h b/include/linux/device.h
index 479c0b3..2258d89 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -410,6 +410,15 @@ extern int devres_release_group(struct device *dev, void *id);
 extern void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp);
 extern void devm_kfree(struct device *dev, void *p);
 
+struct device_dma_parameters {
+	/*
+	 * a low level driver may set these to teach IOMMU code about
+	 * sg limitations.
+	 */
+	unsigned int max_segment_size;
+	unsigned long segment_boundary_mask;
+};
+
 struct device {
 	struct klist		klist_children;
 	struct klist_node	knode_parent;	/* node in sibling list */
@@ -445,6 +454,8 @@ struct device {
 					     64 bit addresses for consistent
 					     allocations such descriptors. */
 
+	struct device_dma_parameters *dma_parms;
+
 	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
 
 	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4470950..df3a361 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -60,6 +60,21 @@ static inline int is_device_dma_capable(struct device *dev)
 
 extern u64 dma_get_required_mask(struct device *dev);
 
+static inline unsigned int dma_get_max_seg_size(struct device *dev)
+{
+	return dev->dma_parms ? dev->dma_parms->max_segment_size : 65536;
+}
+
+static inline unsigned int dma_set_max_seg_size(struct device *dev,
+						unsigned int size)
+{
+	if (dev->dma_parms) {
+		dev->dma_parms->max_segment_size = size;
+		return 0;
+	} else
+		return -EIO;
+}
+
 /* flags for the coherent memory api */
 #define	DMA_MEMORY_MAP			0x01
 #define DMA_MEMORY_IO			0x02
-- 
cgit v1.1


From 4d57cdfacaa1c207bf4c071f89835e0368766a50 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:55 -0800
Subject: iommu sg merging: PCI: add device_dma_parameters support

This adds struct device_dma_parameters in struct pci_dev and properly
sets up a pointer in struct device.

The default max_segment_size is set to 64K, same to the block layer's
default value.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Mostly-acked-by: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/pci.c   | 8 ++++++++
 drivers/pci/probe.c | 3 +++
 include/linux/pci.h | 9 +++++++++
 3 files changed, 20 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 04aac77..be97090 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1451,6 +1451,14 @@ pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask)
 }
 #endif
 
+#ifndef HAVE_ARCH_PCI_SET_DMA_MAX_SEGMENT_SIZE
+int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size)
+{
+	return dma_set_max_seg_size(&dev->dev, size);
+}
+EXPORT_SYMBOL(pci_set_dma_max_seg_size);
+#endif
+
 /**
  * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count
  * @dev: PCI device to query
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 7f5dab3..f47d596 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -933,8 +933,11 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 
 	set_dev_node(&dev->dev, pcibus_to_node(bus));
 	dev->dev.dma_mask = &dev->dma_mask;
+	dev->dev.dma_parms = &dev->dma_parms;
 	dev->dev.coherent_dma_mask = 0xffffffffull;
 
+	pci_set_dma_max_seg_size(dev, 65536);
+
 	/* Fix up broken headers */
 	pci_fixup_device(pci_fixup_header, dev);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cee75c0..ba3a7f4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -159,6 +159,8 @@ struct pci_dev {
 					   this if your device has broken DMA
 					   or supports 64-bit transfers.  */
 
+	struct device_dma_parameters dma_parms;
+
 	pci_power_t     current_state;  /* Current operating state. In ACPI-speak,
 					   this is D0-D3, D0 being fully functional,
 					   and D3 being off. */
@@ -580,6 +582,7 @@ void pci_intx(struct pci_dev *dev, int enable);
 void pci_msi_off(struct pci_dev *dev);
 int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
+int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size);
 int pcix_get_max_mmrbc(struct pci_dev *dev);
 int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
@@ -822,6 +825,12 @@ static inline int pci_set_dma_mask(struct pci_dev *dev, u64 mask)
 	return -EIO;
 }
 
+static inline int pci_set_dma_max_seg_size(struct pci_dev *dev,
+					unsigned int size)
+{
+	return -EIO;
+}
+
 static inline int pci_assign_resource(struct pci_dev *dev, int i)
 {
 	return -EBUSY;
-- 
cgit v1.1


From 42d00284e16bf998f8f93ce5d061df81b0ff283e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:56 -0800
Subject: iommu sg merging: x86: make pci-gart iommu respect the segment size
 limits

This patch makes pci-gart iommu respect segment size limits when
merging sg lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/pci-gart_64.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 845cbec..5ee700f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -416,6 +416,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 	struct scatterlist *s, *ps, *start_sg, *sgmap;
 	int need = 0, nextneed, i, out, start;
 	unsigned long pages = 0;
+	unsigned int seg_size;
+	unsigned int max_seg_size;
 
 	if (nents == 0)
 		return 0;
@@ -426,6 +428,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 	out = 0;
 	start = 0;
 	start_sg = sgmap = sg;
+	seg_size = 0;
+	max_seg_size = dma_get_max_seg_size(dev);
 	ps = NULL; /* shut up gcc */
 	for_each_sg(sg, s, nents, i) {
 		dma_addr_t addr = sg_phys(s);
@@ -443,11 +447,13 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 			 * offset.
 			 */
 			if (!iommu_merge || !nextneed || !need || s->offset ||
+			    (s->length + seg_size > max_seg_size) ||
 			    (ps->offset + ps->length) % PAGE_SIZE) {
 				if (dma_map_cont(start_sg, i - start, sgmap,
 						  pages, need) < 0)
 					goto error;
 				out++;
+				seg_size = 0;
 				sgmap = sg_next(sgmap);
 				pages = 0;
 				start = i;
@@ -455,6 +461,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 			}
 		}
 
+		seg_size += s->length;
 		need = nextneed;
 		pages += to_pages(s->offset, s->length);
 		ps = s;
-- 
cgit v1.1


From 740c3ce66700640a6e6136ff679b067e92125794 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:57 -0800
Subject: iommu sg merging: ppc: make iommu respect the segment size limits

This patch makes iommu respect segment size limits when merging sg
lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/dma_64.c | 2 +-
 arch/powerpc/kernel/iommu.c  | 8 ++++++--
 include/asm-powerpc/iommu.h  | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c
index 8423907..f9de2fd 100644
--- a/arch/powerpc/kernel/dma_64.c
+++ b/arch/powerpc/kernel/dma_64.c
@@ -68,7 +68,7 @@ static void dma_iommu_unmap_single(struct device *dev, dma_addr_t dma_handle,
 static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 			    int nelems, enum dma_data_direction direction)
 {
-	return iommu_map_sg(dev->archdata.dma_data, sglist, nelems,
+	return iommu_map_sg(dev, sglist, nelems,
 			    device_to_mask(dev), direction);
 }
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a3c406a..d0e6fac 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -270,16 +270,18 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
-int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 		 int nelems, unsigned long mask,
 		 enum dma_data_direction direction)
 {
+	struct iommu_table *tbl = dev->archdata.dma_data;
 	dma_addr_t dma_next = 0, dma_addr;
 	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
 	int outcount, incount, i;
 	unsigned int align;
 	unsigned long handle;
+	unsigned int max_seg_size;
 
 	BUG_ON(direction == DMA_NONE);
 
@@ -298,6 +300,7 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
+	max_seg_size = dma_get_max_seg_size(dev);
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long vaddr, npages, entry, slen;
 
@@ -344,7 +347,8 @@ int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 			/* We cannot merge if:
 			 * - allocated dma_addr isn't contiguous to previous allocation
 			 */
-			if (novmerge || (dma_addr != dma_next)) {
+			if (novmerge || (dma_addr != dma_next) ||
+			    (outs->dma_length + s->length > max_seg_size)) {
 				/* Can't merge: create a new segment */
 				segstart = s;
 				outcount++;
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index 7a3cef7..a07a67c 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -79,7 +79,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
 
-extern int iommu_map_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+extern int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 			int nelems, unsigned long mask,
 			enum dma_data_direction direction);
 extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
-- 
cgit v1.1


From a031bbcb8d7559d61f383880f23dd0e047247410 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:58 -0800
Subject: iommu sg merging: IA64: make sba_iommu respect the segment size
 limits

This patch makes sba iommu respect segment size limits when merging sg
lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/hp/common/sba_iommu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 45bf04e..c412fe6 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1265,7 +1265,7 @@ sba_fill_pdir(
  * the sglist do both.
  */
 static SBA_INLINE int
-sba_coalesce_chunks( struct ioc *ioc,
+sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
 	struct scatterlist *startsg,
 	int nents)
 {
@@ -1275,6 +1275,7 @@ sba_coalesce_chunks( struct ioc *ioc,
 	struct scatterlist *dma_sg;        /* next DMA stream head */
 	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
 	int n_mappings = 0;
+	unsigned int max_seg_size = dma_get_max_seg_size(dev);
 
 	while (nents > 0) {
 		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
@@ -1314,6 +1315,9 @@ sba_coalesce_chunks( struct ioc *ioc,
 			    > DMA_CHUNK_SIZE)
 				break;
 
+			if (dma_len + startsg->length > max_seg_size)
+				break;
+
 			/*
 			** Then look for virtually contiguous blocks.
 			**
@@ -1441,7 +1445,7 @@ int sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, int di
 	** w/o this association, we wouldn't have coherent DMA!
 	** Access to the virtual address is what forces a two pass algorithm.
 	*/
-	coalesced = sba_coalesce_chunks(ioc, sglist, nents);
+	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
 
 	/*
 	** Program the I/O Pdir
-- 
cgit v1.1


From 7c53664dcd5df7349edb56f04c743bf66510a6f1 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:27:58 -0800
Subject: iommu sg merging: alpha: make pci_iommu respect the segment size
 limits

This patch makes pci_iommu respect segment size limits when merging sg
lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/pci_iommu.c | 24 ++++++++++++++++++------
 include/asm-alpha/pci.h       |  1 +
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 2d00a08..26d3789 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -9,6 +9,7 @@
 #include <linux/bootmem.h>
 #include <linux/scatterlist.h>
 #include <linux/log2.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/io.h>
 #include <asm/hwrpb.h>
@@ -470,22 +471,29 @@ EXPORT_SYMBOL(pci_free_consistent);
 #define SG_ENT_PHYS_ADDRESS(SG) __pa(SG_ENT_VIRT_ADDRESS(SG))
 
 static void
-sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok)
+sg_classify(struct device *dev, struct scatterlist *sg, struct scatterlist *end,
+	    int virt_ok)
 {
 	unsigned long next_paddr;
 	struct scatterlist *leader;
 	long leader_flag, leader_length;
+	unsigned int max_seg_size;
 
 	leader = sg;
 	leader_flag = 0;
 	leader_length = leader->length;
 	next_paddr = SG_ENT_PHYS_ADDRESS(leader) + leader_length;
 
+	/* we will not marge sg without device. */
+	max_seg_size = dev ? dma_get_max_seg_size(dev) : 0;
 	for (++sg; sg < end; ++sg) {
 		unsigned long addr, len;
 		addr = SG_ENT_PHYS_ADDRESS(sg);
 		len = sg->length;
 
+		if (leader_length + len > max_seg_size)
+			goto new_segment;
+
 		if (next_paddr == addr) {
 			sg->dma_address = -1;
 			leader_length += len;
@@ -494,6 +502,7 @@ sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok)
 			leader_flag = 1;
 			leader_length += len;
 		} else {
+new_segment:
 			leader->dma_address = leader_flag;
 			leader->dma_length = leader_length;
 			leader = sg;
@@ -512,7 +521,7 @@ sg_classify(struct scatterlist *sg, struct scatterlist *end, int virt_ok)
    in the blanks.  */
 
 static int
-sg_fill(struct scatterlist *leader, struct scatterlist *end,
+sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
 	struct scatterlist *out, struct pci_iommu_arena *arena,
 	dma_addr_t max_dma, int dac_allowed)
 {
@@ -562,8 +571,8 @@ sg_fill(struct scatterlist *leader, struct scatterlist *end,
 
 		/* Otherwise, break up the remaining virtually contiguous
 		   hunks into individual direct maps and retry.  */
-		sg_classify(leader, end, 0);
-		return sg_fill(leader, end, out, arena, max_dma, dac_allowed);
+		sg_classify(dev, leader, end, 0);
+		return sg_fill(dev, leader, end, out, arena, max_dma, dac_allowed);
 	}
 
 	out->dma_address = arena->dma_base + dma_ofs*PAGE_SIZE + paddr;
@@ -619,12 +628,15 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 	struct pci_iommu_arena *arena;
 	dma_addr_t max_dma;
 	int dac_allowed;
+	struct device *dev;
 
 	if (direction == PCI_DMA_NONE)
 		BUG();
 
 	dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
 
+	dev = pdev ? &pdev->dev : NULL;
+
 	/* Fast path single entry scatterlists.  */
 	if (nents == 1) {
 		sg->dma_length = sg->length;
@@ -638,7 +650,7 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 	end = sg + nents;
 
 	/* First, prepare information about the entries.  */
-	sg_classify(sg, end, alpha_mv.mv_pci_tbi != 0);
+	sg_classify(dev, sg, end, alpha_mv.mv_pci_tbi != 0);
 
 	/* Second, figure out where we're going to map things.  */
 	if (alpha_mv.mv_pci_tbi) {
@@ -658,7 +670,7 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
 	for (out = sg; sg < end; ++sg) {
 		if ((int) sg->dma_address < 0)
 			continue;
-		if (sg_fill(sg, end, out, arena, max_dma, dac_allowed) < 0)
+		if (sg_fill(dev, sg, end, out, arena, max_dma, dac_allowed) < 0)
 			goto error;
 		out++;
 	}
diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index 30ee766..d5b10ef 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
 #include <asm/scatterlist.h>
 #include <asm/machvec.h>
 
-- 
cgit v1.1


From fde6a3c82d67f592eb587be4d12222b0ae6d4321 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:02 -0800
Subject: iommu sg merging: sparc64: make iommu respect the segment size limits

This patch makes iommu respect segment size limits when merging sg
lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sparc64/kernel/iommu.c        | 2 +-
 arch/sparc64/kernel/iommu_common.c | 8 ++++++--
 arch/sparc64/kernel/iommu_common.h | 3 ++-
 arch/sparc64/kernel/pci_sun4v.c    | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/sparc64/kernel/iommu.c b/arch/sparc64/kernel/iommu.c
index 070a484..4b9115a 100644
--- a/arch/sparc64/kernel/iommu.c
+++ b/arch/sparc64/kernel/iommu.c
@@ -580,7 +580,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	/* Step 1: Prepare scatter list. */
 
-	npages = prepare_sg(sglist, nelems);
+	npages = prepare_sg(dev, sglist, nelems);
 
 	/* Step 2: Allocate a cluster and context, if necessary. */
 
diff --git a/arch/sparc64/kernel/iommu_common.c b/arch/sparc64/kernel/iommu_common.c
index efd5dff..72a4acf 100644
--- a/arch/sparc64/kernel/iommu_common.c
+++ b/arch/sparc64/kernel/iommu_common.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1999 David S. Miller (davem@redhat.com)
  */
 
+#include <linux/dma-mapping.h>
 #include "iommu_common.h"
 
 /* You are _strongly_ advised to enable the following debugging code
@@ -201,21 +202,24 @@ void verify_sglist(struct scatterlist *sglist, int nents, iopte_t *iopte, int np
 }
 #endif
 
-unsigned long prepare_sg(struct scatterlist *sg, int nents)
+unsigned long prepare_sg(struct device *dev, struct scatterlist *sg, int nents)
 {
 	struct scatterlist *dma_sg = sg;
 	unsigned long prev;
 	u32 dent_addr, dent_len;
+	unsigned int max_seg_size;
 
 	prev  = (unsigned long) sg_virt(sg);
 	prev += (unsigned long) (dent_len = sg->length);
 	dent_addr = (u32) ((unsigned long)(sg_virt(sg)) & (IO_PAGE_SIZE - 1UL));
+	max_seg_size = dma_get_max_seg_size(dev);
 	while (--nents) {
 		unsigned long addr;
 
 		sg = sg_next(sg);
 		addr = (unsigned long) sg_virt(sg);
-		if (! VCONTIG(prev, addr)) {
+		if (! VCONTIG(prev, addr) ||
+			dent_len + sg->length > max_seg_size) {
 			dma_sg->dma_address = dent_addr;
 			dma_sg->dma_length = dent_len;
 			dma_sg = sg_next(dma_sg);
diff --git a/arch/sparc64/kernel/iommu_common.h b/arch/sparc64/kernel/iommu_common.h
index 75b5a58..a90d046e 100644
--- a/arch/sparc64/kernel/iommu_common.h
+++ b/arch/sparc64/kernel/iommu_common.h
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/scatterlist.h>
+#include <linux/device.h>
 
 #include <asm/iommu.h>
 #include <asm/scatterlist.h>
@@ -46,4 +47,4 @@ extern void verify_sglist(struct scatterlist *sg, int nents, iopte_t *iopte, int
 #define VCONTIG(__X, __Y)	(((__X) == (__Y)) || \
 				 (((__X) | (__Y)) << (64UL - PAGE_SHIFT)) == 0UL)
 
-extern unsigned long prepare_sg(struct scatterlist *sg, int nents);
+extern unsigned long prepare_sg(struct device *dev, struct scatterlist *sg, int nents);
diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c
index 1aa8e04..67d6dce 100644
--- a/arch/sparc64/kernel/pci_sun4v.c
+++ b/arch/sparc64/kernel/pci_sun4v.c
@@ -490,7 +490,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
 		goto bad;
 
 	/* Step 1: Prepare scatter list. */
-	npages = prepare_sg(sglist, nelems);
+	npages = prepare_sg(dev, sglist, nelems);
 
 	/* Step 2: Allocate a cluster and context, if necessary. */
 	spin_lock_irqsave(&iommu->lock, flags);
-- 
cgit v1.1


From d1b5163206769aa93271bc1029e877ea9f920a5d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:03 -0800
Subject: iommu sg merging: parisc: make iommu respect the segment size limits

This patch makes iommu respect segment size limits when merging sg
lists.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <willy@debian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parisc/ccio-dma.c      | 2 +-
 drivers/parisc/iommu-helpers.h | 7 ++++++-
 drivers/parisc/sba_iommu.c     | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index ca52307..d08b284 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -941,7 +941,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	** w/o this association, we wouldn't have coherent DMA!
 	** Access to the virtual address is what forces a two pass algorithm.
 	*/
-	coalesced = iommu_coalesce_chunks(ioc, sglist, nents, ccio_alloc_range);
+	coalesced = iommu_coalesce_chunks(ioc, dev, sglist, nents, ccio_alloc_range);
 
 	/*
 	** Program the I/O Pdir
diff --git a/drivers/parisc/iommu-helpers.h b/drivers/parisc/iommu-helpers.h
index 0a1f99a..97ba828 100644
--- a/drivers/parisc/iommu-helpers.h
+++ b/drivers/parisc/iommu-helpers.h
@@ -95,12 +95,14 @@ iommu_fill_pdir(struct ioc *ioc, struct scatterlist *startsg, int nents,
 */
 
 static inline unsigned int
-iommu_coalesce_chunks(struct ioc *ioc, struct scatterlist *startsg, int nents,
+iommu_coalesce_chunks(struct ioc *ioc, struct device *dev,
+		      struct scatterlist *startsg, int nents,
 		      int (*iommu_alloc_range)(struct ioc *, size_t))
 {
 	struct scatterlist *contig_sg;	   /* contig chunk head */
 	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
 	unsigned int n_mappings = 0;
+	unsigned int max_seg_size = dma_get_max_seg_size(dev);
 
 	while (nents > 0) {
 
@@ -142,6 +144,9 @@ iommu_coalesce_chunks(struct ioc *ioc, struct scatterlist *startsg, int nents,
 					    IOVP_SIZE) > DMA_CHUNK_SIZE))
 				break;
 
+			if (startsg->length + dma_len > max_seg_size)
+				break;
+
 			/*
 			** Next see if we can append the next chunk (i.e.
 			** it must end on one page and begin on another
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index e527a0e..d06627c 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -946,7 +946,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	** w/o this association, we wouldn't have coherent DMA!
 	** Access to the virtual address is what forces a two pass algorithm.
 	*/
-	coalesced = iommu_coalesce_chunks(ioc, sglist, nents, sba_alloc_range);
+	coalesced = iommu_coalesce_chunks(ioc, dev, sglist, nents, sba_alloc_range);
 
 	/*
 	** Program the I/O Pdir
-- 
cgit v1.1


From 860ac568e825b623b0b335ca277dd47d1d7fd5d0 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:05 -0800
Subject: iommu sg merging: call blk_queue_segment_boundary in
 __scsi_alloc_queue

request_queue and device struct must have the same value of a segment
size limit. This patch adds blk_queue_segment_boundary in
__scsi_alloc_queue so LLDs don't need to call both
blk_queue_segment_boundary and set_dma_max_seg_size. A LLD can change
the default value (64KB) can call device_dma_parameters accessors like
pci_set_dma_max_seg_size when allocating scsi_host.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/scsi_lib.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b12fb31..68e424f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1569,6 +1569,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 					 request_fn_proc *request_fn)
 {
 	struct request_queue *q;
+	struct device *dev = shost->shost_gendev.parent;
 
 	q = blk_init_queue(request_fn, NULL);
 	if (!q)
@@ -1584,6 +1585,8 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 
+	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
+
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 
-- 
cgit v1.1


From b7d8629f8b4b250fda578e59ecddc77c6bdec2b6 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:05 -0800
Subject: iommu sg merging: sata_inic162x: use pci_set_dma_max_seg_size

This sets the segment size limit properly via pci_set_dma_max_seg_size
and remove blk_queue_max_segment_size because scsi-ml calls it.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/ata/sata_inic162x.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 96e614a..59e65ed 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -108,17 +108,6 @@ struct inic_port_priv {
 	u8	cached_pirq_mask;
 };
 
-static int inic_slave_config(struct scsi_device *sdev)
-{
-	/* This controller is braindamaged.  dma_boundary is 0xffff
-	 * like others but it will lock up the whole machine HARD if
-	 * 65536 byte PRD entry is fed.  Reduce maximum segment size.
-	 */
-	blk_queue_max_segment_size(sdev->request_queue, 65536 - 512);
-
-	return ata_scsi_slave_config(sdev);
-}
-
 static struct scsi_host_template inic_sht = {
 	.module			= THIS_MODULE,
 	.name			= DRV_NAME,
@@ -132,7 +121,7 @@ static struct scsi_host_template inic_sht = {
 	.use_clustering		= ATA_SHT_USE_CLUSTERING,
 	.proc_name		= DRV_NAME,
 	.dma_boundary		= ATA_DMA_BOUNDARY,
-	.slave_configure	= inic_slave_config,
+	.slave_configure	= ata_scsi_slave_config,
 	.slave_destroy		= ata_scsi_slave_destroy,
 	.bios_param		= ata_std_bios_param,
 };
@@ -730,6 +719,18 @@ static int inic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		return rc;
 	}
 
+	/*
+	 * This controller is braindamaged.  dma_boundary is 0xffff
+	 * like others but it will lock up the whole machine HARD if
+	 * 65536 byte PRD entry is fed. Reduce maximum segment size.
+	 */
+	rc = pci_set_dma_max_seg_size(pdev, 65536 - 512);
+	if (rc) {
+		dev_printk(KERN_ERR, &pdev->dev,
+			   "failed to set the maximum segment size.\n");
+		return rc;
+	}
+
 	rc = init_controller(iomap[MMIO_BAR], hpriv->cached_hctl);
 	if (rc) {
 		dev_printk(KERN_ERR, &pdev->dev,
-- 
cgit v1.1


From 0c95fdc59640824d7e0b017be295fb912ceef4ab Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:06 -0800
Subject: iommu sg merging: aacraid: use pci_set_dma_max_seg_size

This sets the segment size limit properly via pci_set_dma_max_seg_size
and remove blk_queue_max_segment_size because scsi-ml calls it.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: "Salyzyn, Mark" <mark_salyzyn@adaptec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/aacraid/linit.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 0e8267c..fb08861 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -449,9 +449,6 @@ static int aac_slave_configure(struct scsi_device *sdev)
 		else if (depth < 2)
 			depth = 2;
 		scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG, depth);
-		if (!(((struct aac_dev *)host->hostdata)->adapter_info.options &
-				AAC_OPT_NEW_COMM))
-			blk_queue_max_segment_size(sdev->request_queue, 65536);
 	} else
 		scsi_adjust_queue_depth(sdev, 0, 1);
 
@@ -1133,6 +1130,12 @@ static int __devinit aac_probe_one(struct pci_dev *pdev,
 	if (error < 0)
 		goto out_deinit;
 
+	if (!(aac->adapter_info.options & AAC_OPT_NEW_COMM)) {
+		error = pci_set_dma_max_seg_size(pdev, 65536);
+		if (error)
+			goto out_deinit;
+	}
+
 	/*
  	 * Lets override negotiations and drop the maximum SG limit to 34
  	 */
-- 
cgit v1.1


From 0291df8cc9dac09c303d21d5bcd2ad73762c836a Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:07 -0800
Subject: iommu sg: add IOMMU helper functions for the free area management

This adds IOMMU helper functions for the free area management.  These
functions take care of LLD's segment boundary limit for IOMMUs.  They would be
useful for IOMMUs that use bitmap for the free area management.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/iommu-helper.h |  7 ++++
 lib/Makefile                 |  1 +
 lib/iommu-helper.c           | 80 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100644 include/linux/iommu-helper.h
 create mode 100644 lib/iommu-helper.c

diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
new file mode 100644
index 0000000..4dd4c04
--- /dev/null
+++ b/include/linux/iommu-helper.h
@@ -0,0 +1,7 @@
+extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
+				      unsigned long start, unsigned int nr,
+				      unsigned long shift,
+				      unsigned long boundary_size,
+				      unsigned long align_mask);
+extern void iommu_area_free(unsigned long *map, unsigned long start,
+			    unsigned int nr);
diff --git a/lib/Makefile b/lib/Makefile
index 543f2502..a18062e4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_SMP) += pcounter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 
 lib-$(CONFIG_GENERIC_BUG) += bug.o
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
new file mode 100644
index 0000000..495575a
--- /dev/null
+++ b/lib/iommu-helper.c
@@ -0,0 +1,80 @@
+/*
+ * IOMMU helper functions for the free area management
+ */
+
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+static unsigned long find_next_zero_area(unsigned long *map,
+					 unsigned long size,
+					 unsigned long start,
+					 unsigned int nr,
+					 unsigned long align_mask)
+{
+	unsigned long index, end, i;
+again:
+	index = find_next_zero_bit(map, size, start);
+
+	/* Align allocation */
+	index = (index + align_mask) & ~align_mask;
+
+	end = index + nr;
+	if (end >= size)
+		return -1;
+	for (i = index; i < end; i++) {
+		if (test_bit(i, map)) {
+			start = i+1;
+			goto again;
+		}
+	}
+	return index;
+}
+
+static inline void set_bit_area(unsigned long *map, unsigned long i,
+				int len)
+{
+	unsigned long end = i + len;
+	while (i < end) {
+		__set_bit(i, map);
+		i++;
+	}
+}
+
+static inline int is_span_boundary(unsigned int index, unsigned int nr,
+				   unsigned long shift,
+				   unsigned long boundary_size)
+{
+	shift = (shift + index) & (boundary_size - 1);
+	return shift + nr > boundary_size;
+}
+
+unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
+			       unsigned long start, unsigned int nr,
+			       unsigned long shift, unsigned long boundary_size,
+			       unsigned long align_mask)
+{
+	unsigned long index;
+again:
+	index = find_next_zero_area(map, size, start, nr, align_mask);
+	if (index != -1) {
+		if (is_span_boundary(index, nr, shift, boundary_size)) {
+			/* we could do more effectively */
+			start = index + 1;
+			goto again;
+		}
+		set_bit_area(map, index, nr);
+	}
+	return index;
+}
+EXPORT_SYMBOL(iommu_area_alloc);
+
+void iommu_area_free(unsigned long *map, unsigned long start, unsigned int nr)
+{
+	unsigned long end = start + nr;
+
+	while (start < end) {
+		__clear_bit(start, map);
+		start++;
+	}
+}
+EXPORT_SYMBOL(iommu_area_free);
-- 
cgit v1.1


From fb3475e9b6bfa666107512fbd6006c26014f04b8 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:08 -0800
Subject: iommu sg: powerpc: convert iommu to use the IOMMU helper

This patch converts PPC's IOMMU to use the IOMMU helper functions.  The IOMMU
doesn't allocate a memory area spanning LLD's segment boundary anymore.

iseries_hv_alloc and iseries_hv_map don't have proper device
struct. 4GB boundary is used for them.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig                   |  3 ++
 arch/powerpc/kernel/dma_64.c           |  6 ++--
 arch/powerpc/kernel/iommu.c            | 64 ++++++++++++++++------------------
 arch/powerpc/platforms/iseries/iommu.c |  4 +--
 include/asm-powerpc/iommu.h            | 10 +++---
 5 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b94d450..cf030b0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -256,6 +256,9 @@ config IOMMU_VMERGE
 
 	  Most drivers don't have this problem; it is safe to say Y here.
 
+config IOMMU_HELPER
+	def_bool PPC64
+
 config HOTPLUG_CPU
 	bool "Support for enabling/disabling CPUs"
 	depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC)
diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c
index f9de2fd..3a317cb 100644
--- a/arch/powerpc/kernel/dma_64.c
+++ b/arch/powerpc/kernel/dma_64.c
@@ -31,8 +31,8 @@ static inline unsigned long device_to_mask(struct device *dev)
 static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
 				      dma_addr_t *dma_handle, gfp_t flag)
 {
-	return iommu_alloc_coherent(dev->archdata.dma_data, size, dma_handle,
-				    device_to_mask(dev), flag,
+	return iommu_alloc_coherent(dev, dev->archdata.dma_data, size,
+				    dma_handle, device_to_mask(dev), flag,
 				    dev->archdata.numa_node);
 }
 
@@ -52,7 +52,7 @@ static dma_addr_t dma_iommu_map_single(struct device *dev, void *vaddr,
 				       size_t size,
 				       enum dma_data_direction direction)
 {
-	return iommu_map_single(dev->archdata.dma_data, vaddr, size,
+	return iommu_map_single(dev, dev->archdata.dma_data, vaddr, size,
 			        device_to_mask(dev), direction);
 }
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d0e6fac..c42219c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -31,6 +31,7 @@
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/bitops.h>
+#include <linux/iommu-helper.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -81,17 +82,19 @@ static int __init setup_iommu(char *str)
 __setup("protect4gb=", setup_protect4gb);
 __setup("iommu=", setup_iommu);
 
-static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+static unsigned long iommu_range_alloc(struct device *dev,
+				       struct iommu_table *tbl,
                                        unsigned long npages,
                                        unsigned long *handle,
                                        unsigned long mask,
                                        unsigned int align_order)
 { 
-	unsigned long n, end, i, start;
+	unsigned long n, end, start;
 	unsigned long limit;
 	int largealloc = npages > 15;
 	int pass = 0;
 	unsigned long align_mask;
+	unsigned long boundary_size;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -136,14 +139,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
 			start &= mask;
 	}
 
-	n = find_next_zero_bit(tbl->it_map, limit, start);
-
-	/* Align allocation */
-	n = (n + align_mask) & ~align_mask;
-
-	end = n + npages;
+	if (dev)
+		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+				      1 << IOMMU_PAGE_SHIFT);
+	else
+		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
+	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
-	if (unlikely(end >= limit)) {
+	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
+			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
+			     align_mask);
+	if (n == -1) {
 		if (likely(pass < 2)) {
 			/* First failure, just rescan the half of the table.
 			 * Second failure, rescan the other half of the table.
@@ -158,14 +164,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
 		}
 	}
 
-	for (i = n; i < end; i++)
-		if (test_bit(i, tbl->it_map)) {
-			start = i+1;
-			goto again;
-		}
-
-	for (i = n; i < end; i++)
-		__set_bit(i, tbl->it_map);
+	end = n + npages;
 
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
@@ -184,16 +183,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
 	return n;
 }
 
-static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
-		       unsigned int npages, enum dma_data_direction direction,
-		       unsigned long mask, unsigned int align_order)
+static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+			      void *page, unsigned int npages,
+			      enum dma_data_direction direction,
+			      unsigned long mask, unsigned int align_order)
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
-	entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order);
+	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
 
 	if (unlikely(entry == DMA_ERROR_CODE)) {
 		spin_unlock_irqrestore(&(tbl->it_lock), flags);
@@ -224,7 +224,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			 unsigned int npages)
 {
 	unsigned long entry, free_entry;
-	unsigned long i;
 
 	entry = dma_addr >> IOMMU_PAGE_SHIFT;
 	free_entry = entry - tbl->it_offset;
@@ -246,9 +245,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	}
 
 	ppc_md.tce_free(tbl, entry, npages);
-	
-	for (i = 0; i < npages; i++)
-		__clear_bit(free_entry+i, tbl->it_map);
+	iommu_area_free(tbl->it_map, free_entry, npages);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -317,7 +314,7 @@ int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
 		    (vaddr & ~PAGE_MASK) == 0)
 			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
-		entry = iommu_range_alloc(tbl, npages, &handle,
+		entry = iommu_range_alloc(dev, tbl, npages, &handle,
 					  mask >> IOMMU_PAGE_SHIFT, align);
 
 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
@@ -574,9 +571,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
  * need not be page aligned, the dma_addr_t returned will point to the same
  * byte within the page as vaddr.
  */
-dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
-		size_t size, unsigned long mask,
-		enum dma_data_direction direction)
+dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl,
+			    void *vaddr, size_t size, unsigned long mask,
+			    enum dma_data_direction direction)
 {
 	dma_addr_t dma_handle = DMA_ERROR_CODE;
 	unsigned long uaddr;
@@ -593,7 +590,7 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
 			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
 
-		dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
+		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
 					 mask >> IOMMU_PAGE_SHIFT, align);
 		if (dma_handle == DMA_ERROR_CODE) {
 			if (printk_ratelimit())  {
@@ -625,8 +622,9 @@ void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
  * Returns the virtual address of the buffer and sets dma_handle
  * to the dma address (mapping) of the first page.
  */
-void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
-		dma_addr_t *dma_handle, unsigned long mask, gfp_t flag, int node)
+void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
+			   size_t size,	dma_addr_t *dma_handle,
+			   unsigned long mask, gfp_t flag, int node)
 {
 	void *ret = NULL;
 	dma_addr_t mapping;
@@ -660,7 +658,7 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
 	/* Set up tces to cover the allocated range */
 	nio_pages = size >> IOMMU_PAGE_SHIFT;
 	io_order = get_iommu_order(size);
-	mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
+	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
 			      mask >> IOMMU_PAGE_SHIFT, io_order);
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c
index 6a0c6f6..11fa3c7 100644
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -199,7 +199,7 @@ static struct iommu_table vio_iommu_table;
 
 void *iseries_hv_alloc(size_t size, dma_addr_t *dma_handle, gfp_t flag)
 {
-	return iommu_alloc_coherent(&vio_iommu_table, size, dma_handle,
+	return iommu_alloc_coherent(NULL, &vio_iommu_table, size, dma_handle,
 				DMA_32BIT_MASK, flag, -1);
 }
 EXPORT_SYMBOL_GPL(iseries_hv_alloc);
@@ -213,7 +213,7 @@ EXPORT_SYMBOL_GPL(iseries_hv_free);
 dma_addr_t iseries_hv_map(void *vaddr, size_t size,
 			enum dma_data_direction direction)
 {
-	return iommu_map_single(&vio_iommu_table, vaddr, size,
+	return iommu_map_single(NULL, &vio_iommu_table, vaddr, size,
 				DMA_32BIT_MASK, direction);
 }
 
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index a07a67c..852e15f 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -85,13 +85,13 @@ extern int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 			   int nelems, enum dma_data_direction direction);
 
-extern void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
-				  dma_addr_t *dma_handle, unsigned long mask,
-				  gfp_t flag, int node);
+extern void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
+				  size_t size, dma_addr_t *dma_handle,
+				  unsigned long mask, gfp_t flag, int node);
 extern void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 				void *vaddr, dma_addr_t dma_handle);
-extern dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
-				   size_t size, unsigned long mask,
+extern dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl,
+				   void *vaddr, size_t size, unsigned long mask,
 				   enum dma_data_direction direction);
 extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
 			       size_t size, enum dma_data_direction direction);
-- 
cgit v1.1


From 383af9525bb27f927511874f6306247ec13f1c28 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:09 -0800
Subject: iommu sg: powerpc: remove DMA 4GB boundary protection

Previously, during initialization of the IOMMU tables, the last entry
at each 4GB boundary is marked as used since there are many adapters
which cannot handle DMAing across any 4GB boundary.

The IOMMU doesn't allocate a memory area spanning LLD's segment
boundary anymore. The segment boundary of devices are set to 4GB by
default. So we can remove 4GB boundary protection now.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/iommu.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c42219c..8f1f4e5 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -453,9 +453,6 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 {
 	unsigned long sz;
-	unsigned long start_index, end_index;
-	unsigned long entries_per_4g;
-	unsigned long index;
 	static int welcomed = 0;
 	struct page *page;
 
@@ -477,6 +474,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 
 #ifdef CONFIG_CRASH_DUMP
 	if (ppc_md.tce_get) {
+		unsigned long index;
 		unsigned long tceval;
 		unsigned long tcecount = 0;
 
@@ -507,23 +505,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
 #endif
 
-	/*
-	 * DMA cannot cross 4 GB boundary.  Mark last entry of each 4
-	 * GB chunk as reserved.
-	 */
-	if (protect4gb) {
-		entries_per_4g = 0x100000000l >> IOMMU_PAGE_SHIFT;
-
-		/* Mark the last bit before a 4GB boundary as used */
-		start_index = tbl->it_offset | (entries_per_4g - 1);
-		start_index -= tbl->it_offset;
-
-		end_index = tbl->it_size;
-
-		for (index = start_index; index < end_index - 1; index += entries_per_4g)
-			__set_bit(index, tbl->it_map);
-	}
-
 	if (!welcomed) {
 		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
 		       novmerge ? "disabled" : "enabled");
-- 
cgit v1.1


From 1b39b077789955c8389488d53d075518fdcd582e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:10 -0800
Subject: iommu sg: x86: convert calgary IOMMU to use the IOMMU helper

This patch converts calgary IOMMU to use the IOMMU helper
functions. The IOMMU doesn't allocate a memory area spanning LLD's
segment boundary anymore.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Muli Ben-Yehuda <mulix@mulix.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                 |  3 +++
 arch/x86/kernel/pci-calgary_64.c | 34 ++++++++++++++++++++--------------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 59eef1c..c976eb4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -465,6 +465,9 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
 	  If unsure, say Y.
 
+config IOMMU_HELPER
+	def_bool CALGARY_IOMMU
+
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 1fe7f04..1b5464c 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -35,6 +35,7 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
 #include <asm/gart.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
@@ -260,22 +261,28 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
 }
 
-static unsigned long iommu_range_alloc(struct iommu_table *tbl,
-	unsigned int npages)
+static unsigned long iommu_range_alloc(struct device *dev,
+				       struct iommu_table *tbl,
+				       unsigned int npages)
 {
 	unsigned long flags;
 	unsigned long offset;
+	unsigned long boundary_size;
+
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			      PAGE_SIZE) >> PAGE_SHIFT;
 
 	BUG_ON(npages == 0);
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
-				       tbl->it_size, npages);
+	offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
+				  npages, 0, boundary_size, 0);
 	if (offset == ~0UL) {
 		tbl->chip_ops->tce_cache_blast(tbl);
-		offset = find_next_zero_string(tbl->it_map, 0,
-					       tbl->it_size, npages);
+
+		offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
+					  npages, 0, boundary_size, 0);
 		if (offset == ~0UL) {
 			printk(KERN_WARNING "Calgary: IOMMU full.\n");
 			spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -286,7 +293,6 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
 		}
 	}
 
-	set_bit_string(tbl->it_map, offset, npages);
 	tbl->it_hint = offset + npages;
 	BUG_ON(tbl->it_hint > tbl->it_size);
 
@@ -295,13 +301,13 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
 	return offset;
 }
 
-static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
-	unsigned int npages, int direction)
+static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+			      void *vaddr, unsigned int npages, int direction)
 {
 	unsigned long entry;
 	dma_addr_t ret = bad_dma_address;
 
-	entry = iommu_range_alloc(tbl, npages);
+	entry = iommu_range_alloc(dev, tbl, npages);
 
 	if (unlikely(entry == bad_dma_address))
 		goto error;
@@ -354,7 +360,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			       badbit, tbl, dma_addr, entry, npages);
 	}
 
-	__clear_bit_string(tbl->it_map, entry, npages);
+	iommu_area_free(tbl->it_map, entry, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
 }
@@ -438,7 +444,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 		vaddr = (unsigned long) sg_virt(s);
 		npages = num_dma_pages(vaddr, s->length);
 
-		entry = iommu_range_alloc(tbl, npages);
+		entry = iommu_range_alloc(dev, tbl, npages);
 		if (entry == bad_dma_address) {
 			/* makes sure unmap knows to stop */
 			s->dma_length = 0;
@@ -476,7 +482,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
 	npages = num_dma_pages(uaddr, size);
 
 	if (translation_enabled(tbl))
-		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
+		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
 	else
 		dma_handle = virt_to_bus(vaddr);
 
@@ -516,7 +522,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
 
 	if (translation_enabled(tbl)) {
 		/* set up tces to cover the allocated range */
-		mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
+		mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
 		if (mapping == bad_dma_address)
 			goto free;
 
-- 
cgit v1.1


From fde9a1094ddf2892188a8a0eccda527de47cba8e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:11 -0800
Subject: iommu sg: x86: convert gart IOMMU to use the IOMMU helper

This patch converts gart IOMMU to use the IOMMU helper functions. The
IOMMU doesn't allocate a memory area spanning LLD's segment boundary
anymore.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Muli Ben-Yehuda <mulix@mulix.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig              |  2 +-
 arch/x86/kernel/pci-gart_64.c | 41 +++++++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c976eb4..4348211 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -466,7 +466,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 	  If unsure, say Y.
 
 config IOMMU_HELPER
-	def_bool CALGARY_IOMMU
+	def_bool (CALGARY_IOMMU || GART_IOMMU)
 
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 5ee700f..65f6acb 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -25,6 +25,7 @@
 #include <linux/bitops.h>
 #include <linux/kdebug.h>
 #include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
 #include <asm/atomic.h>
 #include <asm/io.h>
 #include <asm/mtrr.h>
@@ -82,17 +83,24 @@ AGPEXTERN __u32 *agp_gatt_table;
 static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
 static int need_flush;		/* global flush state. set for each gart wrap */
 
-static unsigned long alloc_iommu(int size)
+static unsigned long alloc_iommu(struct device *dev, int size)
 {
 	unsigned long offset, flags;
+	unsigned long boundary_size;
+	unsigned long base_index;
+
+	base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
+			   PAGE_SIZE) >> PAGE_SHIFT;
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			      PAGE_SIZE) >> PAGE_SHIFT;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	offset = find_next_zero_string(iommu_gart_bitmap, next_bit,
-					iommu_pages, size);
+	offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
+				  size, base_index, boundary_size, 0);
 	if (offset == -1) {
 		need_flush = 1;
-		offset = find_next_zero_string(iommu_gart_bitmap, 0,
-						iommu_pages, size);
+		offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
+					  size, base_index, boundary_size, 0);
 	}
 	if (offset != -1) {
 		set_bit_string(iommu_gart_bitmap, offset, size);
@@ -114,7 +122,7 @@ static void free_iommu(unsigned long offset, int size)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	__clear_bit_string(iommu_gart_bitmap, offset, size);
+	iommu_area_free(iommu_gart_bitmap, offset, size);
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 }
 
@@ -235,7 +243,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 				size_t size, int dir)
 {
 	unsigned long npages = to_pages(phys_mem, size);
-	unsigned long iommu_page = alloc_iommu(npages);
+	unsigned long iommu_page = alloc_iommu(dev, npages);
 	int i;
 
 	if (iommu_page == -1) {
@@ -355,10 +363,11 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
 }
 
 /* Map multiple scatterlist entries continuous into the first. */
-static int __dma_map_cont(struct scatterlist *start, int nelems,
-			  struct scatterlist *sout, unsigned long pages)
+static int __dma_map_cont(struct device *dev, struct scatterlist *start,
+			  int nelems, struct scatterlist *sout,
+			  unsigned long pages)
 {
-	unsigned long iommu_start = alloc_iommu(pages);
+	unsigned long iommu_start = alloc_iommu(dev, pages);
 	unsigned long iommu_page = iommu_start;
 	struct scatterlist *s;
 	int i;
@@ -394,8 +403,8 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
 }
 
 static inline int
-dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout,
-	     unsigned long pages, int need)
+dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
+	     struct scatterlist *sout, unsigned long pages, int need)
 {
 	if (!need) {
 		BUG_ON(nelems != 1);
@@ -403,7 +412,7 @@ dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout,
 		sout->dma_length = start->length;
 		return 0;
 	}
-	return __dma_map_cont(start, nelems, sout, pages);
+	return __dma_map_cont(dev, start, nelems, sout, pages);
 }
 
 /*
@@ -449,8 +458,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 			if (!iommu_merge || !nextneed || !need || s->offset ||
 			    (s->length + seg_size > max_seg_size) ||
 			    (ps->offset + ps->length) % PAGE_SIZE) {
-				if (dma_map_cont(start_sg, i - start, sgmap,
-						  pages, need) < 0)
+				if (dma_map_cont(dev, start_sg, i - start,
+						 sgmap, pages, need) < 0)
 					goto error;
 				out++;
 				seg_size = 0;
@@ -466,7 +475,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 		pages += to_pages(s->offset, s->length);
 		ps = s;
 	}
-	if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0)
+	if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
 		goto error;
 	out++;
 	flush_gart();
-- 
cgit v1.1


From 67ec11cf968241c9ae907f8817b6ac74d4dd71d7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:12 -0800
Subject: iommu sg: kill __clear_bit_string and find_next_zero_string

This kills unused __clear_bit_string and find_next_zero_string (they
were used by only gart and calgary IOMMUs).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Muli Ben-Yehuda <mulix@mulix.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/Makefile       |  2 +-
 arch/x86/lib/bitstr_64.c    | 28 ----------------------------
 include/asm-x86/bitops_64.h | 16 ----------------
 3 files changed, 1 insertion(+), 45 deletions(-)
 delete mode 100644 arch/x86/lib/bitstr_64.c

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4876182..25df1c1 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -21,7 +21,7 @@ else
 
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
-        lib-y += bitstr_64.o bitops_64.o
+        lib-y += bitops_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 endif
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
deleted file mode 100644
index 7445caf..0000000
--- a/arch/x86/lib/bitstr_64.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <linux/module.h>
-#include <linux/bitops.h>
-
-/* Find string of zero bits in a bitmap */ 
-unsigned long 
-find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
-{ 
-	unsigned long n, end, i; 	
-
- again:
-	n = find_next_zero_bit(bitmap, nbits, start);
-	if (n == -1) 
-		return -1;
-	
-	/* could test bitsliced, but it's hardly worth it */
-	end = n+len;
-	if (end > nbits)
-		return -1; 
-	for (i = n+1; i < end; i++) { 
-		if (test_bit(i, bitmap)) {  
-			start = i+1; 
-			goto again; 
-		} 
-	}
-	return n;
-}
-
-EXPORT_SYMBOL(find_next_zero_string);
diff --git a/include/asm-x86/bitops_64.h b/include/asm-x86/bitops_64.h
index 48adbf5..aaf1519 100644
--- a/include/asm-x86/bitops_64.h
+++ b/include/asm-x86/bitops_64.h
@@ -37,12 +37,6 @@ static inline long __scanbit(unsigned long val, unsigned long max)
   ((off)+(__scanbit(~(((*(unsigned long *)addr)) >> (off)),(size)-(off)))) : \
 	find_next_zero_bit(addr,size,off)))
 
-/* 
- * Find string of zero bits in a bitmap. -1 when not found.
- */ 
-extern unsigned long 
-find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len);
-
 static inline void set_bit_string(unsigned long *bitmap, unsigned long i, 
 				  int len) 
 { 
@@ -53,16 +47,6 @@ static inline void set_bit_string(unsigned long *bitmap, unsigned long i,
 	}
 } 
 
-static inline void __clear_bit_string(unsigned long *bitmap, unsigned long i, 
-				    int len) 
-{ 
-	unsigned long end = i + len; 
-	while (i < end) {
-		__clear_bit(i, bitmap); 
-		i++;
-	}
-} 
-
 /**
  * ffz - find first zero in word.
  * @word: The word to search
-- 
cgit v1.1


From d22a6966b8029913fac37d078ab2403898d94c63 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:13 -0800
Subject: iommu sg merging: add accessors for segment_boundary_mask in
 device_dma_parameters()

This adds new accessors for segment_boundary_mask in device_dma_parameters
structure in the same way I did for max_segment_size.  So we can easily change
where to place struct device_dma_parameters in the future.

dma_get_segment boundary returns 0xffffffff if dma_parms in struct device
isn't set up properly.  0xffffffff is the default value used in the block
layer and the scsi mid layer.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dma-mapping.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index df3a361..3320307 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -75,6 +75,21 @@ static inline unsigned int dma_set_max_seg_size(struct device *dev,
 		return -EIO;
 }
 
+static inline unsigned long dma_get_seg_boundary(struct device *dev)
+{
+	return dev->dma_parms ?
+		dev->dma_parms->segment_boundary_mask : 0xffffffff;
+}
+
+static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
+{
+	if (dev->dma_parms) {
+		dev->dma_parms->segment_boundary_mask = mask;
+		return 0;
+	} else
+		return -EIO;
+}
+
 /* flags for the coherent memory api */
 #define	DMA_MEMORY_MAP			0x01
 #define DMA_MEMORY_IO			0x02
-- 
cgit v1.1


From 59fc67dedb46c29442989e52af39da67aea52512 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:14 -0800
Subject: iommu sg merging: PCI: add dma segment boundary support

This adds PCI's accessor for segment_boundary_mask in device_dma_parameters.

The default segment_boundary is set to 0xffffffff, same to the block layer's
default value (and the scsi mid layer uses the same value).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/pci.c   | 8 ++++++++
 drivers/pci/probe.c | 1 +
 include/linux/pci.h | 7 +++++++
 3 files changed, 16 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index be97090..ae3df46 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1459,6 +1459,14 @@ int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size)
 EXPORT_SYMBOL(pci_set_dma_max_seg_size);
 #endif
 
+#ifndef HAVE_ARCH_PCI_SET_DMA_SEGMENT_BOUNDARY
+int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask)
+{
+	return dma_set_seg_boundary(&dev->dev, mask);
+}
+EXPORT_SYMBOL(pci_set_dma_seg_boundary);
+#endif
+
 /**
  * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count
  * @dev: PCI device to query
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index f47d596..4d23b9f 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -937,6 +937,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	dev->dev.coherent_dma_mask = 0xffffffffull;
 
 	pci_set_dma_max_seg_size(dev, 65536);
+	pci_set_dma_seg_boundary(dev, 0xffffffff);
 
 	/* Fix up broken headers */
 	pci_fixup_device(pci_fixup_header, dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ba3a7f4..7215d3b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -583,6 +583,7 @@ void pci_msi_off(struct pci_dev *dev);
 int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size);
+int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask);
 int pcix_get_max_mmrbc(struct pci_dev *dev);
 int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
@@ -831,6 +832,12 @@ static inline int pci_set_dma_max_seg_size(struct pci_dev *dev,
 	return -EIO;
 }
 
+static inline int pci_set_dma_seg_boundary(struct pci_dev *dev,
+					unsigned long mask)
+{
+	return -EIO;
+}
+
 static inline int pci_assign_resource(struct pci_dev *dev, int i)
 {
 	return -EBUSY;
-- 
cgit v1.1


From 681cc5cd3efbeafca6386114070e0bfb5012e249 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:16 -0800
Subject: iommu sg merging: swiotlb: respect the segment boundary limits

This patch makes swiotlb not allocate a memory area spanning LLD's segment
boundary.

is_span_boundary() judges whether a memory area spans LLD's segment boundary.
If map_single finds such a area, map_single tries to find the next available
memory area.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/swiotlb.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1a8050a..4bb5a11 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -282,6 +282,15 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 	return (addr & ~mask) != 0;
 }
 
+static inline unsigned int is_span_boundary(unsigned int index,
+					    unsigned int nslots,
+					    unsigned long offset_slots,
+					    unsigned long max_slots)
+{
+	unsigned long offset = (offset_slots + index) & (max_slots - 1);
+	return offset + nslots > max_slots;
+}
+
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
@@ -292,6 +301,16 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	char *dma_addr;
 	unsigned int nslots, stride, index, wrap;
 	int i;
+	unsigned long start_dma_addr;
+	unsigned long mask;
+	unsigned long offset_slots;
+	unsigned long max_slots;
+
+	mask = dma_get_seg_boundary(hwdev);
+	start_dma_addr = virt_to_bus(io_tlb_start) & mask;
+
+	offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	max_slots = ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 
 	/*
 	 * For mappings greater than a page, we limit the stride (and
@@ -311,10 +330,17 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	 */
 	spin_lock_irqsave(&io_tlb_lock, flags);
 	{
-		wrap = index = ALIGN(io_tlb_index, stride);
-
+		index = ALIGN(io_tlb_index, stride);
 		if (index >= io_tlb_nslabs)
-			wrap = index = 0;
+			index = 0;
+
+		while (is_span_boundary(index, nslots, offset_slots,
+					max_slots)) {
+			index += stride;
+			if (index >= io_tlb_nslabs)
+				index = 0;
+		}
+		wrap = index;
 
 		do {
 			/*
@@ -341,9 +367,12 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 
 				goto found;
 			}
-			index += stride;
-			if (index >= io_tlb_nslabs)
-				index = 0;
+			do {
+				index += stride;
+				if (index >= io_tlb_nslabs)
+					index = 0;
+			} while (is_span_boundary(index, nslots, offset_slots,
+						  max_slots));
 		} while (index != wrap);
 
 		spin_unlock_irqrestore(&io_tlb_lock, flags);
-- 
cgit v1.1


From 99c84dbdc73d158a1ab955a4a5f74c18074796a3 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:28:17 -0800
Subject: iommu sg merging: call dma_set_seg_boundary in __scsi_alloc_queue()

This is a one-line patch to add the following to __scsi_alloc_queue():

dma_set_seg_boundary(dev, shost->dma_boundary);

This is the simplest approach but the result looks odd,
__scsi_alloc_queue() does:

blk_queue_segment_boundary(q, shost->dma_boundary);
dma_set_seg_boundary(dev, shost->dma_boundary);
blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));

I think that it would be better to set up segment boundary in the same
way as we did for the maximum segment size. That is, removing
shost->dma_boundary and LLDs call pci_set_dma_seg_boundary (or its
friends).

Then __scsi_alloc_queue() can set up both limits in the same way:

blk_queue_segment_boundary(q, dma_get_seg_boundary(dev));
blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));

killing dma_boundary in scsi_host_template needs a large patch for
libata (dma_boundary is used by only libata and sym53c8xx). I'll send
a patch to do that if it is acceptable. James and Jeff?

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/scsi_lib.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 68e424f..f243fc30 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1584,6 +1584,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 	blk_queue_max_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
+	dma_set_seg_boundary(dev, shost->dma_boundary);
 
 	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
 
-- 
cgit v1.1


From a9c5fff542544c8595bb12efeb278a96d99386fc Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:17 -0800
Subject: gpiolib: add drivers/gpio directory

Add an empty drivers/gpio directory for gpiolib infrastructure and GPIO
expanders.  It will be populated by later patches.

This won't be the only place to hold such gpio_chip code.  Many external chips
add a few GPIOs as secondary functionality (such as MFD drivers) and platform
code frequently needs to closely integrate GPIO and IRQ support.

This is placed *early* in the build/link sequence since it's common for other
drivers to depend on GPIOs to do their work, so they must be initialized early
in the device_initcall() sequence.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/Kconfig      |  2 ++
 drivers/Kconfig       |  2 ++
 drivers/Makefile      |  1 +
 drivers/gpio/Kconfig  | 32 ++++++++++++++++++++++++++++++++
 drivers/gpio/Makefile |  4 ++++
 5 files changed, 41 insertions(+)
 create mode 100644 drivers/gpio/Kconfig
 create mode 100644 drivers/gpio/Makefile

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 64d19ef..a322f58 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1122,6 +1122,8 @@ source "drivers/i2c/Kconfig"
 
 source "drivers/spi/Kconfig"
 
+source "drivers/gpio/Kconfig"
+
 source "drivers/w1/Kconfig"
 
 source "drivers/power/Kconfig"
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 3f8a231..d74d9fb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -52,6 +52,8 @@ source "drivers/i2c/Kconfig"
 
 source "drivers/spi/Kconfig"
 
+source "drivers/gpio/Kconfig"
+
 source "drivers/w1/Kconfig"
 
 source "drivers/power/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 0ee9a8a..f1c11db 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -5,6 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 #
 
+obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpio/
 obj-$(CONFIG_PCI)		+= pci/
 obj-$(CONFIG_PARISC)		+= parisc/
 obj-$(CONFIG_RAPIDIO)		+= rapidio/
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
new file mode 100644
index 0000000..560687c
--- /dev/null
+++ b/drivers/gpio/Kconfig
@@ -0,0 +1,32 @@
+#
+# GPIO infrastructure and expanders
+#
+
+config HAVE_GPIO_LIB
+	bool
+	help
+	  Platforms select gpiolib if they use this infrastructure
+	  for all their GPIOs, usually starting with ones integrated
+	  into SOC processors.
+
+menu "GPIO Support"
+	depends on HAVE_GPIO_LIB
+
+config DEBUG_GPIO
+	bool "Debug GPIO calls"
+	depends on DEBUG_KERNEL
+	help
+	  Say Y here to add some extra checks and diagnostics to GPIO calls.
+	  The checks help ensure that GPIOs have been properly initialized
+	  before they are used and that sleeping calls aren not made from
+	  nonsleeping contexts.  They can make bitbanged serial protocols
+	  slower.  The diagnostics help catch the type of setup errors
+	  that are most common when setting up new platforms or boards.
+
+# put expanders in the right section, in alphabetical order
+
+comment "I2C GPIO expanders:"
+
+comment "SPI GPIO expanders:"
+
+endmenu
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
new file mode 100644
index 0000000..369e4fc
--- /dev/null
+++ b/drivers/gpio/Makefile
@@ -0,0 +1,4 @@
+# gpio support: dedicated expander chips, etc
+
+ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
+
-- 
cgit v1.1


From d2876d08d86f22ce1f276fc29f6baec8b53e32c6 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:20 -0800
Subject: gpiolib: add gpio provider infrastructure

Provide new implementation infrastructure that platforms may choose to use
when implementing the GPIO programming interface.  Platforms can update their
GPIO support to use this.  In many cases the incremental cost to access a
non-inlined GPIO should be less than a dozen instructions, with the memory
cost being about a page (total) of extra data and code.  The upside is:

  * Providing two features which were "want to have (but OK to defer)" when
    GPIO interfaces were first discussed in November 2006:

    -	A "struct gpio_chip" to plug in GPIOs that aren't directly supported
	by SOC platforms, but come from FPGAs or other multifunction devices
	using conventional device registers (like UCB-1x00 or SM501 GPIOs,
	and southbridges in PCs with more open specs than usual).

    -	Full support for message-based GPIO expanders, where registers are
	accessed through sleeping I/O calls.  Previous support for these
	"cansleep" calls was just stubs.  (One example: the widely used
	pcf8574 I2C chips, with 8 GPIOs each.)

  * Including a non-stub implementation of the gpio_{request,free}() calls,
    making those calls much more useful.  The diagnostic labels are also
    recorded given DEBUG_FS, so /sys/kernel/debug/gpio can show a snapshot
    of all GPIOs known to this infrastructure.

The driver programming interfaces introduced in 2.6.21 do not change at all;
this infrastructure is entirely below those covers.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Makefile      |   2 +
 drivers/gpio/gpiolib.c     | 567 +++++++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/gpio.h |  98 ++++++++
 3 files changed, 667 insertions(+)
 create mode 100644 drivers/gpio/gpiolib.c

diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 369e4fc..c3da039 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -2,3 +2,5 @@
 
 ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 
+obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpiolib.o
+
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
new file mode 100644
index 0000000..d8db2f8e
--- /dev/null
+++ b/drivers/gpio/gpiolib.c
@@ -0,0 +1,567 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/spinlock.h>
+
+#include <asm/gpio.h>
+
+
+/* Optional implementation infrastructure for GPIO interfaces.
+ *
+ * Platforms may want to use this if they tend to use very many GPIOs
+ * that aren't part of a System-On-Chip core; or across I2C/SPI/etc.
+ *
+ * When kernel footprint or instruction count is an issue, simpler
+ * implementations may be preferred.  The GPIO programming interface
+ * allows for inlining speed-critical get/set operations for common
+ * cases, so that access to SOC-integrated GPIOs can sometimes cost
+ * only an instruction or two per bit.
+ */
+
+
+/* When debugging, extend minimal trust to callers and platform code.
+ * Also emit diagnostic messages that may help initial bringup, when
+ * board setup or driver bugs are most common.
+ *
+ * Otherwise, minimize overhead in what may be bitbanging codepaths.
+ */
+#ifdef	DEBUG
+#define	extra_checks	1
+#else
+#define	extra_checks	0
+#endif
+
+/* gpio_lock prevents conflicts during gpio_desc[] table updates.
+ * While any GPIO is requested, its gpio_chip is not removable;
+ * each GPIO's "requested" flag serves as a lock and refcount.
+ */
+static DEFINE_SPINLOCK(gpio_lock);
+
+struct gpio_desc {
+	struct gpio_chip	*chip;
+	unsigned long		flags;
+/* flag symbols are bit numbers */
+#define FLAG_REQUESTED	0
+#define FLAG_IS_OUT	1
+
+#ifdef CONFIG_DEBUG_FS
+	const char		*label;
+#endif
+};
+static struct gpio_desc gpio_desc[ARCH_NR_GPIOS];
+
+static inline void desc_set_label(struct gpio_desc *d, const char *label)
+{
+#ifdef CONFIG_DEBUG_FS
+	d->label = label;
+#endif
+}
+
+/* Warn when drivers omit gpio_request() calls -- legal but ill-advised
+ * when setting direction, and otherwise illegal.  Until board setup code
+ * and drivers use explicit requests everywhere (which won't happen when
+ * those calls have no teeth) we can't avoid autorequesting.  This nag
+ * message should motivate switching to explicit requests...
+ */
+static void gpio_ensure_requested(struct gpio_desc *desc)
+{
+	if (test_and_set_bit(FLAG_REQUESTED, &desc->flags) == 0) {
+		pr_warning("GPIO-%d autorequested\n", (int)(desc - gpio_desc));
+		desc_set_label(desc, "[auto]");
+	}
+}
+
+/* caller holds gpio_lock *OR* gpio is marked as requested */
+static inline struct gpio_chip *gpio_to_chip(unsigned gpio)
+{
+	return gpio_desc[gpio].chip;
+}
+
+/**
+ * gpiochip_add() - register a gpio_chip
+ * @chip: the chip to register, with chip->base initialized
+ * Context: potentially before irqs or kmalloc will work
+ *
+ * Returns a negative errno if the chip can't be registered, such as
+ * because the chip->base is invalid or already associated with a
+ * different chip.  Otherwise it returns zero as a success code.
+ */
+int gpiochip_add(struct gpio_chip *chip)
+{
+	unsigned long	flags;
+	int		status = 0;
+	unsigned	id;
+
+	/* NOTE chip->base negative is reserved to mean a request for
+	 * dynamic allocation.  We don't currently support that.
+	 */
+
+	if (chip->base < 0 || (chip->base  + chip->ngpio) >= ARCH_NR_GPIOS) {
+		status = -EINVAL;
+		goto fail;
+	}
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	/* these GPIO numbers must not be managed by another gpio_chip */
+	for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+		if (gpio_desc[id].chip != NULL) {
+			status = -EBUSY;
+			break;
+		}
+	}
+	if (status == 0) {
+		for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+			gpio_desc[id].chip = chip;
+			gpio_desc[id].flags = 0;
+		}
+	}
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+fail:
+	/* failures here can mean systems won't boot... */
+	if (status)
+		pr_err("gpiochip_add: gpios %d..%d (%s) not registered\n",
+			chip->base, chip->base + chip->ngpio,
+			chip->label ? : "generic");
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpiochip_add);
+
+/**
+ * gpiochip_remove() - unregister a gpio_chip
+ * @chip: the chip to unregister
+ *
+ * A gpio_chip with any GPIOs still requested may not be removed.
+ */
+int gpiochip_remove(struct gpio_chip *chip)
+{
+	unsigned long	flags;
+	int		status = 0;
+	unsigned	id;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	for (id = chip->base; id < chip->base + chip->ngpio; id++) {
+		if (test_bit(FLAG_REQUESTED, &gpio_desc[id].flags)) {
+			status = -EBUSY;
+			break;
+		}
+	}
+	if (status == 0) {
+		for (id = chip->base; id < chip->base + chip->ngpio; id++)
+			gpio_desc[id].chip = NULL;
+	}
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpiochip_remove);
+
+
+/* These "optional" allocation calls help prevent drivers from stomping
+ * on each other, and help provide better diagnostics in debugfs.
+ * They're called even less than the "set direction" calls.
+ */
+int gpio_request(unsigned gpio, const char *label)
+{
+	struct gpio_desc	*desc;
+	int			status = -EINVAL;
+	unsigned long		flags;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto done;
+	desc = &gpio_desc[gpio];
+	if (desc->chip == NULL)
+		goto done;
+
+	/* NOTE:  gpio_request() can be called in early boot,
+	 * before IRQs are enabled.
+	 */
+
+	if (test_and_set_bit(FLAG_REQUESTED, &desc->flags) == 0) {
+		desc_set_label(desc, label ? : "?");
+		status = 0;
+	} else
+		status = -EBUSY;
+
+done:
+	if (status)
+		pr_debug("gpio_request: gpio-%d (%s) status %d\n",
+			gpio, label ? : "?", status);
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_request);
+
+void gpio_free(unsigned gpio)
+{
+	unsigned long		flags;
+	struct gpio_desc	*desc;
+
+	if (gpio >= ARCH_NR_GPIOS) {
+		WARN_ON(extra_checks);
+		return;
+	}
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	desc = &gpio_desc[gpio];
+	if (desc->chip && test_and_clear_bit(FLAG_REQUESTED, &desc->flags))
+		desc_set_label(desc, NULL);
+	else
+		WARN_ON(extra_checks);
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+}
+EXPORT_SYMBOL_GPL(gpio_free);
+
+
+/**
+ * gpiochip_is_requested - return string iff signal was requested
+ * @chip: controller managing the signal
+ * @offset: of signal within controller's 0..(ngpio - 1) range
+ *
+ * Returns NULL if the GPIO is not currently requested, else a string.
+ * If debugfs support is enabled, the string returned is the label passed
+ * to gpio_request(); otherwise it is a meaningless constant.
+ *
+ * This function is for use by GPIO controller drivers.  The label can
+ * help with diagnostics, and knowing that the signal is used as a GPIO
+ * can help avoid accidentally multiplexing it to another controller.
+ */
+const char *gpiochip_is_requested(struct gpio_chip *chip, unsigned offset)
+{
+	unsigned gpio = chip->base + offset;
+
+	if (gpio >= ARCH_NR_GPIOS || gpio_desc[gpio].chip != chip)
+		return NULL;
+	if (test_bit(FLAG_REQUESTED, &gpio_desc[gpio].flags) == 0)
+		return NULL;
+#ifdef CONFIG_DEBUG_FS
+	return gpio_desc[gpio].label;
+#else
+	return "?";
+#endif
+}
+EXPORT_SYMBOL_GPL(gpiochip_is_requested);
+
+
+/* Drivers MUST set GPIO direction before making get/set calls.  In
+ * some cases this is done in early boot, before IRQs are enabled.
+ *
+ * As a rule these aren't called more than once (except for drivers
+ * using the open-drain emulation idiom) so these are natural places
+ * to accumulate extra debugging checks.  Note that we can't (yet)
+ * rely on gpio_request() having been called beforehand.
+ */
+
+int gpio_direction_input(unsigned gpio)
+{
+	unsigned long		flags;
+	struct gpio_chip	*chip;
+	struct gpio_desc	*desc = &gpio_desc[gpio];
+	int			status = -EINVAL;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto fail;
+	chip = desc->chip;
+	if (!chip || !chip->get || !chip->direction_input)
+		goto fail;
+	gpio -= chip->base;
+	if (gpio >= chip->ngpio)
+		goto fail;
+	gpio_ensure_requested(desc);
+
+	/* now we know the gpio is valid and chip won't vanish */
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+
+	might_sleep_if(extra_checks && chip->can_sleep);
+
+	status = chip->direction_input(chip, gpio);
+	if (status == 0)
+		clear_bit(FLAG_IS_OUT, &desc->flags);
+	return status;
+fail:
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	if (status)
+		pr_debug("%s: gpio-%d status %d\n",
+			__FUNCTION__, gpio, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_direction_input);
+
+int gpio_direction_output(unsigned gpio, int value)
+{
+	unsigned long		flags;
+	struct gpio_chip	*chip;
+	struct gpio_desc	*desc = &gpio_desc[gpio];
+	int			status = -EINVAL;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+
+	if (gpio >= ARCH_NR_GPIOS)
+		goto fail;
+	chip = desc->chip;
+	if (!chip || !chip->set || !chip->direction_output)
+		goto fail;
+	gpio -= chip->base;
+	if (gpio >= chip->ngpio)
+		goto fail;
+	gpio_ensure_requested(desc);
+
+	/* now we know the gpio is valid and chip won't vanish */
+
+	spin_unlock_irqrestore(&gpio_lock, flags);
+
+	might_sleep_if(extra_checks && chip->can_sleep);
+
+	status = chip->direction_output(chip, gpio, value);
+	if (status == 0)
+		set_bit(FLAG_IS_OUT, &desc->flags);
+	return status;
+fail:
+	spin_unlock_irqrestore(&gpio_lock, flags);
+	if (status)
+		pr_debug("%s: gpio-%d status %d\n",
+			__FUNCTION__, gpio, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_direction_output);
+
+
+/* I/O calls are only valid after configuration completed; the relevant
+ * "is this a valid GPIO" error checks should already have been done.
+ *
+ * "Get" operations are often inlinable as reading a pin value register,
+ * and masking the relevant bit in that register.
+ *
+ * When "set" operations are inlinable, they involve writing that mask to
+ * one register to set a low value, or a different register to set it high.
+ * Otherwise locking is needed, so there may be little value to inlining.
+ *
+ *------------------------------------------------------------------------
+ *
+ * IMPORTANT!!!  The hot paths -- get/set value -- assume that callers
+ * have requested the GPIO.  That can include implicit requesting by
+ * a direction setting call.  Marking a gpio as requested locks its chip
+ * in memory, guaranteeing that these table lookups need no more locking
+ * and that gpiochip_remove() will fail.
+ *
+ * REVISIT when debugging, consider adding some instrumentation to ensure
+ * that the GPIO was actually requested.
+ */
+
+/**
+ * __gpio_get_value() - return a gpio's value
+ * @gpio: gpio whose value will be returned
+ * Context: any
+ *
+ * This is used directly or indirectly to implement gpio_get_value().
+ * It returns the zero or nonzero value provided by the associated
+ * gpio_chip.get() method; or zero if no such method is provided.
+ */
+int __gpio_get_value(unsigned gpio)
+{
+	struct gpio_chip	*chip;
+
+	chip = gpio_to_chip(gpio);
+	WARN_ON(extra_checks && chip->can_sleep);
+	return chip->get ? chip->get(chip, gpio - chip->base) : 0;
+}
+EXPORT_SYMBOL_GPL(__gpio_get_value);
+
+/**
+ * __gpio_set_value() - assign a gpio's value
+ * @gpio: gpio whose value will be assigned
+ * @value: value to assign
+ * Context: any
+ *
+ * This is used directly or indirectly to implement gpio_set_value().
+ * It invokes the associated gpio_chip.set() method.
+ */
+void __gpio_set_value(unsigned gpio, int value)
+{
+	struct gpio_chip	*chip;
+
+	chip = gpio_to_chip(gpio);
+	WARN_ON(extra_checks && chip->can_sleep);
+	chip->set(chip, gpio - chip->base, value);
+}
+EXPORT_SYMBOL_GPL(__gpio_set_value);
+
+/**
+ * __gpio_cansleep() - report whether gpio value access will sleep
+ * @gpio: gpio in question
+ * Context: any
+ *
+ * This is used directly or indirectly to implement gpio_cansleep().  It
+ * returns nonzero if access reading or writing the GPIO value can sleep.
+ */
+int __gpio_cansleep(unsigned gpio)
+{
+	struct gpio_chip	*chip;
+
+	/* only call this on GPIOs that are valid! */
+	chip = gpio_to_chip(gpio);
+
+	return chip->can_sleep;
+}
+EXPORT_SYMBOL_GPL(__gpio_cansleep);
+
+
+
+/* There's no value in making it easy to inline GPIO calls that may sleep.
+ * Common examples include ones connected to I2C or SPI chips.
+ */
+
+int gpio_get_value_cansleep(unsigned gpio)
+{
+	struct gpio_chip	*chip;
+
+	might_sleep_if(extra_checks);
+	chip = gpio_to_chip(gpio);
+	return chip->get(chip, gpio - chip->base);
+}
+EXPORT_SYMBOL_GPL(gpio_get_value_cansleep);
+
+void gpio_set_value_cansleep(unsigned gpio, int value)
+{
+	struct gpio_chip	*chip;
+
+	might_sleep_if(extra_checks);
+	chip = gpio_to_chip(gpio);
+	chip->set(chip, gpio - chip->base, value);
+}
+EXPORT_SYMBOL_GPL(gpio_set_value_cansleep);
+
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+
+static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	unsigned		i;
+	unsigned		gpio = chip->base;
+	struct gpio_desc	*gdesc = &gpio_desc[gpio];
+	int			is_out;
+
+	for (i = 0; i < chip->ngpio; i++, gpio++, gdesc++) {
+		if (!test_bit(FLAG_REQUESTED, &gdesc->flags))
+			continue;
+
+		is_out = test_bit(FLAG_IS_OUT, &gdesc->flags);
+		seq_printf(s, " gpio-%-3d (%-12s) %s %s",
+			gpio, gdesc->label,
+			is_out ? "out" : "in ",
+			chip->get
+				? (chip->get(chip, i) ? "hi" : "lo")
+				: "?  ");
+
+		if (!is_out) {
+			int		irq = gpio_to_irq(gpio);
+			struct irq_desc	*desc = irq_desc + irq;
+
+			/* This races with request_irq(), set_irq_type(),
+			 * and set_irq_wake() ... but those are "rare".
+			 *
+			 * More significantly, trigger type flags aren't
+			 * currently maintained by genirq.
+			 */
+			if (irq >= 0 && desc->action) {
+				char *trigger;
+
+				switch (desc->status & IRQ_TYPE_SENSE_MASK) {
+				case IRQ_TYPE_NONE:
+					trigger = "(default)";
+					break;
+				case IRQ_TYPE_EDGE_FALLING:
+					trigger = "edge-falling";
+					break;
+				case IRQ_TYPE_EDGE_RISING:
+					trigger = "edge-rising";
+					break;
+				case IRQ_TYPE_EDGE_BOTH:
+					trigger = "edge-both";
+					break;
+				case IRQ_TYPE_LEVEL_HIGH:
+					trigger = "level-high";
+					break;
+				case IRQ_TYPE_LEVEL_LOW:
+					trigger = "level-low";
+					break;
+				default:
+					trigger = "?trigger?";
+					break;
+				}
+
+				seq_printf(s, " irq-%d %s%s",
+					irq, trigger,
+					(desc->status & IRQ_WAKEUP)
+						? " wakeup" : "");
+			}
+		}
+
+		seq_printf(s, "\n");
+	}
+}
+
+static int gpiolib_show(struct seq_file *s, void *unused)
+{
+	struct gpio_chip	*chip = NULL;
+	unsigned		gpio;
+	int			started = 0;
+
+	/* REVISIT this isn't locked against gpio_chip removal ... */
+
+	for (gpio = 0; gpio < ARCH_NR_GPIOS; gpio++) {
+		if (chip == gpio_desc[gpio].chip)
+			continue;
+		chip = gpio_desc[gpio].chip;
+		if (!chip)
+			continue;
+
+		seq_printf(s, "%sGPIOs %d-%d, %s%s:\n",
+				started ? "\n" : "",
+				chip->base, chip->base + chip->ngpio - 1,
+				chip->label ? : "generic",
+				chip->can_sleep ? ", can sleep" : "");
+		started = 1;
+		if (chip->dbg_show)
+			chip->dbg_show(s, chip);
+		else
+			gpiolib_dbg_show(s, chip);
+	}
+	return 0;
+}
+
+static int gpiolib_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, gpiolib_show, NULL);
+}
+
+static struct file_operations gpiolib_operations = {
+	.open		= gpiolib_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init gpiolib_debugfs_init(void)
+{
+	/* /sys/kernel/debug/gpio */
+	(void) debugfs_create_file("gpio", S_IFREG | S_IRUGO,
+				NULL, NULL, &gpiolib_operations);
+	return 0;
+}
+subsys_initcall(gpiolib_debugfs_init);
+
+#endif	/* DEBUG_FS */
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 2d0aab1..f29a502 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -1,6 +1,102 @@
 #ifndef _ASM_GENERIC_GPIO_H
 #define _ASM_GENERIC_GPIO_H
 
+#ifdef CONFIG_HAVE_GPIO_LIB
+
+/* Platforms may implement their GPIO interface with library code,
+ * at a small performance cost for non-inlined operations and some
+ * extra memory (for code and for per-GPIO table entries).
+ *
+ * While the GPIO programming interface defines valid GPIO numbers
+ * to be in the range 0..MAX_INT, this library restricts them to the
+ * smaller range 0..ARCH_NR_GPIOS.
+ */
+
+#ifndef ARCH_NR_GPIOS
+#define ARCH_NR_GPIOS		256
+#endif
+
+struct seq_file;
+
+/**
+ * struct gpio_chip - abstract a GPIO controller
+ * @label: for diagnostics
+ * @direction_input: configures signal "offset" as input, or returns error
+ * @get: returns value for signal "offset"; for output signals this
+ *	returns either the value actually sensed, or zero
+ * @direction_output: configures signal "offset" as output, or returns error
+ * @set: assigns output value for signal "offset"
+ * @dbg_show: optional routine to show contents in debugfs; default code
+ *	will be used when this is omitted, but custom code can show extra
+ *	state (such as pullup/pulldown configuration).
+ * @base: identifies the first GPIO number handled by this chip; or, if
+ *	negative during registration, requests dynamic ID allocation.
+ * @ngpio: the number of GPIOs handled by this controller; the last GPIO
+ *	handled is (base + ngpio - 1).
+ * @can_sleep: flag must be set iff get()/set() methods sleep, as they
+ *	must while accessing GPIO expander chips over I2C or SPI
+ *
+ * A gpio_chip can help platforms abstract various sources of GPIOs so
+ * they can all be accessed through a common programing interface.
+ * Example sources would be SOC controllers, FPGAs, multifunction
+ * chips, dedicated GPIO expanders, and so on.
+ *
+ * Each chip controls a number of signals, identified in method calls
+ * by "offset" values in the range 0..(@ngpio - 1).  When those signals
+ * are referenced through calls like gpio_get_value(gpio), the offset
+ * is calculated by subtracting @base from the gpio number.
+ */
+struct gpio_chip {
+	char			*label;
+
+	int			(*direction_input)(struct gpio_chip *chip,
+						unsigned offset);
+	int			(*get)(struct gpio_chip *chip,
+						unsigned offset);
+	int			(*direction_output)(struct gpio_chip *chip,
+						unsigned offset, int value);
+	void			(*set)(struct gpio_chip *chip,
+						unsigned offset, int value);
+	void			(*dbg_show)(struct seq_file *s,
+						struct gpio_chip *chip);
+	int			base;
+	u16			ngpio;
+	unsigned		can_sleep:1;
+};
+
+extern const char *gpiochip_is_requested(struct gpio_chip *chip,
+			unsigned offset);
+
+/* add/remove chips */
+extern int gpiochip_add(struct gpio_chip *chip);
+extern int __must_check gpiochip_remove(struct gpio_chip *chip);
+
+
+/* Always use the library code for GPIO management calls,
+ * or when sleeping may be involved.
+ */
+extern int gpio_request(unsigned gpio, const char *label);
+extern void gpio_free(unsigned gpio);
+
+extern int gpio_direction_input(unsigned gpio);
+extern int gpio_direction_output(unsigned gpio, int value);
+
+extern int gpio_get_value_cansleep(unsigned gpio);
+extern void gpio_set_value_cansleep(unsigned gpio, int value);
+
+
+/* A platform's <asm/gpio.h> code may want to inline the I/O calls when
+ * the GPIO is constant and refers to some always-present controller,
+ * giving direct access to chip registers and tight bitbanging loops.
+ */
+extern int __gpio_get_value(unsigned gpio);
+extern void __gpio_set_value(unsigned gpio, int value);
+
+extern int __gpio_cansleep(unsigned gpio);
+
+
+#else
+
 /* platforms that don't directly support access to GPIOs through I2C, SPI,
  * or other blocking infrastructure can use these wrappers.
  */
@@ -22,4 +118,6 @@ static inline void gpio_set_value_cansleep(unsigned gpio, int value)
 	gpio_set_value(gpio, value);
 }
 
+#endif
+
 #endif /* _ASM_GENERIC_GPIO_H */
-- 
cgit v1.1


From 7c2db759ece63fd166cf0849b7b271589fa1b754 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:21 -0800
Subject: gpiolib: update Documentation/gpio.txt

Update Documentation/gpio.txt, primarily to include the new "gpiolib"
infrastructure.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/gpio.txt | 133 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 121 insertions(+), 12 deletions(-)

diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 6bc2ba2..8da724e 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -32,7 +32,7 @@ The exact capabilities of GPIOs vary between systems.  Common options:
   - Input values are likewise readable (1, 0).  Some chips support readback
     of pins configured as "output", which is very useful in such "wire-OR"
     cases (to support bidirectional signaling).  GPIO controllers may have
-    input de-glitch logic, sometimes with software controls.
+    input de-glitch/debounce logic, sometimes with software controls.
 
   - Inputs can often be used as IRQ signals, often edge triggered but
     sometimes level triggered.  Such IRQs may be configurable as system
@@ -60,10 +60,13 @@ used on a board that's wired differently.  Only least-common-denominator
 functionality can be very portable.  Other features are platform-specific,
 and that can be critical for glue logic.
 
-Plus, this doesn't define an implementation framework, just an interface.
+Plus, this doesn't require any implementation framework, just an interface.
 One platform might implement it as simple inline functions accessing chip
 registers; another might implement it by delegating through abstractions
-used for several very different kinds of GPIO controller.
+used for several very different kinds of GPIO controller.  (There is some
+optional code supporting such an implementation strategy, described later
+in this document, but drivers acting as clients to the GPIO interface must
+not care how it's implemented.)
 
 That said, if the convention is supported on their platform, drivers should
 use it when possible.  Platforms should declare GENERIC_GPIO support in
@@ -121,6 +124,11 @@ before tasking is enabled, as part of early board setup.
 For output GPIOs, the value provided becomes the initial output value.
 This helps avoid signal glitching during system startup.
 
+For compatibility with legacy interfaces to GPIOs, setting the direction
+of a GPIO implicitly requests that GPIO (see below) if it has not been
+requested already.  That compatibility may be removed in the future;
+explicitly requesting GPIOs is strongly preferred.
+
 Setting the direction can fail if the GPIO number is invalid, or when
 that particular GPIO can't be used in that mode.  It's generally a bad
 idea to rely on boot firmware to have set the direction correctly, since
@@ -133,6 +141,7 @@ Spinlock-Safe GPIO access
 -------------------------
 Most GPIO controllers can be accessed with memory read/write instructions.
 That doesn't need to sleep, and can safely be done from inside IRQ handlers.
+(That includes hardirq contexts on RT kernels.)
 
 Use these calls to access such GPIOs:
 
@@ -145,7 +154,7 @@ Use these calls to access such GPIOs:
 The values are boolean, zero for low, nonzero for high.  When reading the
 value of an output pin, the value returned should be what's seen on the
 pin ... that won't always match the specified output value, because of
-issues including wire-OR and output latencies.
+issues including open-drain signaling and output latencies.
 
 The get/set calls have no error returns because "invalid GPIO" should have
 been reported earlier from gpio_direction_*().  However, note that not all
@@ -170,7 +179,8 @@ get to the head of a queue to transmit a command and get its response.
 This requires sleeping, which can't be done from inside IRQ handlers.
 
 Platforms that support this type of GPIO distinguish them from other GPIOs
-by returning nonzero from this call:
+by returning nonzero from this call (which requires a valid GPIO number,
+either explicitly or implicitly requested):
 
 	int gpio_cansleep(unsigned gpio);
 
@@ -209,8 +219,11 @@ before tasking is enabled, as part of early board setup.
 These calls serve two basic purposes.  One is marking the signals which
 are actually in use as GPIOs, for better diagnostics; systems may have
 several hundred potential GPIOs, but often only a dozen are used on any
-given board.  Another is to catch conflicts between drivers, reporting
-errors when drivers wrongly think they have exclusive use of that signal.
+given board.  Another is to catch conflicts, identifying errors when
+(a) two or more drivers wrongly think they have exclusive use of that
+signal, or (b) something wrongly believes it's safe to remove drivers
+needed to manage a signal that's in active use.  That is, requesting a
+GPIO can serve as a kind of lock.
 
 These two calls are optional because not not all current Linux platforms
 offer such functionality in their GPIO support; a valid implementation
@@ -223,6 +236,9 @@ Note that requesting a GPIO does NOT cause it to be configured in any
 way; it just marks that GPIO as in use.  Separate code must handle any
 pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
 
+Also note that it's your responsibility to have stopped using a GPIO
+before you free it.
+
 
 GPIOs mapped to IRQs
 --------------------
@@ -238,7 +254,7 @@ map between them using calls like:
 
 Those return either the corresponding number in the other namespace, or
 else a negative errno code if the mapping can't be done.  (For example,
-some GPIOs can't used as IRQs.)  It is an unchecked error to use a GPIO
+some GPIOs can't be used as IRQs.)  It is an unchecked error to use a GPIO
 number that wasn't set up as an input using gpio_direction_input(), or
 to use an IRQ number that didn't originally come from gpio_to_irq().
 
@@ -299,17 +315,110 @@ Related to multiplexing is configuration and enabling of the pullups or
 pulldowns integrated on some platforms.  Not all platforms support them,
 or support them in the same way; and any given board might use external
 pullups (or pulldowns) so that the on-chip ones should not be used.
+(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.)
 
 There are other system-specific mechanisms that are not specified here,
 like the aforementioned options for input de-glitching and wire-OR output.
 Hardware may support reading or writing GPIOs in gangs, but that's usually
 configuration dependent:  for GPIOs sharing the same bank.  (GPIOs are
 commonly grouped in banks of 16 or 32, with a given SOC having several such
-banks.)  Some systems can trigger IRQs from output GPIOs.  Code relying on
-such mechanisms will necessarily be nonportable.
+banks.)  Some systems can trigger IRQs from output GPIOs, or read values
+from pins not managed as GPIOs.  Code relying on such mechanisms will
+necessarily be nonportable.
 
-Dynamic definition of GPIOs is not currently supported; for example, as
+Dynamic definition of GPIOs is not currently standard; for example, as
 a side effect of configuring an add-on board with some GPIO expanders.
 
 These calls are purely for kernel space, but a userspace API could be built
-on top of it.
+on top of them.
+
+
+GPIO implementor's framework (OPTIONAL)
+=======================================
+As noted earlier, there is an optional implementation framework making it
+easier for platforms to support different kinds of GPIO controller using
+the same programming interface.
+
+As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file
+will be found there.  That will list all the controllers registered through
+this framework, and the state of the GPIOs currently in use.
+
+
+Controller Drivers: gpio_chip
+-----------------------------
+In this framework each GPIO controller is packaged as a "struct gpio_chip"
+with information common to each controller of that type:
+
+ - methods to establish GPIO direction
+ - methods used to access GPIO values
+ - flag saying whether calls to its methods may sleep
+ - optional debugfs dump method (showing extra state like pullup config)
+ - label for diagnostics
+
+There is also per-instance data, which may come from device.platform_data:
+the number of its first GPIO, and how many GPIOs it exposes.
+
+The code implementing a gpio_chip should support multiple instances of the
+controller, possibly using the driver model.  That code will configure each
+gpio_chip and issue gpiochip_add().  Removing a GPIO controller should be
+rare; use gpiochip_remove() when it is unavoidable.
+
+Most often a gpio_chip is part of an instance-specific structure with state
+not exposed by the GPIO interfaces, such as addressing, power management,
+and more.  Chips such as codecs will have complex non-GPIO state,
+
+Any debugfs dump method should normally ignore signals which haven't been
+requested as GPIOs.  They can use gpiochip_is_requested(), which returns
+either NULL or the label associated with that GPIO when it was requested.
+
+
+Platform Support
+----------------
+To support this framework, a platform's Kconfig will "select HAVE_GPIO_LIB"
+and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines
+three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep().
+They may also want to provide a custom value for ARCH_NR_GPIOS.
+
+Trivial implementations of those functions can directly use framework
+code, which always dispatches through the gpio_chip:
+
+  #define gpio_get_value	__gpio_get_value
+  #define gpio_set_value	__gpio_set_value
+  #define gpio_cansleep		__gpio_cansleep
+
+Fancier implementations could instead define those as inline functions with
+logic optimizing access to specific SOC-based GPIOs.  For example, if the
+referenced GPIO is the constant "12", getting or setting its value could
+cost as little as two or three instructions, never sleeping.  When such an
+optimization is not possible those calls must delegate to the framework
+code, costing at least a few dozen instructions.  For bitbanged I/O, such
+instruction savings can be significant.
+
+For SOCs, platform-specific code defines and registers gpio_chip instances
+for each bank of on-chip GPIOs.  Those GPIOs should be numbered/labeled to
+match chip vendor documentation, and directly match board schematics.  They
+may well start at zero and go up to a platform-specific limit.  Such GPIOs
+are normally integrated into platform initialization to make them always be
+available, from arch_initcall() or earlier; they can often serve as IRQs.
+
+
+Board Support
+-------------
+For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi
+function devices, FPGAs or CPLDs -- most often board-specific code handles
+registering controller devices and ensures that their drivers know what GPIO
+numbers to use with gpiochip_add().  Their numbers often start right after
+platform-specific GPIOs.
+
+For example, board setup code could create structures identifying the range
+of GPIOs that chip will expose, and passes them to each GPIO expander chip
+using platform_data.  Then the chip driver's probe() routine could pass that
+data to gpiochip_add().
+
+Initialization order can be important.  For example, when a device relies on
+an I2C-based GPIO, its probe() routine should only be called after that GPIO
+becomes available.  That may mean the device should not be registered until
+calls for that GPIO can work.  One way to address such dependencies is for
+such gpio_chip controllers to provide setup() and teardown() callbacks to
+board specific code; those board specific callbacks would register devices
+once all the necessary resources are available.
-- 
cgit v1.1


From 1c44f5f16fee880b294f8068354bfb9dddf1349b Mon Sep 17 00:00:00 2001
From: Philipp Zabel <philipp.zabel@gmail.com>
Date: Mon, 4 Feb 2008 22:28:22 -0800
Subject: gpiolib support for the PXA architecture

This adds gpiolib support for the PXA architecture:
  - move all GPIO API functions from generic.c into gpio.c
  - convert the gpio_get/set_value macros into inline functions

This makes it easier to hook up GPIOs provided by external chips like
ASICs and CPLDs.

Signed-off-by: Philipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Minor ARM fixup from David Brownell folded into this ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/Kconfig                    |   1 +
 arch/arm/mach-pxa/Makefile          |   3 +-
 arch/arm/mach-pxa/generic.c         |  93 -----------------
 arch/arm/mach-pxa/generic.h         |   1 +
 arch/arm/mach-pxa/gpio.c            | 197 ++++++++++++++++++++++++++++++++++++
 arch/arm/mach-pxa/irq.c             |   2 +
 include/asm-arm/arch-pxa/gpio.h     |  48 ++++-----
 include/asm-arm/arch-pxa/pxa-regs.h |  13 +++
 8 files changed, 236 insertions(+), 122 deletions(-)
 create mode 100644 arch/arm/mach-pxa/gpio.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a322f58..e19e774 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -385,6 +385,7 @@ config ARCH_PXA
 	depends on MMU
 	select ARCH_MTD_XIP
 	select GENERIC_GPIO
+	select HAVE_GPIO_LIB
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	select TICK_ONESHOT
diff --git a/arch/arm/mach-pxa/Makefile b/arch/arm/mach-pxa/Makefile
index 8604938..6e0c4f5 100644
--- a/arch/arm/mach-pxa/Makefile
+++ b/arch/arm/mach-pxa/Makefile
@@ -3,7 +3,8 @@
 #
 
 # Common support (must be linked before board specific support)
-obj-y				+= clock.o devices.o generic.o irq.o dma.o time.o
+obj-y				+= clock.o devices.o generic.o irq.o dma.o \
+				   time.o gpio.o
 obj-$(CONFIG_PXA25x)		+= pxa25x.o
 obj-$(CONFIG_PXA27x)		+= pxa27x.o
 obj-$(CONFIG_PXA3xx)		+= pxa3xx.o mfp.o smemc.o
diff --git a/arch/arm/mach-pxa/generic.c b/arch/arm/mach-pxa/generic.c
index 7697059..80721c6 100644
--- a/arch/arm/mach-pxa/generic.c
+++ b/arch/arm/mach-pxa/generic.c
@@ -32,7 +32,6 @@
 #include <asm/mach/map.h>
 
 #include <asm/arch/pxa-regs.h>
-#include <asm/arch/gpio.h>
 
 #include "generic.h"
 
@@ -67,97 +66,6 @@ unsigned int get_memclk_frequency_10khz(void)
 EXPORT_SYMBOL(get_memclk_frequency_10khz);
 
 /*
- * Handy function to set GPIO alternate functions
- */
-int pxa_last_gpio;
-
-int pxa_gpio_mode(int gpio_mode)
-{
-	unsigned long flags;
-	int gpio = gpio_mode & GPIO_MD_MASK_NR;
-	int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8;
-	int gafr;
-
-	if (gpio > pxa_last_gpio)
-		return -EINVAL;
-
-	local_irq_save(flags);
-	if (gpio_mode & GPIO_DFLT_LOW)
-		GPCR(gpio) = GPIO_bit(gpio);
-	else if (gpio_mode & GPIO_DFLT_HIGH)
-		GPSR(gpio) = GPIO_bit(gpio);
-	if (gpio_mode & GPIO_MD_MASK_DIR)
-		GPDR(gpio) |= GPIO_bit(gpio);
-	else
-		GPDR(gpio) &= ~GPIO_bit(gpio);
-	gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2));
-	GAFR(gpio) = gafr |  (fn  << (((gpio) & 0xf)*2));
-	local_irq_restore(flags);
-
-	return 0;
-}
-
-EXPORT_SYMBOL(pxa_gpio_mode);
-
-int gpio_direction_input(unsigned gpio)
-{
-	unsigned long flags;
-	u32 mask;
-
-	if (gpio > pxa_last_gpio)
-		return -EINVAL;
-
-	mask = GPIO_bit(gpio);
-	local_irq_save(flags);
-	GPDR(gpio) &= ~mask;
-	local_irq_restore(flags);
-
-	return 0;
-}
-EXPORT_SYMBOL(gpio_direction_input);
-
-int gpio_direction_output(unsigned gpio, int value)
-{
-	unsigned long flags;
-	u32 mask;
-
-	if (gpio > pxa_last_gpio)
-		return -EINVAL;
-
-	mask = GPIO_bit(gpio);
-	local_irq_save(flags);
-	if (value)
-		GPSR(gpio) = mask;
-	else
-		GPCR(gpio) = mask;
-	GPDR(gpio) |= mask;
-	local_irq_restore(flags);
-
-	return 0;
-}
-EXPORT_SYMBOL(gpio_direction_output);
-
-/*
- * Return GPIO level
- */
-int pxa_gpio_get_value(unsigned gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-EXPORT_SYMBOL(pxa_gpio_get_value);
-
-/*
- * Set output GPIO level
- */
-void pxa_gpio_set_value(unsigned gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-EXPORT_SYMBOL(pxa_gpio_set_value);
-
-/*
  * Routine to safely enable or disable a clock in the CKEN
  */
 void __pxa_set_cken(int clock, int enable)
@@ -172,7 +80,6 @@ void __pxa_set_cken(int clock, int enable)
 
 	local_irq_restore(flags);
 }
-
 EXPORT_SYMBOL(__pxa_set_cken);
 
 /*
diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h
index 1a16ad3..b3d10b0 100644
--- a/arch/arm/mach-pxa/generic.h
+++ b/arch/arm/mach-pxa/generic.h
@@ -16,6 +16,7 @@ extern void __init pxa_init_irq_low(void);
 extern void __init pxa_init_irq_high(void);
 extern void __init pxa_init_irq_gpio(int gpio_nr);
 extern void __init pxa_init_irq_set_wake(int (*set_wake)(unsigned int, unsigned int));
+extern void __init pxa_init_gpio(int gpio_nr);
 extern void __init pxa25x_init_irq(void);
 extern void __init pxa27x_init_irq(void);
 extern void __init pxa3xx_init_irq(void);
diff --git a/arch/arm/mach-pxa/gpio.c b/arch/arm/mach-pxa/gpio.c
new file mode 100644
index 0000000..8638dd7
--- /dev/null
+++ b/arch/arm/mach-pxa/gpio.c
@@ -0,0 +1,197 @@
+/*
+ *  linux/arch/arm/mach-pxa/gpio.c
+ *
+ *  Generic PXA GPIO handling
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Jun 15, 2001
+ *  Copyright:	MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/gpio.h>
+#include <asm/hardware.h>
+#include <asm/io.h>
+#include <asm/arch/pxa-regs.h>
+
+#include "generic.h"
+
+
+struct pxa_gpio_chip {
+	struct gpio_chip chip;
+	void __iomem     *regbase;
+};
+
+int pxa_last_gpio;
+
+/*
+ * Configure pins for GPIO or other functions
+ */
+int pxa_gpio_mode(int gpio_mode)
+{
+	unsigned long flags;
+	int gpio = gpio_mode & GPIO_MD_MASK_NR;
+	int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8;
+	int gafr;
+
+	if (gpio > pxa_last_gpio)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	if (gpio_mode & GPIO_DFLT_LOW)
+		GPCR(gpio) = GPIO_bit(gpio);
+	else if (gpio_mode & GPIO_DFLT_HIGH)
+		GPSR(gpio) = GPIO_bit(gpio);
+	if (gpio_mode & GPIO_MD_MASK_DIR)
+		GPDR(gpio) |= GPIO_bit(gpio);
+	else
+		GPDR(gpio) &= ~GPIO_bit(gpio);
+	gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2));
+	GAFR(gpio) = gafr |  (fn  << (((gpio) & 0xf)*2));
+	local_irq_restore(flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(pxa_gpio_mode);
+
+static int pxa_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	unsigned long        flags;
+	u32                  mask = 1 << offset;
+	u32                  value;
+	struct pxa_gpio_chip *pxa;
+	void __iomem         *gpdr;
+
+	pxa = container_of(chip, struct pxa_gpio_chip, chip);
+	gpdr = pxa->regbase + GPDR_OFFSET;
+	local_irq_save(flags);
+	value = __raw_readl(gpdr);
+	value &= ~mask;
+	__raw_writel(value, gpdr);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+static int pxa_gpio_direction_output(struct gpio_chip *chip,
+					unsigned offset, int value)
+{
+	unsigned long        flags;
+	u32                  mask = 1 << offset;
+	u32                  tmp;
+	struct pxa_gpio_chip *pxa;
+	void __iomem         *gpdr;
+
+	pxa = container_of(chip, struct pxa_gpio_chip, chip);
+	__raw_writel(mask,
+			pxa->regbase + (value ? GPSR_OFFSET : GPCR_OFFSET));
+	gpdr = pxa->regbase + GPDR_OFFSET;
+	local_irq_save(flags);
+	tmp = __raw_readl(gpdr);
+	tmp |= mask;
+	__raw_writel(tmp, gpdr);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+/*
+ * Return GPIO level
+ */
+static int pxa_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	u32                  mask = 1 << offset;
+	struct pxa_gpio_chip *pxa;
+
+	pxa = container_of(chip, struct pxa_gpio_chip, chip);
+	return __raw_readl(pxa->regbase + GPLR_OFFSET) & mask;
+}
+
+/*
+ * Set output GPIO level
+ */
+static void pxa_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	u32                  mask = 1 << offset;
+	struct pxa_gpio_chip *pxa;
+
+	pxa = container_of(chip, struct pxa_gpio_chip, chip);
+
+	if (value)
+		__raw_writel(mask, pxa->regbase + GPSR_OFFSET);
+	else
+		__raw_writel(mask, pxa->regbase + GPCR_OFFSET);
+}
+
+static struct pxa_gpio_chip pxa_gpio_chip[] = {
+	[0] = {
+		.regbase = GPIO0_BASE,
+		.chip = {
+			.label            = "gpio-0",
+			.direction_input  = pxa_gpio_direction_input,
+			.direction_output = pxa_gpio_direction_output,
+			.get              = pxa_gpio_get,
+			.set              = pxa_gpio_set,
+			.base             = 0,
+			.ngpio            = 32,
+		},
+	},
+	[1] = {
+		.regbase = GPIO1_BASE,
+		.chip = {
+			.label            = "gpio-1",
+			.direction_input  = pxa_gpio_direction_input,
+			.direction_output = pxa_gpio_direction_output,
+			.get              = pxa_gpio_get,
+			.set              = pxa_gpio_set,
+			.base             = 32,
+			.ngpio            = 32,
+		},
+	},
+	[2] = {
+		.regbase = GPIO2_BASE,
+		.chip = {
+			.label            = "gpio-2",
+			.direction_input  = pxa_gpio_direction_input,
+			.direction_output = pxa_gpio_direction_output,
+			.get              = pxa_gpio_get,
+			.set              = pxa_gpio_set,
+			.base             = 64,
+			.ngpio            = 32, /* 21 for PXA25x */
+		},
+	},
+#if defined(CONFIG_PXA27x) || defined(CONFIG_PXA3xx)
+	[3] = {
+		.regbase = GPIO3_BASE,
+		.chip = {
+			.label            = "gpio-3",
+			.direction_input  = pxa_gpio_direction_input,
+			.direction_output = pxa_gpio_direction_output,
+			.get              = pxa_gpio_get,
+			.set              = pxa_gpio_set,
+			.base             = 96,
+			.ngpio            = 32,
+		},
+	},
+#endif
+};
+
+void __init pxa_init_gpio(int gpio_nr)
+{
+	int i;
+
+	/* add a GPIO chip for each register bank.
+	 * the last PXA25x register only contains 21 GPIOs
+	 */
+	for (i = 0; i < gpio_nr; i += 32) {
+		if (i+32 > gpio_nr)
+			pxa_gpio_chip[i/32].chip.ngpio = gpio_nr - i;
+		gpiochip_add(&pxa_gpio_chip[i/32].chip);
+	}
+}
diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c
index 5a1d5ee..36c6a68 100644
--- a/arch/arm/mach-pxa/irq.c
+++ b/arch/arm/mach-pxa/irq.c
@@ -311,6 +311,8 @@ void __init pxa_init_irq_gpio(int gpio_nr)
 	/* Install handler for GPIO>=2 edge detect interrupts */
 	set_irq_chip(IRQ_GPIO_2_x, &pxa_internal_chip_low);
 	set_irq_chained_handler(IRQ_GPIO_2_x, pxa_gpio_demux_handler);
+
+	pxa_init_gpio(gpio_nr);
 }
 
 void __init pxa_init_irq_set_wake(int (*set_wake)(unsigned int, unsigned int))
diff --git a/include/asm-arm/arch-pxa/gpio.h b/include/asm-arm/arch-pxa/gpio.h
index 9dbc2dc..bdbf5f9 100644
--- a/include/asm-arm/arch-pxa/gpio.h
+++ b/include/asm-arm/arch-pxa/gpio.h
@@ -28,43 +28,35 @@
 #include <asm/irq.h>
 #include <asm/hardware.h>
 
-static inline int gpio_request(unsigned gpio, const char *label)
-{
-	return 0;
-}
+#include <asm-generic/gpio.h>
 
-static inline void gpio_free(unsigned gpio)
-{
-	return;
-}
 
-extern int gpio_direction_input(unsigned gpio);
-extern int gpio_direction_output(unsigned gpio, int value);
+/* NOTE: some PXAs have fewer on-chip GPIOs (like PXA255, with 85).
+ * Those cases currently cause holes in the GPIO number space.
+ */
+#define NR_BUILTIN_GPIO 128
 
-static inline int __gpio_get_value(unsigned gpio)
+static inline int gpio_get_value(unsigned gpio)
 {
-	return GPLR(gpio) & GPIO_bit(gpio);
+	if (__builtin_constant_p(gpio) && (gpio < NR_BUILTIN_GPIO))
+		return GPLR(gpio) & GPIO_bit(gpio);
+	else
+		return __gpio_get_value(gpio);
 }
 
-#define gpio_get_value(gpio)			\
-	(__builtin_constant_p(gpio) ?		\
-	 __gpio_get_value(gpio) :		\
-	 pxa_gpio_get_value(gpio))
-
-static inline void __gpio_set_value(unsigned gpio, int value)
+static inline void gpio_set_value(unsigned gpio, int value)
 {
-	if (value)
-		GPSR(gpio) = GPIO_bit(gpio);
-	else
-		GPCR(gpio) = GPIO_bit(gpio);
+	if (__builtin_constant_p(gpio) && (gpio < NR_BUILTIN_GPIO)) {
+		if (value)
+			GPSR(gpio) = GPIO_bit(gpio);
+		else
+			GPCR(gpio) = GPIO_bit(gpio);
+	} else {
+		__gpio_set_value(gpio, value);
+	}
 }
 
-#define gpio_set_value(gpio,value)		\
-	(__builtin_constant_p(gpio) ?		\
-	 __gpio_set_value(gpio, value) :	\
-	 pxa_gpio_set_value(gpio, value))
-
-#include <asm-generic/gpio.h>			/* cansleep wrappers */
+#define gpio_cansleep __gpio_cansleep
 
 #define gpio_to_irq(gpio)	IRQ_GPIO(gpio)
 #define irq_to_gpio(irq)	IRQ_TO_GPIO(irq)
diff --git a/include/asm-arm/arch-pxa/pxa-regs.h b/include/asm-arm/arch-pxa/pxa-regs.h
index 16ed24d..ac175b4 100644
--- a/include/asm-arm/arch-pxa/pxa-regs.h
+++ b/include/asm-arm/arch-pxa/pxa-regs.h
@@ -1131,6 +1131,19 @@
  * General Purpose I/O
  */
 
+#define GPIO0_BASE	((void __iomem *)io_p2v(0x40E00000))
+#define GPIO1_BASE	((void __iomem *)io_p2v(0x40E00004))
+#define GPIO2_BASE	((void __iomem *)io_p2v(0x40E00008))
+#define GPIO3_BASE	((void __iomem *)io_p2v(0x40E00100))
+
+#define GPLR_OFFSET	0x00
+#define GPDR_OFFSET	0x0C
+#define GPSR_OFFSET	0x18
+#define GPCR_OFFSET	0x24
+#define GRER_OFFSET	0x30
+#define GFER_OFFSET	0x3C
+#define GEDR_OFFSET	0x48
+
 #define GPLR0		__REG(0x40E00000)  /* GPIO Pin-Level Register GPIO<31:0> */
 #define GPLR1		__REG(0x40E00004)  /* GPIO Pin-Level Register GPIO<63:32> */
 #define GPLR2		__REG(0x40E00008)  /* GPIO Pin-Level Register GPIO<80:64> */
-- 
cgit v1.1


From 15fae37d9f5f21571a9618d8353164b6ddfea6f6 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:24 -0800
Subject: gpiolib: pcf857x i2c gpio expander support

This is a new-style I2C driver for most common 8 and 16 bit I2C based
"quasi-bidirectional" GPIO expanders: pcf8574 or pcf8575, and several
compatible models (mostly faster, supporting I2C at up to 1 MHz).

The driver exposes the GPIO signals using the platform-neutral GPIO
programming interface, so they are easily accessed by other kernel code.  The
lack of such a flexible kernel API has been a big factor in the proliferation
of board-specific drivers for these chips...  stuff that rarely makes it
upstream since it's so ugly.  This driver will let such boards use standard
calls.

Since it's a new-style driver, these devices must be configured as part of
board-specific init.  That eliminates the need for error-prone manual
configuration of module parameters, and makes compatibility with legacy
drivers (pcf8574.c, pc8575.c) for these chips easier (there's a clear
either/or disjunction).

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig        |  23 +++
 drivers/gpio/Makefile       |   1 +
 drivers/gpio/pcf857x.c      | 330 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/pcf857x.h |  45 ++++++
 4 files changed, 399 insertions(+)
 create mode 100644 drivers/gpio/pcf857x.c
 create mode 100644 include/linux/i2c/pcf857x.h

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 560687c..36d5d6a 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -27,6 +27,29 @@ config DEBUG_GPIO
 
 comment "I2C GPIO expanders:"
 
+config GPIO_PCF857X
+	tristate "PCF857x, PCA857x, and PCA967x I2C GPIO expanders"
+	depends on I2C
+	help
+	  Say yes here to provide access to most "quasi-bidirectional" I2C
+	  GPIO expanders used for additional digital outputs or inputs.
+	  Most of these parts are from NXP, though TI is a second source for
+	  some of them.  Compatible models include:
+
+	  8 bits:   pcf8574, pcf8574a, pca8574, pca8574a,
+	            pca9670, pca9672, pca9674, pca9674a
+
+	  16 bits:  pcf8575, pcf8575c, pca8575,
+	            pca9671, pca9673, pca9675
+
+	  Your board setup code will need to declare the expanders in
+	  use, and assign numbers to the GPIOs they expose.  Those GPIOs
+	  can then be used from drivers and other kernel code, just like
+	  other GPIOs, but only accessible from task contexts.
+
+	  This driver provides an in-kernel interface to those GPIOs using
+	  platform-neutral GPIO calls.
+
 comment "SPI GPIO expanders:"
 
 endmenu
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index c3da039..b469ab2 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -4,3 +4,4 @@ ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 
 obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpiolib.o
 
+obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
diff --git a/drivers/gpio/pcf857x.c b/drivers/gpio/pcf857x.c
new file mode 100644
index 0000000..c6b3b53
--- /dev/null
+++ b/drivers/gpio/pcf857x.c
@@ -0,0 +1,330 @@
+/*
+ * pcf857x - driver for pcf857x, pca857x, and pca967x I2C GPIO expanders
+ *
+ * Copyright (C) 2007 David Brownell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pcf857x.h>
+
+#include <asm/gpio.h>
+
+
+/*
+ * The pcf857x, pca857x, and pca967x chips only expose one read and one
+ * write register.  Writing a "one" bit (to match the reset state) lets
+ * that pin be used as an input; it's not an open-drain model, but acts
+ * a bit like one.  This is described as "quasi-bidirectional"; read the
+ * chip documentation for details.
+ *
+ * Many other I2C GPIO expander chips (like the pca953x models) have
+ * more complex register models and more conventional circuitry using
+ * push/pull drivers.  They often use the same 0x20..0x27 addresses as
+ * pcf857x parts, making the "legacy" I2C driver model problematic.
+ */
+struct pcf857x {
+	struct gpio_chip	chip;
+	struct i2c_client	*client;
+	unsigned		out;		/* software latch */
+};
+
+/*-------------------------------------------------------------------------*/
+
+/* Talk to 8-bit I/O expander */
+
+static int pcf857x_input8(struct gpio_chip *chip, unsigned offset)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+
+	gpio->out |= (1 << offset);
+	return i2c_smbus_write_byte(gpio->client, gpio->out);
+}
+
+static int pcf857x_get8(struct gpio_chip *chip, unsigned offset)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+	s32		value;
+
+	value = i2c_smbus_read_byte(gpio->client);
+	return (value < 0) ? 0 : (value & (1 << offset));
+}
+
+static int pcf857x_output8(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+	unsigned	bit = 1 << offset;
+
+	if (value)
+		gpio->out |= bit;
+	else
+		gpio->out &= ~bit;
+	return i2c_smbus_write_byte(gpio->client, gpio->out);
+}
+
+static void pcf857x_set8(struct gpio_chip *chip, unsigned offset, int value)
+{
+	pcf857x_output8(chip, offset, value);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* Talk to 16-bit I/O expander */
+
+static int i2c_write_le16(struct i2c_client *client, u16 word)
+{
+	u8 buf[2] = { word & 0xff, word >> 8, };
+	int status;
+
+	status = i2c_master_send(client, buf, 2);
+	return (status < 0) ? status : 0;
+}
+
+static int i2c_read_le16(struct i2c_client *client)
+{
+	u8 buf[2];
+	int status;
+
+	status = i2c_master_recv(client, buf, 2);
+	if (status < 0)
+		return status;
+	return (buf[1] << 8) | buf[0];
+}
+
+static int pcf857x_input16(struct gpio_chip *chip, unsigned offset)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+
+	gpio->out |= (1 << offset);
+	return i2c_write_le16(gpio->client, gpio->out);
+}
+
+static int pcf857x_get16(struct gpio_chip *chip, unsigned offset)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+	int		value;
+
+	value = i2c_read_le16(gpio->client);
+	return (value < 0) ? 0 : (value & (1 << offset));
+}
+
+static int pcf857x_output16(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct pcf857x	*gpio = container_of(chip, struct pcf857x, chip);
+	unsigned	bit = 1 << offset;
+
+	if (value)
+		gpio->out |= bit;
+	else
+		gpio->out &= ~bit;
+	return i2c_write_le16(gpio->client, gpio->out);
+}
+
+static void pcf857x_set16(struct gpio_chip *chip, unsigned offset, int value)
+{
+	pcf857x_output16(chip, offset, value);
+}
+
+/*-------------------------------------------------------------------------*/
+
+static int pcf857x_probe(struct i2c_client *client)
+{
+	struct pcf857x_platform_data	*pdata;
+	struct pcf857x			*gpio;
+	int				status;
+
+	pdata = client->dev.platform_data;
+	if (!pdata)
+		return -ENODEV;
+
+	/* Allocate, initialize, and register this gpio_chip. */
+	gpio = kzalloc(sizeof *gpio, GFP_KERNEL);
+	if (!gpio)
+		return -ENOMEM;
+
+	gpio->chip.base = pdata->gpio_base;
+	gpio->chip.can_sleep = 1;
+
+	/* NOTE:  the OnSemi jlc1562b is also largely compatible with
+	 * these parts, notably for output.  It has a low-resolution
+	 * DAC instead of pin change IRQs; and its inputs can be the
+	 * result of comparators.
+	 */
+
+	/* 8574 addresses are 0x20..0x27; 8574a uses 0x38..0x3f;
+	 * 9670, 9672, 9764, and 9764a use quite a variety.
+	 *
+	 * NOTE: we don't distinguish here between *4 and *4a parts.
+	 */
+	if (strcmp(client->name, "pcf8574") == 0
+			|| strcmp(client->name, "pca8574") == 0
+			|| strcmp(client->name, "pca9670") == 0
+			|| strcmp(client->name, "pca9672") == 0
+			|| strcmp(client->name, "pca9674") == 0
+			) {
+		gpio->chip.ngpio = 8;
+		gpio->chip.direction_input = pcf857x_input8;
+		gpio->chip.get = pcf857x_get8;
+		gpio->chip.direction_output = pcf857x_output8;
+		gpio->chip.set = pcf857x_set8;
+
+		if (!i2c_check_functionality(client->adapter,
+				I2C_FUNC_SMBUS_BYTE))
+			status = -EIO;
+
+		/* fail if there's no chip present */
+		else
+			status = i2c_smbus_read_byte(client);
+
+	/* '75/'75c addresses are 0x20..0x27, just like the '74;
+	 * the '75c doesn't have a current source pulling high.
+	 * 9671, 9673, and 9765 use quite a variety of addresses.
+	 *
+	 * NOTE: we don't distinguish here between '75 and '75c parts.
+	 */
+	} else if (strcmp(client->name, "pcf8575") == 0
+			|| strcmp(client->name, "pca8575") == 0
+			|| strcmp(client->name, "pca9671") == 0
+			|| strcmp(client->name, "pca9673") == 0
+			|| strcmp(client->name, "pca9675") == 0
+			) {
+		gpio->chip.ngpio = 16;
+		gpio->chip.direction_input = pcf857x_input16;
+		gpio->chip.get = pcf857x_get16;
+		gpio->chip.direction_output = pcf857x_output16;
+		gpio->chip.set = pcf857x_set16;
+
+		if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
+			status = -EIO;
+
+		/* fail if there's no chip present */
+		else
+			status = i2c_read_le16(client);
+
+	} else
+		status = -ENODEV;
+
+	if (status < 0)
+		goto fail;
+
+	gpio->chip.label = client->name;
+
+	gpio->client = client;
+	i2c_set_clientdata(client, gpio);
+
+	/* NOTE:  these chips have strange "quasi-bidirectional" I/O pins.
+	 * We can't actually know whether a pin is configured (a) as output
+	 * and driving the signal low, or (b) as input and reporting a low
+	 * value ... without knowing the last value written since the chip
+	 * came out of reset (if any).  We can't read the latched output.
+	 *
+	 * In short, the only reliable solution for setting up pin direction
+	 * is to do it explicitly.  The setup() method can do that, but it
+	 * may cause transient glitching since it can't know the last value
+	 * written (some pins may need to be driven low).
+	 *
+	 * Using pdata->n_latch avoids that trouble.  When left initialized
+	 * to zero, our software copy of the "latch" then matches the chip's
+	 * all-ones reset state.  Otherwise it flags pins to be driven low.
+	 */
+	gpio->out = ~pdata->n_latch;
+
+	status = gpiochip_add(&gpio->chip);
+	if (status < 0)
+		goto fail;
+
+	/* NOTE: these chips can issue "some pin-changed" IRQs, which we
+	 * don't yet even try to use.  Among other issues, the relevant
+	 * genirq state isn't available to modular drivers; and most irq
+	 * methods can't be called from sleeping contexts.
+	 */
+
+	dev_info(&client->dev, "gpios %d..%d on a %s%s\n",
+			gpio->chip.base,
+			gpio->chip.base + gpio->chip.ngpio - 1,
+			client->name,
+			client->irq ? " (irq ignored)" : "");
+
+	/* Let platform code set up the GPIOs and their users.
+	 * Now is the first time anyone could use them.
+	 */
+	if (pdata->setup) {
+		status = pdata->setup(client,
+				gpio->chip.base, gpio->chip.ngpio,
+				pdata->context);
+		if (status < 0)
+			dev_warn(&client->dev, "setup --> %d\n", status);
+	}
+
+	return 0;
+
+fail:
+	dev_dbg(&client->dev, "probe error %d for '%s'\n",
+			status, client->name);
+	kfree(gpio);
+	return status;
+}
+
+static int pcf857x_remove(struct i2c_client *client)
+{
+	struct pcf857x_platform_data	*pdata = client->dev.platform_data;
+	struct pcf857x			*gpio = i2c_get_clientdata(client);
+	int				status = 0;
+
+	if (pdata->teardown) {
+		status = pdata->teardown(client,
+				gpio->chip.base, gpio->chip.ngpio,
+				pdata->context);
+		if (status < 0) {
+			dev_err(&client->dev, "%s --> %d\n",
+					"teardown", status);
+			return status;
+		}
+	}
+
+	status = gpiochip_remove(&gpio->chip);
+	if (status == 0)
+		kfree(gpio);
+	else
+		dev_err(&client->dev, "%s --> %d\n", "remove", status);
+	return status;
+}
+
+static struct i2c_driver pcf857x_driver = {
+	.driver = {
+		.name	= "pcf857x",
+		.owner	= THIS_MODULE,
+	},
+	.probe	= pcf857x_probe,
+	.remove	= pcf857x_remove,
+};
+
+static int __init pcf857x_init(void)
+{
+	return i2c_add_driver(&pcf857x_driver);
+}
+module_init(pcf857x_init);
+
+static void __exit pcf857x_exit(void)
+{
+	i2c_del_driver(&pcf857x_driver);
+}
+module_exit(pcf857x_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");
diff --git a/include/linux/i2c/pcf857x.h b/include/linux/i2c/pcf857x.h
new file mode 100644
index 0000000..ba8ea6e
--- /dev/null
+++ b/include/linux/i2c/pcf857x.h
@@ -0,0 +1,45 @@
+#ifndef __LINUX_PCF857X_H
+#define __LINUX_PCF857X_H
+
+/**
+ * struct pcf857x_platform_data - data to set up pcf857x driver
+ * @gpio_base: number of the chip's first GPIO
+ * @n_latch: optional bit-inverse of initial register value; if
+ *	you leave this initialized to zero the driver will act
+ *	like the chip was just reset
+ * @setup: optional callback issued once the GPIOs are valid
+ * @teardown: optional callback issued before the GPIOs are invalidated
+ * @context: optional parameter passed to setup() and teardown()
+ *
+ * In addition to the I2C_BOARD_INFO() state appropriate to each chip,
+ * the i2c_board_info used with the pcf875x driver must provide the
+ * chip "type" ("pcf8574", "pcf8574a", "pcf8575", "pcf8575c") and its
+ * platform_data (pointer to one of these structures) with at least
+ * the gpio_base value initialized.
+ *
+ * The @setup callback may be used with the kind of board-specific glue
+ * which hands the (now-valid) GPIOs to other drivers, or which puts
+ * devices in their initial states using these GPIOs.
+ *
+ * These GPIO chips are only "quasi-bidirectional"; read the chip specs
+ * to understand the behavior.  They don't have separate registers to
+ * record which pins are used for input or output, record which output
+ * values are driven, or provide access to input values.  That must be
+ * inferred by reading the chip's value and knowing the last value written
+ * to it.  If you leave n_latch initialized to zero, that last written
+ * value is presumed to be all ones (as if the chip were just reset).
+ */
+struct pcf857x_platform_data {
+	unsigned	gpio_base;
+	unsigned	n_latch;
+
+	int		(*setup)(struct i2c_client *client,
+					int gpio, unsigned ngpio,
+					void *context);
+	int		(*teardown)(struct i2c_client *client,
+					int gpio, unsigned ngpio,
+					void *context);
+	void		*context;
+};
+
+#endif /* __LINUX_PCF857X_H */
-- 
cgit v1.1


From e58b9e2762a6ef99e20dba47aba21b911658541d Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:25 -0800
Subject: mcp23s08 spi gpio expander support

Basic driver for 8-bit SPI based MCP23S08 GPIO expander, without support for
IRQs or the shared chipselect mechanism.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig         |   7 +
 drivers/gpio/Makefile        |   1 +
 drivers/gpio/mcp23s08.c      | 357 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/mcp23s08.h |  24 +++
 4 files changed, 389 insertions(+)
 create mode 100644 drivers/gpio/mcp23s08.c
 create mode 100644 include/linux/spi/mcp23s08.h

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 36d5d6a..d924a32 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -52,4 +52,11 @@ config GPIO_PCF857X
 
 comment "SPI GPIO expanders:"
 
+config GPIO_MCP23S08
+	tristate "Microchip MCP23S08 I/O expander"
+	depends on SPI_MASTER
+	help
+	  SPI driver for Microchip MCP23S08 I/O expander.  This provides
+	  a GPIO interface supporting inputs and outputs.
+
 endmenu
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index b469ab2..a1fd877 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -4,4 +4,5 @@ ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 
 obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpiolib.o
 
+obj-$(CONFIG_GPIO_MCP23S08)	+= mcp23s08.o
 obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
diff --git a/drivers/gpio/mcp23s08.c b/drivers/gpio/mcp23s08.c
new file mode 100644
index 0000000..bb60e8c
--- /dev/null
+++ b/drivers/gpio/mcp23s08.c
@@ -0,0 +1,357 @@
+/*
+ * mcp23s08.c - SPI gpio expander driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/mcp23s08.h>
+
+#include <asm/gpio.h>
+
+
+/* Registers are all 8 bits wide.
+ *
+ * The mcp23s17 has twice as many bits, and can be configured to work
+ * with either 16 bit registers or with two adjacent 8 bit banks.
+ *
+ * Also, there are I2C versions of both chips.
+ */
+#define MCP_IODIR	0x00		/* init/reset:  all ones */
+#define MCP_IPOL	0x01
+#define MCP_GPINTEN	0x02
+#define MCP_DEFVAL	0x03
+#define MCP_INTCON	0x04
+#define MCP_IOCON	0x05
+#	define IOCON_SEQOP	(1 << 5)
+#	define IOCON_HAEN	(1 << 3)
+#	define IOCON_ODR	(1 << 2)
+#	define IOCON_INTPOL	(1 << 1)
+#define MCP_GPPU	0x06
+#define MCP_INTF	0x07
+#define MCP_INTCAP	0x08
+#define MCP_GPIO	0x09
+#define MCP_OLAT	0x0a
+
+struct mcp23s08 {
+	struct spi_device	*spi;
+	u8			addr;
+
+	/* lock protects the cached values */
+	struct mutex		lock;
+	u8			cache[11];
+
+	struct gpio_chip	chip;
+
+	struct work_struct	work;
+};
+
+static int mcp23s08_read(struct mcp23s08 *mcp, unsigned reg)
+{
+	u8	tx[2], rx[1];
+	int	status;
+
+	tx[0] = mcp->addr | 0x01;
+	tx[1] = reg;
+	status = spi_write_then_read(mcp->spi, tx, sizeof tx, rx, sizeof rx);
+	return (status < 0) ? status : rx[0];
+}
+
+static int mcp23s08_write(struct mcp23s08 *mcp, unsigned reg, u8 val)
+{
+	u8	tx[3];
+
+	tx[0] = mcp->addr;
+	tx[1] = reg;
+	tx[2] = val;
+	return spi_write_then_read(mcp->spi, tx, sizeof tx, NULL, 0);
+}
+
+static int
+mcp23s08_read_regs(struct mcp23s08 *mcp, unsigned reg, u8 *vals, unsigned n)
+{
+	u8	tx[2];
+
+	if ((n + reg) > sizeof mcp->cache)
+		return -EINVAL;
+	tx[0] = mcp->addr | 0x01;
+	tx[1] = reg;
+	return spi_write_then_read(mcp->spi, tx, sizeof tx, vals, n);
+}
+
+/*----------------------------------------------------------------------*/
+
+static int mcp23s08_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	struct mcp23s08	*mcp = container_of(chip, struct mcp23s08, chip);
+	int status;
+
+	mutex_lock(&mcp->lock);
+	mcp->cache[MCP_IODIR] |= (1 << offset);
+	status = mcp23s08_write(mcp, MCP_IODIR, mcp->cache[MCP_IODIR]);
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+static int mcp23s08_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct mcp23s08	*mcp = container_of(chip, struct mcp23s08, chip);
+	int status;
+
+	mutex_lock(&mcp->lock);
+
+	/* REVISIT reading this clears any IRQ ... */
+	status = mcp23s08_read(mcp, MCP_GPIO);
+	if (status < 0)
+		status = 0;
+	else {
+		mcp->cache[MCP_GPIO] = status;
+		status = !!(status & (1 << offset));
+	}
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+static int __mcp23s08_set(struct mcp23s08 *mcp, unsigned mask, int value)
+{
+	u8 olat = mcp->cache[MCP_OLAT];
+
+	if (value)
+		olat |= mask;
+	else
+		olat &= ~mask;
+	mcp->cache[MCP_OLAT] = olat;
+	return mcp23s08_write(mcp, MCP_OLAT, olat);
+}
+
+static void mcp23s08_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct mcp23s08	*mcp = container_of(chip, struct mcp23s08, chip);
+	u8 mask = 1 << offset;
+
+	mutex_lock(&mcp->lock);
+	__mcp23s08_set(mcp, mask, value);
+	mutex_unlock(&mcp->lock);
+}
+
+static int
+mcp23s08_direction_output(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct mcp23s08	*mcp = container_of(chip, struct mcp23s08, chip);
+	u8 mask = 1 << offset;
+	int status;
+
+	mutex_lock(&mcp->lock);
+	status = __mcp23s08_set(mcp, mask, value);
+	if (status == 0) {
+		mcp->cache[MCP_IODIR] &= ~mask;
+		status = mcp23s08_write(mcp, MCP_IODIR, mcp->cache[MCP_IODIR]);
+	}
+	mutex_unlock(&mcp->lock);
+	return status;
+}
+
+/*----------------------------------------------------------------------*/
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/seq_file.h>
+
+/*
+ * This shows more info than the generic gpio dump code:
+ * pullups, deglitching, open drain drive.
+ */
+static void mcp23s08_dbg_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	struct mcp23s08	*mcp;
+	char		bank;
+	unsigned	t;
+	unsigned	mask;
+
+	mcp = container_of(chip, struct mcp23s08, chip);
+
+	/* NOTE: we only handle one bank for now ... */
+	bank = '0' + ((mcp->addr >> 1) & 0x3);
+
+	mutex_lock(&mcp->lock);
+	t = mcp23s08_read_regs(mcp, 0, mcp->cache, sizeof mcp->cache);
+	if (t < 0) {
+		seq_printf(s, " I/O ERROR %d\n", t);
+		goto done;
+	}
+
+	for (t = 0, mask = 1; t < 8; t++, mask <<= 1) {
+		const char	*label;
+
+		label = gpiochip_is_requested(chip, t);
+		if (!label)
+			continue;
+
+		seq_printf(s, " gpio-%-3d P%c.%d (%-12s) %s %s %s",
+			chip->base + t, bank, t, label,
+			(mcp->cache[MCP_IODIR] & mask) ? "in " : "out",
+			(mcp->cache[MCP_GPIO] & mask) ? "hi" : "lo",
+			(mcp->cache[MCP_GPPU] & mask) ? "  " : "up");
+		/* NOTE:  ignoring the irq-related registers */
+		seq_printf(s, "\n");
+	}
+done:
+	mutex_unlock(&mcp->lock);
+}
+
+#else
+#define mcp23s08_dbg_show	NULL
+#endif
+
+/*----------------------------------------------------------------------*/
+
+static int mcp23s08_probe(struct spi_device *spi)
+{
+	struct mcp23s08			*mcp;
+	struct mcp23s08_platform_data	*pdata;
+	int				status;
+	int				do_update = 0;
+
+	pdata = spi->dev.platform_data;
+	if (!pdata || pdata->slave > 3 || !pdata->base)
+		return -ENODEV;
+
+	mcp = kzalloc(sizeof *mcp, GFP_KERNEL);
+	if (!mcp)
+		return -ENOMEM;
+
+	mutex_init(&mcp->lock);
+
+	mcp->spi = spi;
+	mcp->addr = 0x40 | (pdata->slave << 1);
+
+	mcp->chip.label = "mcp23s08",
+
+	mcp->chip.direction_input = mcp23s08_direction_input;
+	mcp->chip.get = mcp23s08_get;
+	mcp->chip.direction_output = mcp23s08_direction_output;
+	mcp->chip.set = mcp23s08_set;
+	mcp->chip.dbg_show = mcp23s08_dbg_show;
+
+	mcp->chip.base = pdata->base;
+	mcp->chip.ngpio = 8;
+	mcp->chip.can_sleep = 1;
+
+	spi_set_drvdata(spi, mcp);
+
+	/* verify MCP_IOCON.SEQOP = 0, so sequential reads work */
+	status = mcp23s08_read(mcp, MCP_IOCON);
+	if (status < 0)
+		goto fail;
+	if (status & IOCON_SEQOP) {
+		status &= ~IOCON_SEQOP;
+		status = mcp23s08_write(mcp, MCP_IOCON, (u8) status);
+		if (status < 0)
+			goto fail;
+	}
+
+	/* configure ~100K pullups */
+	status = mcp23s08_write(mcp, MCP_GPPU, pdata->pullups);
+	if (status < 0)
+		goto fail;
+
+	status = mcp23s08_read_regs(mcp, 0, mcp->cache, sizeof mcp->cache);
+	if (status < 0)
+		goto fail;
+
+	/* disable inverter on input */
+	if (mcp->cache[MCP_IPOL] != 0) {
+		mcp->cache[MCP_IPOL] = 0;
+		do_update = 1;
+	}
+
+	/* disable irqs */
+	if (mcp->cache[MCP_GPINTEN] != 0) {
+		mcp->cache[MCP_GPINTEN] = 0;
+		do_update = 1;
+	}
+
+	if (do_update) {
+		u8 tx[4];
+
+		tx[0] = mcp->addr;
+		tx[1] = MCP_IPOL;
+		memcpy(&tx[2], &mcp->cache[MCP_IPOL], sizeof(tx) - 2);
+		status = spi_write_then_read(mcp->spi, tx, sizeof tx, NULL, 0);
+
+		/* FIXME check status... */
+	}
+
+	status = gpiochip_add(&mcp->chip);
+
+	/* NOTE:  these chips have a relatively sane IRQ framework, with
+	 * per-signal masking and level/edge triggering.  It's not yet
+	 * handled here...
+	 */
+
+	if (pdata->setup) {
+		status = pdata->setup(spi, mcp->chip.base,
+				mcp->chip.ngpio, pdata->context);
+		if (status < 0)
+			dev_dbg(&spi->dev, "setup --> %d\n", status);
+	}
+
+	return 0;
+
+fail:
+	kfree(mcp);
+	return status;
+}
+
+static int mcp23s08_remove(struct spi_device *spi)
+{
+	struct mcp23s08			*mcp = spi_get_drvdata(spi);
+	struct mcp23s08_platform_data	*pdata = spi->dev.platform_data;
+	int				status = 0;
+
+	if (pdata->teardown) {
+		status = pdata->teardown(spi,
+				mcp->chip.base, mcp->chip.ngpio,
+				pdata->context);
+		if (status < 0) {
+			dev_err(&spi->dev, "%s --> %d\n", "teardown", status);
+			return status;
+		}
+	}
+
+	status = gpiochip_remove(&mcp->chip);
+	if (status == 0)
+		kfree(mcp);
+	else
+		dev_err(&spi->dev, "%s --> %d\n", "remove", status);
+	return status;
+}
+
+static struct spi_driver mcp23s08_driver = {
+	.probe		= mcp23s08_probe,
+	.remove		= mcp23s08_remove,
+	.driver = {
+		.name	= "mcp23s08",
+		.owner	= THIS_MODULE,
+	},
+};
+
+/*----------------------------------------------------------------------*/
+
+static int __init mcp23s08_init(void)
+{
+	return spi_register_driver(&mcp23s08_driver);
+}
+module_init(mcp23s08_init);
+
+static void __exit mcp23s08_exit(void)
+{
+	spi_unregister_driver(&mcp23s08_driver);
+}
+module_exit(mcp23s08_exit);
+
+MODULE_LICENSE("GPL");
+
diff --git a/include/linux/spi/mcp23s08.h b/include/linux/spi/mcp23s08.h
new file mode 100644
index 0000000..835ddf4
--- /dev/null
+++ b/include/linux/spi/mcp23s08.h
@@ -0,0 +1,24 @@
+
+/* FIXME driver should be able to handle all four slaves that
+ * can be hooked up to each chipselect, as well as IRQs...
+ */
+
+struct mcp23s08_platform_data {
+	/* four slaves can share one SPI chipselect */
+	u8		slave;
+
+	/* number assigned to the first GPIO */
+	unsigned	base;
+
+	/* pins with pullups */
+	u8		pullups;
+
+	void		*context;	/* param to setup/teardown */
+
+	int		(*setup)(struct spi_device *spi,
+					int gpio, unsigned ngpio,
+					void *context);
+	int		(*teardown)(struct spi_device *spi,
+					int gpio, unsigned ngpio,
+					void *context);
+};
-- 
cgit v1.1


From 9e60fdcf0c2905d7a8fc4cb2b3711ea5c5acaae1 Mon Sep 17 00:00:00 2001
From: eric miao <eric.miao@marvell.com>
Date: Mon, 4 Feb 2008 22:28:26 -0800
Subject: gpiolib: pca9539 i2c gpio expander support

This adds a new-style I2C driver with basic support for the sixteen bit
PCA9539 GPIO expanders.  These chips have multiple registers, push-pull output
drivers, and (not supported in this patch) pin change interrupts.

Board-specific code must provide "pca9539_platform_data" with each chip's
"i2c_board_info".  That provides the GPIO numbers to be used by that chip, and
callbacks for board-specific setup/teardown logic.

Derived from drivers/i2c/chips/pca9539.c (which has no current known users).
This is faster and simpler; it uses 16-bit register access, and cache the
OUTPUT and DIRECTION registers for fast access

Signed-off-by: eric miao <eric.miao@marvell.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Jean Delvare <khali@linux-fr.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig        |  10 ++
 drivers/gpio/Makefile       |   1 +
 drivers/gpio/pca9539.c      | 271 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/pca9539.h |  18 +++
 4 files changed, 300 insertions(+)
 create mode 100644 drivers/gpio/pca9539.c
 create mode 100644 include/linux/i2c/pca9539.h

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index d924a32..74fac0f 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -27,6 +27,16 @@ config DEBUG_GPIO
 
 comment "I2C GPIO expanders:"
 
+config GPIO_PCA9539
+	tristate "PCA9539 16-bit I/O port"
+	depends on I2C
+	help
+	  Say yes here to support the PCA9539 16-bit I/O port. These
+	  parts are made by NXP and TI.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called pca9539.
+
 config GPIO_PCF857X
 	tristate "PCF857x, PCA857x, and PCA967x I2C GPIO expanders"
 	depends on I2C
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index a1fd877..470ecd6 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -5,4 +5,5 @@ ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpiolib.o
 
 obj-$(CONFIG_GPIO_MCP23S08)	+= mcp23s08.o
+obj-$(CONFIG_GPIO_PCA9539)	+= pca9539.o
 obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
diff --git a/drivers/gpio/pca9539.c b/drivers/gpio/pca9539.c
new file mode 100644
index 0000000..3e85c92
--- /dev/null
+++ b/drivers/gpio/pca9539.c
@@ -0,0 +1,271 @@
+/*
+ *  pca9539.c - 16-bit I/O port with interrupt and reset
+ *
+ *  Copyright (C) 2005 Ben Gardner <bgardner@wabtec.com>
+ *  Copyright (C) 2007 Marvell International Ltd.
+ *
+ *  Derived from drivers/i2c/chips/pca9539.c
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pca9539.h>
+
+#include <asm/gpio.h>
+
+
+#define NR_PCA9539_GPIOS	16
+
+#define PCA9539_INPUT		0
+#define PCA9539_OUTPUT		2
+#define PCA9539_INVERT		4
+#define PCA9539_DIRECTION	6
+
+struct pca9539_chip {
+	unsigned gpio_start;
+	uint16_t reg_output;
+	uint16_t reg_direction;
+
+	struct i2c_client *client;
+	struct gpio_chip gpio_chip;
+};
+
+/* NOTE:  we can't currently rely on fault codes to come from SMBus
+ * calls, so we map all errors to EIO here and return zero otherwise.
+ */
+static int pca9539_write_reg(struct pca9539_chip *chip, int reg, uint16_t val)
+{
+	if (i2c_smbus_write_word_data(chip->client, reg, val) < 0)
+		return -EIO;
+	else
+		return 0;
+}
+
+static int pca9539_read_reg(struct pca9539_chip *chip, int reg, uint16_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_read_word_data(chip->client, reg);
+	if (ret < 0) {
+		dev_err(&chip->client->dev, "failed reading register\n");
+		return -EIO;
+	}
+
+	*val = (uint16_t)ret;
+	return 0;
+}
+
+static int pca9539_gpio_direction_input(struct gpio_chip *gc, unsigned off)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	reg_val = chip->reg_direction | (1u << off);
+	ret = pca9539_write_reg(chip, PCA9539_DIRECTION, reg_val);
+	if (ret)
+		return ret;
+
+	chip->reg_direction = reg_val;
+	return 0;
+}
+
+static int pca9539_gpio_direction_output(struct gpio_chip *gc,
+		unsigned off, int val)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	/* set output level */
+	if (val)
+		reg_val = chip->reg_output | (1u << off);
+	else
+		reg_val = chip->reg_output & ~(1u << off);
+
+	ret = pca9539_write_reg(chip, PCA9539_OUTPUT, reg_val);
+	if (ret)
+		return ret;
+
+	chip->reg_output = reg_val;
+
+	/* then direction */
+	reg_val = chip->reg_direction & ~(1u << off);
+	ret = pca9539_write_reg(chip, PCA9539_DIRECTION, reg_val);
+	if (ret)
+		return ret;
+
+	chip->reg_direction = reg_val;
+	return 0;
+}
+
+static int pca9539_gpio_get_value(struct gpio_chip *gc, unsigned off)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	ret = pca9539_read_reg(chip, PCA9539_INPUT, &reg_val);
+	if (ret < 0) {
+		/* NOTE:  diagnostic already emitted; that's all we should
+		 * do unless gpio_*_value_cansleep() calls become different
+		 * from their nonsleeping siblings (and report faults).
+		 */
+		return 0;
+	}
+
+	return (reg_val & (1u << off)) ? 1 : 0;
+}
+
+static void pca9539_gpio_set_value(struct gpio_chip *gc, unsigned off, int val)
+{
+	struct pca9539_chip *chip;
+	uint16_t reg_val;
+	int ret;
+
+	chip = container_of(gc, struct pca9539_chip, gpio_chip);
+
+	if (val)
+		reg_val = chip->reg_output | (1u << off);
+	else
+		reg_val = chip->reg_output & ~(1u << off);
+
+	ret = pca9539_write_reg(chip, PCA9539_OUTPUT, reg_val);
+	if (ret)
+		return;
+
+	chip->reg_output = reg_val;
+}
+
+static int pca9539_init_gpio(struct pca9539_chip *chip)
+{
+	struct gpio_chip *gc;
+
+	gc = &chip->gpio_chip;
+
+	gc->direction_input  = pca9539_gpio_direction_input;
+	gc->direction_output = pca9539_gpio_direction_output;
+	gc->get = pca9539_gpio_get_value;
+	gc->set = pca9539_gpio_set_value;
+
+	gc->base = chip->gpio_start;
+	gc->ngpio = NR_PCA9539_GPIOS;
+	gc->label = "pca9539";
+
+	return gpiochip_add(gc);
+}
+
+static int __devinit pca9539_probe(struct i2c_client *client)
+{
+	struct pca9539_platform_data *pdata;
+	struct pca9539_chip *chip;
+	int ret;
+
+	pdata = client->dev.platform_data;
+	if (pdata == NULL)
+		return -ENODEV;
+
+	chip = kzalloc(sizeof(struct pca9539_chip), GFP_KERNEL);
+	if (chip == NULL)
+		return -ENOMEM;
+
+	chip->client = client;
+
+	chip->gpio_start = pdata->gpio_base;
+
+	/* initialize cached registers from their original values.
+	 * we can't share this chip with another i2c master.
+	 */
+	ret = pca9539_read_reg(chip, PCA9539_OUTPUT, &chip->reg_output);
+	if (ret)
+		goto out_failed;
+
+	ret = pca9539_read_reg(chip, PCA9539_DIRECTION, &chip->reg_direction);
+	if (ret)
+		goto out_failed;
+
+	/* set platform specific polarity inversion */
+	ret = pca9539_write_reg(chip, PCA9539_INVERT, pdata->invert);
+	if (ret)
+		goto out_failed;
+
+	ret = pca9539_init_gpio(chip);
+	if (ret)
+		goto out_failed;
+
+	if (pdata->setup) {
+		ret = pdata->setup(client, chip->gpio_chip.base,
+				chip->gpio_chip.ngpio, pdata->context);
+		if (ret < 0)
+			dev_warn(&client->dev, "setup failed, %d\n", ret);
+	}
+
+	i2c_set_clientdata(client, chip);
+	return 0;
+
+out_failed:
+	kfree(chip);
+	return ret;
+}
+
+static int pca9539_remove(struct i2c_client *client)
+{
+	struct pca9539_platform_data *pdata = client->dev.platform_data;
+	struct pca9539_chip *chip = i2c_get_clientdata(client);
+	int ret = 0;
+
+	if (pdata->teardown) {
+		ret = pdata->teardown(client, chip->gpio_chip.base,
+				chip->gpio_chip.ngpio, pdata->context);
+		if (ret < 0) {
+			dev_err(&client->dev, "%s failed, %d\n",
+					"teardown", ret);
+			return ret;
+		}
+	}
+
+	ret = gpiochip_remove(&chip->gpio_chip);
+	if (ret) {
+		dev_err(&client->dev, "%s failed, %d\n",
+				"gpiochip_remove()", ret);
+		return ret;
+	}
+
+	kfree(chip);
+	return 0;
+}
+
+static struct i2c_driver pca9539_driver = {
+	.driver = {
+		.name	= "pca9539",
+	},
+	.probe		= pca9539_probe,
+	.remove		= pca9539_remove,
+};
+
+static int __init pca9539_init(void)
+{
+	return i2c_add_driver(&pca9539_driver);
+}
+module_init(pca9539_init);
+
+static void __exit pca9539_exit(void)
+{
+	i2c_del_driver(&pca9539_driver);
+}
+module_exit(pca9539_exit);
+
+MODULE_AUTHOR("eric miao <eric.miao@marvell.com>");
+MODULE_DESCRIPTION("GPIO expander driver for PCA9539");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c/pca9539.h b/include/linux/i2c/pca9539.h
new file mode 100644
index 0000000..611d84a
--- /dev/null
+++ b/include/linux/i2c/pca9539.h
@@ -0,0 +1,18 @@
+/* platform data for the PCA9539 16-bit I/O expander driver */
+
+struct pca9539_platform_data {
+	/* number of the first GPIO */
+	unsigned	gpio_base;
+
+	/* initial polarity inversion setting */
+	uint16_t	invert;
+
+	void		*context;	/* param to setup/teardown */
+
+	int		(*setup)(struct i2c_client *client,
+				unsigned gpio, unsigned ngpio,
+				void *context);
+	int		(*teardown)(struct i2c_client *client,
+				unsigned gpio, unsigned ngpio,
+				void *context);
+};
-- 
cgit v1.1


From b72540c30c9c8c2c3f17cae29962cfb50fbe166a Mon Sep 17 00:00:00 2001
From: eric miao <eric.miao@marvell.com>
Date: Mon, 4 Feb 2008 22:28:27 -0800
Subject: deprecate obsolete pca9539 driver

Use drivers/gpio/pca9539.c instead.

Signed-off-by: eric miao <eric.miao@marvell.com>
Acked-by: Ben Gardner <bgardner@wabtec.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/i2c/chips/pca9539 | 3 +++
 drivers/i2c/chips/Kconfig       | 7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Documentation/i2c/chips/pca9539 b/Documentation/i2c/chips/pca9539
index c4fce6a..1d81c53 100644
--- a/Documentation/i2c/chips/pca9539
+++ b/Documentation/i2c/chips/pca9539
@@ -1,6 +1,9 @@
 Kernel driver pca9539
 =====================
 
+NOTE: this driver is deprecated and will be dropped soon, use
+drivers/gpio/pca9539.c instead.
+
 Supported chips:
   * Philips PCA9539
     Prefix: 'pca9539'
diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index bd7082c..b21593f 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -54,8 +54,8 @@ config PCF8575
 	  hardware.  If unsure, say N.
 
 config SENSORS_PCA9539
-	tristate "Philips PCA9539 16-bit I/O port"
-	depends on EXPERIMENTAL
+	tristate "Philips PCA9539 16-bit I/O port (DEPRECATED)"
+	depends on EXPERIMENTAL && GPIO_PCA9539 = "n"
 	help
 	  If you say yes here you get support for the Philips PCA9539
 	  16-bit I/O port.
@@ -63,6 +63,9 @@ config SENSORS_PCA9539
 	  This driver can also be built as a module.  If so, the module
 	  will be called pca9539.
 
+	  This driver is deprecated and will be dropped soon. Use
+	  drivers/gpio/pca9539.c instead.
+
 config SENSORS_PCF8591
 	tristate "Philips PCF8591"
 	depends on EXPERIMENTAL
-- 
cgit v1.1


From b98348bdd08dc4ec11828aa98a78edde15c53cfa Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 4 Feb 2008 22:28:28 -0800
Subject: gpiolib: avr32 at32ap platform support

Teach AVR32 to use the "GPIO Library" when exposing its GPIOs, so that signals
on external chips (like GPIO expanders) can easily be used.

This mostly reorganizes some existing logic, with two minor changes in
behavior:

 - The PSR registers are used instead of the previous "gpio_mask" values,
   matching AT91 behavior and removing some duplication between that role
   and that of "pinmux_mask".

 - NR_IRQs grew to acommodate a bank of external GPIOs.  Eventually this
   number should probably become a board-specific config option.

There's a debugfs dump of status for the built-in GPIOs, showing which pins
have deglitching, pullups, or open drain drive enabled, as well as the ID
string used when requesting each IRQ.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Eric Miao <eric.miao@marvell.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/avr32/Kconfig                         |   1 +
 arch/avr32/mach-at32ap/pio.c               | 172 +++++++++++++++--------------
 arch/avr32/mach-at32ap/pio.h               |   2 +-
 include/asm-avr32/arch-at32ap/at32ap700x.h |   2 -
 include/asm-avr32/arch-at32ap/gpio.h       |  34 ++++--
 include/asm-avr32/arch-at32ap/irq.h        |   4 +-
 6 files changed, 121 insertions(+), 94 deletions(-)

diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index c816f29..28e0caf 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -82,6 +82,7 @@ config PLATFORM_AT32AP
 	select SUBARCH_AVR32B
 	select MMU
 	select PERFORMANCE_COUNTERS
+	select HAVE_GPIO_LIB
 
 #
 # CPU types
diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c
index d61a02d..38a8fa3 100644
--- a/arch/avr32/mach-at32ap/pio.c
+++ b/arch/avr32/mach-at32ap/pio.c
@@ -24,11 +24,11 @@
 #define MAX_NR_PIO_DEVICES		8
 
 struct pio_device {
+	struct gpio_chip chip;
 	void __iomem *regs;
 	const struct platform_device *pdev;
 	struct clk *clk;
 	u32 pinmux_mask;
-	u32 gpio_mask;
 	char name[8];
 };
 
@@ -64,7 +64,8 @@ void __init at32_select_periph(unsigned int pin, unsigned int periph,
 		goto fail;
 	}
 
-	if (unlikely(test_and_set_bit(pin_index, &pio->pinmux_mask))) {
+	if (unlikely(test_and_set_bit(pin_index, &pio->pinmux_mask)
+			 || gpiochip_is_requested(&pio->chip, pin_index))) {
 		printk("%s: pin %u is busy\n", pio->name, pin_index);
 		goto fail;
 	}
@@ -79,9 +80,6 @@ void __init at32_select_periph(unsigned int pin, unsigned int periph,
 	if (!(flags & AT32_GPIOF_PULLUP))
 		pio_writel(pio, PUDR, mask);
 
-	/* gpio_request NOT allowed */
-	set_bit(pin_index, &pio->gpio_mask);
-
 	return;
 
 fail:
@@ -130,9 +128,6 @@ void __init at32_select_gpio(unsigned int pin, unsigned long flags)
 
 	pio_writel(pio, PER, mask);
 
-	/* gpio_request now allowed */
-	clear_bit(pin_index, &pio->gpio_mask);
-
 	return;
 
 fail:
@@ -166,96 +161,50 @@ fail:
 
 /* GPIO API */
 
-int gpio_request(unsigned int gpio, const char *label)
+static int direction_input(struct gpio_chip *chip, unsigned offset)
 {
-	struct pio_device *pio;
-	unsigned int pin;
-
-	pio = gpio_to_pio(gpio);
-	if (!pio)
-		return -ENODEV;
+	struct pio_device *pio = container_of(chip, struct pio_device, chip);
+	u32 mask = 1 << offset;
 
-	pin = gpio & 0x1f;
-	if (test_and_set_bit(pin, &pio->gpio_mask))
-		return -EBUSY;
+	if (!(pio_readl(pio, PSR) & mask))
+		return -EINVAL;
 
+	pio_writel(pio, ODR, mask);
 	return 0;
 }
-EXPORT_SYMBOL(gpio_request);
 
-void gpio_free(unsigned int gpio)
+static int gpio_get(struct gpio_chip *chip, unsigned offset)
 {
-	struct pio_device *pio;
-	unsigned int pin;
+	struct pio_device *pio = container_of(chip, struct pio_device, chip);
 
-	pio = gpio_to_pio(gpio);
-	if (!pio) {
-		printk(KERN_ERR
-		       "gpio: attempted to free invalid pin %u\n", gpio);
-		return;
-	}
-
-	pin = gpio & 0x1f;
-	if (!test_and_clear_bit(pin, &pio->gpio_mask))
-		printk(KERN_ERR "gpio: freeing free or non-gpio pin %s-%u\n",
-		       pio->name, pin);
+	return (pio_readl(pio, PDSR) >> offset) & 1;
 }
-EXPORT_SYMBOL(gpio_free);
 
-int gpio_direction_input(unsigned int gpio)
-{
-	struct pio_device *pio;
-	unsigned int pin;
-
-	pio = gpio_to_pio(gpio);
-	if (!pio)
-		return -ENODEV;
-
-	pin = gpio & 0x1f;
-	pio_writel(pio, ODR, 1 << pin);
-
-	return 0;
-}
-EXPORT_SYMBOL(gpio_direction_input);
+static void gpio_set(struct gpio_chip *chip, unsigned offset, int value);
 
-int gpio_direction_output(unsigned int gpio, int value)
+static int direction_output(struct gpio_chip *chip, unsigned offset, int value)
 {
-	struct pio_device *pio;
-	unsigned int pin;
-
-	pio = gpio_to_pio(gpio);
-	if (!pio)
-		return -ENODEV;
+	struct pio_device *pio = container_of(chip, struct pio_device, chip);
+	u32 mask = 1 << offset;
 
-	gpio_set_value(gpio, value);
-
-	pin = gpio & 0x1f;
-	pio_writel(pio, OER, 1 << pin);
+	if (!(pio_readl(pio, PSR) & mask))
+		return -EINVAL;
 
+	gpio_set(chip, offset, value);
+	pio_writel(pio, OER, mask);
 	return 0;
 }
-EXPORT_SYMBOL(gpio_direction_output);
 
-int gpio_get_value(unsigned int gpio)
+static void gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 {
-	struct pio_device *pio = &pio_dev[gpio >> 5];
+	struct pio_device *pio = container_of(chip, struct pio_device, chip);
+	u32 mask = 1 << offset;
 
-	return (pio_readl(pio, PDSR) >> (gpio & 0x1f)) & 1;
-}
-EXPORT_SYMBOL(gpio_get_value);
-
-void gpio_set_value(unsigned int gpio, int value)
-{
-	struct pio_device *pio = &pio_dev[gpio >> 5];
-	u32 mask;
-
-	mask = 1 << (gpio & 0x1f);
 	if (value)
 		pio_writel(pio, SODR, mask);
 	else
 		pio_writel(pio, CODR, mask);
 }
-EXPORT_SYMBOL(gpio_set_value);
 
 /*--------------------------------------------------------------------------*/
 
@@ -339,6 +288,63 @@ gpio_irq_setup(struct pio_device *pio, int irq, int gpio_irq)
 
 /*--------------------------------------------------------------------------*/
 
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/seq_file.h>
+
+/*
+ * This shows more info than the generic gpio dump code:
+ * pullups, deglitching, open drain drive.
+ */
+static void pio_bank_show(struct seq_file *s, struct gpio_chip *chip)
+{
+	struct pio_device *pio = container_of(chip, struct pio_device, chip);
+	u32			psr, osr, imr, pdsr, pusr, ifsr, mdsr;
+	unsigned		i;
+	u32			mask;
+	char			bank;
+
+	psr = pio_readl(pio, PSR);
+	osr = pio_readl(pio, OSR);
+	imr = pio_readl(pio, IMR);
+	pdsr = pio_readl(pio, PDSR);
+	pusr = pio_readl(pio, PUSR);
+	ifsr = pio_readl(pio, IFSR);
+	mdsr = pio_readl(pio, MDSR);
+
+	bank = 'A' + pio->pdev->id;
+
+	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+		const char *label;
+
+		label = gpiochip_is_requested(chip, i);
+		if (!label)
+			continue;
+
+		seq_printf(s, " gpio-%-3d P%c%-2d (%-12s) %s %s %s",
+			chip->base + i, bank, i,
+			label,
+			(osr & mask) ? "out" : "in ",
+			(mask & pdsr) ? "hi" : "lo",
+			(mask & pusr) ? "  " : "up");
+		if (ifsr & mask)
+			seq_printf(s, " deglitch");
+		if ((osr & mdsr) & mask)
+			seq_printf(s, " open-drain");
+		if (imr & mask)
+			seq_printf(s, " irq-%d edge-both",
+				gpio_to_irq(chip->base + i));
+		seq_printf(s, "\n");
+	}
+}
+
+#else
+#define pio_bank_show	NULL
+#endif
+
+
+/*--------------------------------------------------------------------------*/
+
 static int __init pio_probe(struct platform_device *pdev)
 {
 	struct pio_device *pio = NULL;
@@ -349,6 +355,18 @@ static int __init pio_probe(struct platform_device *pdev)
 	pio = &pio_dev[pdev->id];
 	BUG_ON(!pio->regs);
 
+	pio->chip.label = pio->name;
+	pio->chip.base = pdev->id * 32;
+	pio->chip.ngpio = 32;
+
+	pio->chip.direction_input = direction_input;
+	pio->chip.get = gpio_get;
+	pio->chip.direction_output = direction_output;
+	pio->chip.set = gpio_set;
+	pio->chip.dbg_show = pio_bank_show;
+
+	gpiochip_add(&pio->chip);
+
 	gpio_irq_setup(pio, irq, gpio_irq_base);
 
 	platform_set_drvdata(pdev, pio);
@@ -406,12 +424,6 @@ void __init at32_init_pio(struct platform_device *pdev)
 	pio->pdev = pdev;
 	pio->regs = ioremap(regs->start, regs->end - regs->start + 1);
 
-	/*
-	 * request_gpio() is only valid for pins that have been
-	 * explicitly configured as GPIO and not previously requested
-	 */
-	pio->gpio_mask = ~0UL;
-
 	/* start with irqs disabled and acked */
 	pio_writel(pio, IDR, ~0UL);
 	(void) pio_readl(pio, ISR);
diff --git a/arch/avr32/mach-at32ap/pio.h b/arch/avr32/mach-at32ap/pio.h
index 50fa3ac..7795116 100644
--- a/arch/avr32/mach-at32ap/pio.h
+++ b/arch/avr32/mach-at32ap/pio.h
@@ -19,7 +19,7 @@
 #define PIO_OSR                                0x0018
 #define PIO_IFER                               0x0020
 #define PIO_IFDR                               0x0024
-#define PIO_ISFR                               0x0028
+#define PIO_IFSR                               0x0028
 #define PIO_SODR                               0x0030
 #define PIO_CODR                               0x0034
 #define PIO_ODSR                               0x0038
diff --git a/include/asm-avr32/arch-at32ap/at32ap700x.h b/include/asm-avr32/arch-at32ap/at32ap700x.h
index 99684d6..31e48b0 100644
--- a/include/asm-avr32/arch-at32ap/at32ap700x.h
+++ b/include/asm-avr32/arch-at32ap/at32ap700x.h
@@ -13,8 +13,6 @@
 #define GPIO_PERIPH_A	0
 #define GPIO_PERIPH_B	1
 
-#define NR_GPIO_CONTROLLERS	4
-
 /*
  * Pin numbers identifying specific GPIO pins on the chip. They can
  * also be converted to IRQ numbers by passing them through
diff --git a/include/asm-avr32/arch-at32ap/gpio.h b/include/asm-avr32/arch-at32ap/gpio.h
index af7f953..0180f58 100644
--- a/include/asm-avr32/arch-at32ap/gpio.h
+++ b/include/asm-avr32/arch-at32ap/gpio.h
@@ -5,20 +5,36 @@
 #include <asm/irq.h>
 
 
-/* Arch-neutral GPIO API */
-int __must_check gpio_request(unsigned int gpio, const char *label);
-void gpio_free(unsigned int gpio);
+/* Some GPIO chips can manage IRQs; some can't.  The exact numbers can
+ * be changed if needed, but for the moment they're not configurable.
+ */
+#define ARCH_NR_GPIOS	(NR_GPIO_IRQS + 2 * 32)
 
-int gpio_direction_input(unsigned int gpio);
-int gpio_direction_output(unsigned int gpio, int value);
-int gpio_get_value(unsigned int gpio);
-void gpio_set_value(unsigned int gpio, int value);
 
-#include <asm-generic/gpio.h>		/* cansleep wrappers */
+/* Arch-neutral GPIO API, supporting both "native" and external GPIOs. */
+#include <asm-generic/gpio.h>
+
+static inline int gpio_get_value(unsigned int gpio)
+{
+	return __gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned int gpio, int value)
+{
+	__gpio_set_value(gpio, value);
+}
+
+static inline int gpio_cansleep(unsigned int gpio)
+{
+	return __gpio_cansleep(gpio);
+}
+
 
 static inline int gpio_to_irq(unsigned int gpio)
 {
-	return gpio + GPIO_IRQ_BASE;
+	if (gpio < NR_GPIO_IRQS)
+		return gpio + GPIO_IRQ_BASE;
+	return -EINVAL;
 }
 
 static inline int irq_to_gpio(unsigned int irq)
diff --git a/include/asm-avr32/arch-at32ap/irq.h b/include/asm-avr32/arch-at32ap/irq.h
index 5adffab..608e350 100644
--- a/include/asm-avr32/arch-at32ap/irq.h
+++ b/include/asm-avr32/arch-at32ap/irq.h
@@ -3,11 +3,11 @@
 
 #define EIM_IRQ_BASE	NR_INTERNAL_IRQS
 #define NR_EIM_IRQS	32
-
 #define AT32_EXTINT(n)	(EIM_IRQ_BASE + (n))
 
 #define GPIO_IRQ_BASE	(EIM_IRQ_BASE + NR_EIM_IRQS)
-#define NR_GPIO_IRQS	(5 * 32)
+#define NR_GPIO_CTLR	(5 /*internal*/ + 1 /*external*/)
+#define NR_GPIO_IRQS	(NR_GPIO_CTLR * 32)
 
 #define NR_IRQS		(GPIO_IRQ_BASE + NR_GPIO_IRQS)
 
-- 
cgit v1.1


From eebd2aa355692afaf9906f62118620f1a1c19dbb Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:29 -0800
Subject: Pagecache zeroing: zero_user_segment, zero_user_segments and
 zero_user

Simplify page cache zeroing of segments of pages through 3 functions

zero_user_segments(page, start1, end1, start2, end2)

        Zeros two segments of the page. It takes the position where to
        start and end the zeroing which avoids length calculations and
	makes code clearer.

zero_user_segment(page, start, end)

        Same for a single segment.

zero_user(page, start, length)

        Length variant for the case where we know the length.

We remove the zero_user_page macro. Issues:

1. Its a macro. Inline functions are preferable.

2. The KM_USER0 macro is only defined for HIGHMEM.

   Having to treat this special case everywhere makes the
   code needlessly complex. The parameter for zeroing is always
   KM_USER0 except in one single case that we open code.

Avoiding KM_USER0 makes a lot of code not having to be dealing
with the special casing for HIGHMEM anymore. Dealing with
kmap is only necessary for HIGHMEM configurations. In those
configurations we use KM_USER0 like we do for a series of other
functions defined in highmem.h.

Since KM_USER0 is depends on HIGHMEM the existing zero_user_page
function could not be a macro. zero_user_* functions introduced
here can be be inline because that constant is not used when these
functions are called.

Also extract the flushing of the caches to be outside of the kmap.

[akpm@linux-foundation.org: fix nfs and ntfs build]
[akpm@linux-foundation.org: fix ntfs build some more]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Steven French <sfrench@us.ibm.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: David Chinner <dgc@sgi.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Steven French <sfrench@us.ibm.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                | 44 ++++++++++++++----------------------------
 fs/cifs/inode.c            |  2 +-
 fs/direct-io.c             |  4 ++--
 fs/ecryptfs/mmap.c         |  5 ++---
 fs/ext3/inode.c            |  4 ++--
 fs/ext4/inode.c            |  4 ++--
 fs/gfs2/bmap.c             |  2 +-
 fs/gfs2/ops_address.c      |  2 +-
 fs/libfs.c                 | 11 ++++-------
 fs/mpage.c                 |  7 ++-----
 fs/nfs/read.c              | 10 +++++-----
 fs/nfs/write.c             |  4 +---
 fs/ntfs/aops.c             | 20 ++++++++++---------
 fs/ntfs/compress.c         |  2 +-
 fs/ntfs/file.c             | 32 +++++++++++++++----------------
 fs/ocfs2/alloc.c           |  2 +-
 fs/ocfs2/aops.c            |  6 +++---
 fs/reiserfs/inode.c        |  4 ++--
 fs/xfs/linux-2.6/xfs_lrw.c |  2 +-
 include/linux/highmem.h    | 48 +++++++++++++++++++++++++++++-----------------
 mm/filemap_xip.c           |  2 +-
 mm/truncate.c              |  2 +-
 22 files changed, 103 insertions(+), 116 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 456c9ab..1de9214 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1798,7 +1798,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 					start = max(from, block_start);
 					size = min(to, block_end) - start;
 
-					zero_user_page(page, start, size, KM_USER0);
+					zero_user(page, start, size);
 					set_buffer_uptodate(bh);
 				}
 
@@ -1861,19 +1861,10 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 					mark_buffer_dirty(bh);
 					continue;
 				}
-				if (block_end > to || block_start < from) {
-					void *kaddr;
-
-					kaddr = kmap_atomic(page, KM_USER0);
-					if (block_end > to)
-						memset(kaddr+to, 0,
-							block_end-to);
-					if (block_start < from)
-						memset(kaddr+block_start,
-							0, from-block_start);
-					flush_dcache_page(page);
-					kunmap_atomic(kaddr, KM_USER0);
-				}
+				if (block_end > to || block_start < from)
+					zero_user_segments(page,
+						to, block_end,
+						block_start, from);
 				continue;
 			}
 		}
@@ -2104,8 +2095,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 					SetPageError(page);
 			}
 			if (!buffer_mapped(bh)) {
-				zero_user_page(page, i * blocksize, blocksize,
-						KM_USER0);
+				zero_user(page, i * blocksize, blocksize);
 				if (!err)
 					set_buffer_uptodate(bh);
 				continue;
@@ -2218,7 +2208,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping,
 						&page, &fsdata);
 		if (err)
 			goto out;
-		zero_user_page(page, zerofrom, len, KM_USER0);
+		zero_user(page, zerofrom, len);
 		err = pagecache_write_end(file, mapping, curpos, len, len,
 						page, fsdata);
 		if (err < 0)
@@ -2245,7 +2235,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping,
 						&page, &fsdata);
 		if (err)
 			goto out;
-		zero_user_page(page, zerofrom, len, KM_USER0);
+		zero_user(page, zerofrom, len);
 		err = pagecache_write_end(file, mapping, curpos, len, len,
 						page, fsdata);
 		if (err < 0)
@@ -2422,7 +2412,6 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 	unsigned block_in_page;
 	unsigned block_start, block_end;
 	sector_t block_in_file;
-	char *kaddr;
 	int nr_reads = 0;
 	int ret = 0;
 	int is_mapped_to_disk = 1;
@@ -2493,13 +2482,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 			continue;
 		}
 		if (buffer_new(bh) || !buffer_mapped(bh)) {
-			kaddr = kmap_atomic(page, KM_USER0);
-			if (block_start < from)
-				memset(kaddr+block_start, 0, from-block_start);
-			if (block_end > to)
-				memset(kaddr + to, 0, block_end - to);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_segments(page, block_start, from,
+							to, block_end);
 			continue;
 		}
 		if (buffer_uptodate(bh))
@@ -2636,7 +2620,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
@@ -2709,7 +2693,7 @@ has_buffers:
 		if (page_has_buffers(page))
 			goto has_buffers;
 	}
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	set_page_dirty(page);
 	err = 0;
 
@@ -2785,7 +2769,7 @@ int block_truncate_page(struct address_space *mapping,
 			goto unlock;
 	}
 
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	mark_buffer_dirty(bh);
 	err = 0;
 
@@ -2831,7 +2815,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 	return __block_write_full_page(inode, page, get_block, wbc);
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d9567ba..47f2621 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1386,7 +1386,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 	if (!page)
 		return -ENOMEM;
 
-	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 	unlock_page(page);
 	page_cache_release(page);
 	return rc;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index acf0da1..9e81add 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -878,8 +878,8 @@ do_holes:
 					page_cache_release(page);
 					goto out;
 				}
-				zero_user_page(page, block_in_page << blkbits,
-						1 << blkbits, KM_USER0);
+				zero_user(page, block_in_page << blkbits,
+						1 << blkbits);
 				dio->block_in_file++;
 				block_in_page++;
 				goto next_block;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 32c5711..0535412 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -257,8 +257,7 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
 	end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
 	if (to > end_byte_in_page)
 		end_byte_in_page = to;
-	zero_user_page(page, end_byte_in_page,
-		PAGE_CACHE_SIZE - end_byte_in_page, KM_USER0);
+	zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE);
 out:
 	return 0;
 }
@@ -307,7 +306,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 	 */
 	if ((i_size_read(page->mapping->host) == prev_page_end_size) &&
 	    (from != 0)) {
-		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 	}
 out:
 	return rc;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9b162cd..0775354 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1845,7 +1845,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	 */
 	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
 	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user_page(page, offset, length, KM_USER0);
+		zero_user(page, offset, length);
 		set_page_dirty(page);
 		goto unlock;
 	}
@@ -1898,7 +1898,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 			goto unlock;
 	}
 
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	BUFFER_TRACE(bh, "zeroed end of block");
 
 	err = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bb717cb..05c4145 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1840,7 +1840,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 	 */
 	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
 	     ext4_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user_page(page, offset, length, KM_USER0);
+		zero_user(page, offset, length);
 		set_page_dirty(page);
 		goto unlock;
 	}
@@ -1893,7 +1893,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 			goto unlock;
 	}
 
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 
 	BUFFER_TRACE(bh, "zeroed end of block");
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e4effc4..e9456eb 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -932,7 +932,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 	if (!gfs2_is_writeback(ip))
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 
 unlock:
 	unlock_page(page);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 38dbe99..ac772b6 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -446,7 +446,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	 * so we need to supply one here. It doesn't happen often.
 	 */
 	if (unlikely(page->index)) {
-		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 		return 0;
 	}
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 6e68b70..5523bde 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -341,13 +341,10 @@ int simple_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
 	if (!PageUptodate(page)) {
-		if (to - from != PAGE_CACHE_SIZE) {
-			void *kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr, 0, from);
-			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
+		if (to - from != PAGE_CACHE_SIZE)
+			zero_user_segments(page,
+				0, from,
+				to, PAGE_CACHE_SIZE);
 	}
 	return 0;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index d54f8f8..5df5643 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -276,9 +276,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	}
 
 	if (first_hole != blocks_per_page) {
-		zero_user_page(page, first_hole << blkbits,
-				PAGE_CACHE_SIZE - (first_hole << blkbits),
-				KM_USER0);
+		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
 		if (first_hole == 0) {
 			SetPageUptodate(page);
 			unlock_page(page);
@@ -571,8 +569,7 @@ page_is_mapped:
 
 		if (page->index > end_index || !offset)
 			goto confused;
-		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
-				KM_USER0);
+		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 	}
 
 	/*
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8fd6dfb..3d7d963 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -79,7 +79,7 @@ void nfs_readdata_release(void *data)
 static
 int nfs_return_empty_page(struct page *page)
 {
-	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+	zero_user(page, 0, PAGE_CACHE_SIZE);
 	SetPageUptodate(page);
 	unlock_page(page);
 	return 0;
@@ -103,10 +103,10 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	pglen = PAGE_CACHE_SIZE - base;
 	for (;;) {
 		if (remainder <= pglen) {
-			zero_user_page(*pages, base, remainder, KM_USER0);
+			zero_user(*pages, base, remainder);
 			break;
 		}
-		zero_user_page(*pages, base, pglen, KM_USER0);
+		zero_user(*pages, base, pglen);
 		pages++;
 		remainder -= pglen;
 		pglen = PAGE_CACHE_SIZE;
@@ -130,7 +130,7 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		return PTR_ERR(new);
 	}
 	if (len < PAGE_CACHE_SIZE)
-		zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0);
+		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 
 	nfs_list_add_request(new, &one_request);
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
@@ -532,7 +532,7 @@ readpage_async_filler(void *data, struct page *page)
 		goto out_error;
 
 	if (len < PAGE_CACHE_SIZE)
-		zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0);
+		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 	nfs_pageio_add_request(desc->pgio, new);
 	return 0;
 out_error:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 522efff..b144b19 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -665,9 +665,7 @@ zero_page:
 	 * then we need to zero any uninitalised data. */
 	if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE
 			&& !PageUptodate(req->wb_page))
-		zero_user_page(req->wb_page, req->wb_bytes,
-				PAGE_CACHE_SIZE - req->wb_bytes,
-				KM_USER0);
+		zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE);
 	return req;
 }
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index ad87cb0..00e9ccd 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -87,13 +87,17 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		/* Check for the current buffer head overflowing. */
 		if (unlikely(file_ofs + bh->b_size > init_size)) {
 			int ofs;
+			void *kaddr;
 
 			ofs = 0;
 			if (file_ofs < init_size)
 				ofs = init_size - file_ofs;
 			local_irq_save(flags);
-			zero_user_page(page, bh_offset(bh) + ofs,
-					 bh->b_size - ofs, KM_BIO_SRC_IRQ);
+			kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+			memset(kaddr + bh_offset(bh) + ofs, 0,
+					bh->b_size - ofs);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
 			local_irq_restore(flags);
 		}
 	} else {
@@ -334,7 +338,7 @@ handle_hole:
 		bh->b_blocknr = -1UL;
 		clear_buffer_mapped(bh);
 handle_zblock:
-		zero_user_page(page, i * blocksize, blocksize, KM_USER0);
+		zero_user(page, i * blocksize, blocksize);
 		if (likely(!err))
 			set_buffer_uptodate(bh);
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
@@ -410,7 +414,7 @@ retry_readpage:
 	/* Is the page fully outside i_size? (truncate in progress) */
 	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
 			PAGE_CACHE_SHIFT)) {
-		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 		ntfs_debug("Read outside i_size - truncated?");
 		goto done;
 	}
@@ -459,7 +463,7 @@ retry_readpage:
 	 * ok to ignore the compressed flag here.
 	 */
 	if (unlikely(page->index > 0)) {
-		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 		goto done;
 	}
 	if (!NInoAttr(ni))
@@ -788,8 +792,7 @@ lock_retry_remap:
 		if (err == -ENOENT || lcn == LCN_ENOENT) {
 			bh->b_blocknr = -1;
 			clear_buffer_dirty(bh);
-			zero_user_page(page, bh_offset(bh), blocksize,
-					KM_USER0);
+			zero_user(page, bh_offset(bh), blocksize);
 			set_buffer_uptodate(bh);
 			err = 0;
 			continue;
@@ -1414,8 +1417,7 @@ retry_writepage:
 		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
 			/* The page straddles i_size. */
 			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
-			zero_user_page(page, ofs, PAGE_CACHE_SIZE - ofs,
-					KM_USER0);
+			zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
 		}
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index d1619d0..33ff314 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -565,7 +565,7 @@ int ntfs_read_compressed_block(struct page *page)
 	if (xpage >= max_page) {
 		kfree(bhs);
 		kfree(pages);
-		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 		ntfs_debug("Compressed read outside i_size - truncated?");
 		SetPageUptodate(page);
 		unlock_page(page);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 6cd08df..3c5550c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -607,8 +607,8 @@ do_next_page:
 					ntfs_submit_bh_for_read(bh);
 					*wait_bh++ = bh;
 				} else {
-					zero_user_page(page, bh_offset(bh),
-							blocksize, KM_USER0);
+					zero_user(page, bh_offset(bh),
+							blocksize);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -683,9 +683,8 @@ map_buffer_cached:
 						ntfs_submit_bh_for_read(bh);
 						*wait_bh++ = bh;
 					} else {
-						zero_user_page(page,
-							bh_offset(bh),
-							blocksize, KM_USER0);
+						zero_user(page, bh_offset(bh),
+								blocksize);
 						set_buffer_uptodate(bh);
 					}
 				}
@@ -703,8 +702,8 @@ map_buffer_cached:
 			 */
 			if (bh_end <= pos || bh_pos >= end) {
 				if (!buffer_uptodate(bh)) {
-					zero_user_page(page, bh_offset(bh),
-							blocksize, KM_USER0);
+					zero_user(page, bh_offset(bh),
+							blocksize);
 					set_buffer_uptodate(bh);
 				}
 				mark_buffer_dirty(bh);
@@ -743,8 +742,7 @@ map_buffer_cached:
 				if (!buffer_uptodate(bh))
 					set_buffer_uptodate(bh);
 			} else if (!buffer_uptodate(bh)) {
-				zero_user_page(page, bh_offset(bh), blocksize,
-						KM_USER0);
+				zero_user(page, bh_offset(bh), blocksize);
 				set_buffer_uptodate(bh);
 			}
 			continue;
@@ -868,8 +866,8 @@ rl_not_mapped_enoent:
 					if (!buffer_uptodate(bh))
 						set_buffer_uptodate(bh);
 				} else if (!buffer_uptodate(bh)) {
-					zero_user_page(page, bh_offset(bh),
-							blocksize, KM_USER0);
+					zero_user(page, bh_offset(bh),
+						blocksize);
 					set_buffer_uptodate(bh);
 				}
 				continue;
@@ -1128,8 +1126,8 @@ rl_not_mapped_enoent:
 
 				if (likely(bh_pos < initialized_size))
 					ofs = initialized_size - bh_pos;
-				zero_user_page(page, bh_offset(bh) + ofs,
-						blocksize - ofs, KM_USER0);
+				zero_user_segment(page, bh_offset(bh) + ofs,
+						blocksize);
 			}
 		} else /* if (unlikely(!buffer_uptodate(bh))) */
 			err = -EIO;
@@ -1269,8 +1267,8 @@ rl_not_mapped_enoent:
 				if (PageUptodate(page))
 					set_buffer_uptodate(bh);
 				else {
-					zero_user_page(page, bh_offset(bh),
-							blocksize, KM_USER0);
+					zero_user(page, bh_offset(bh),
+							blocksize);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -1330,7 +1328,7 @@ err_out:
 		len = PAGE_CACHE_SIZE;
 		if (len > bytes)
 			len = bytes;
-		zero_user_page(*pages, 0, len, KM_USER0);
+		zero_user(*pages, 0, len);
 	}
 	goto out;
 }
@@ -1451,7 +1449,7 @@ err_out:
 		len = PAGE_CACHE_SIZE;
 		if (len > bytes)
 			len = bytes;
-		zero_user_page(*pages, 0, len, KM_USER0);
+		zero_user(*pages, 0, len);
 	}
 	goto out;
 }
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 64713e1..447206eb 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5670,7 +5670,7 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 		mlog_errno(ret);
 
 	if (zero)
-		zero_user_page(page, from, to - from, KM_USER0);
+		zero_user_segment(page, from, to);
 
 	/*
 	 * Need to set the buffers we zero'd into uptodate
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index bc7b4cb..8224312 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -307,7 +307,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	 * XXX sys_readahead() seems to get that wrong?
 	 */
 	if (start >= i_size_read(inode)) {
-		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
+		zero_user(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 		ret = 0;
 		goto out_alloc;
@@ -869,7 +869,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 		if (block_start >= to)
 			break;
 
-		zero_user_page(page, block_start, bh->b_size, KM_USER0);
+		zero_user(page, block_start, bh->b_size);
 		set_buffer_uptodate(bh);
 		mark_buffer_dirty(bh);
 
@@ -1034,7 +1034,7 @@ static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to
 					start = max(from, block_start);
 					end = min(to, block_end);
 
-					zero_user_page(page, start, end - start, KM_USER0);
+					zero_user_segment(page, start, end);
 					set_buffer_uptodate(bh);
 				}
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 231fd5c..1953098 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2143,7 +2143,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
 		/* if we are not on a block boundary */
 		if (length) {
 			length = blocksize - length;
-			zero_user_page(page, offset, length, KM_USER0);
+			zero_user(page, offset, length);
 			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
 				mark_buffer_dirty(bh);
 			}
@@ -2367,7 +2367,7 @@ static int reiserfs_write_full_page(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
-		zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0);
+		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
 	}
 	bh = head;
 	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index d6a8ddd..6f614f3 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -155,7 +155,7 @@ xfs_iozero(
 		if (status)
 			break;
 
-		zero_user_page(page, offset, bytes, KM_USER0);
+		zero_user(page, offset, bytes);
 
 		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
 					page, fsdata);
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 1fcb003..61a5e5e 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -124,28 +124,40 @@ static inline void clear_highpage(struct page *page)
 	kunmap_atomic(kaddr, KM_USER0);
 }
 
-/*
- * Same but also flushes aliased cache contents to RAM.
- *
- * This must be a macro because KM_USER0 and friends aren't defined if
- * !CONFIG_HIGHMEM
- */
-#define zero_user_page(page, offset, size, km_type)		\
-	do {							\
-		void *kaddr;					\
-								\
-		BUG_ON((offset) + (size) > PAGE_SIZE);		\
-								\
-		kaddr = kmap_atomic(page, km_type);		\
-		memset((char *)kaddr + (offset), 0, (size));	\
-		flush_dcache_page(page);			\
-		kunmap_atomic(kaddr, (km_type));		\
-	} while (0)
+static inline void zero_user_segments(struct page *page,
+	unsigned start1, unsigned end1,
+	unsigned start2, unsigned end2)
+{
+	void *kaddr = kmap_atomic(page, KM_USER0);
+
+	BUG_ON(end1 > PAGE_SIZE || end2 > PAGE_SIZE);
+
+	if (end1 > start1)
+		memset(kaddr + start1, 0, end1 - start1);
+
+	if (end2 > start2)
+		memset(kaddr + start2, 0, end2 - start2);
+
+	kunmap_atomic(kaddr, KM_USER0);
+	flush_dcache_page(page);
+}
+
+static inline void zero_user_segment(struct page *page,
+	unsigned start, unsigned end)
+{
+	zero_user_segments(page, start, end, 0, 0);
+}
+
+static inline void zero_user(struct page *page,
+	unsigned start, unsigned size)
+{
+	zero_user_segments(page, start, start + size, 0, 0);
+}
 
 static inline void __deprecated memclear_highpage_flush(struct page *page,
 			unsigned int offset, unsigned int size)
 {
-	zero_user_page(page, offset, size, KM_USER0);
+	zero_user(page, offset, size);
 }
 
 #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f874ae8..0420a02 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 		else
 			return PTR_ERR(page);
 	}
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/truncate.c b/mm/truncate.c
index c3123b0..3855492 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
-	zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
+	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
 	if (PagePrivate(page))
 		do_invalidatepage(page, partial);
 }
-- 
cgit v1.1


From 48667e7a43c1a1e0ba743f93ae946f8cb34ff2f9 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:31 -0800
Subject: Move vmalloc_to_page() to mm/vmalloc.

We already have page table manipulation for vmalloc in vmalloc.c. Move the
vmalloc_to_page() function there as well.

Move the definitions for vmalloc related functions in mm.h to a newly created
section.  A better place would be vmalloc.h but mm.h is basic and may depend
on these functions.  An alternative would be to include vmalloc.h in mm.h
(like done for vmstat.h).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  6 ++++--
 mm/memory.c        | 40 ----------------------------------------
 mm/vmalloc.c       | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1bba678..1961056 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -231,6 +231,10 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
+/* Support for virtually mapped pages */
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+
 static inline struct page *compound_head(struct page *page)
 {
 	if (unlikely(PageTail(page)))
@@ -1089,8 +1093,6 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
-struct page *vmalloc_to_page(void *addr);
-unsigned long vmalloc_to_pfn(void *addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
diff --git a/mm/memory.c b/mm/memory.c
index d902d0e..1b8ca16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2618,46 +2618,6 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	return ret == len ? 0 : -1;
 }
 
-/* 
- * Map a vmalloc()-space virtual address to the physical page.
- */
-struct page * vmalloc_to_page(void * vmalloc_addr)
-{
-	unsigned long addr = (unsigned long) vmalloc_addr;
-	struct page *page = NULL;
-	pgd_t *pgd = pgd_offset_k(addr);
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-  
-	if (!pgd_none(*pgd)) {
-		pud = pud_offset(pgd, addr);
-		if (!pud_none(*pud)) {
-			pmd = pmd_offset(pud, addr);
-			if (!pmd_none(*pmd)) {
-				ptep = pte_offset_map(pmd, addr);
-				pte = *ptep;
-				if (pte_present(pte))
-					page = pte_page(pte);
-				pte_unmap(ptep);
-			}
-		}
-	}
-	return page;
-}
-
-EXPORT_SYMBOL(vmalloc_to_page);
-
-/*
- * Map a vmalloc()-space virtual address to the physical page frame number.
- */
-unsigned long vmalloc_to_pfn(void * vmalloc_addr)
-{
-	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
-}
-
-EXPORT_SYMBOL(vmalloc_to_pfn);
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af77e17..e4c59a3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 }
 EXPORT_SYMBOL_GPL(map_vm_area);
 
+/*
+ * Map a vmalloc()-space virtual address to the physical page.
+ */
+struct page *vmalloc_to_page(void *vmalloc_addr)
+{
+	unsigned long addr = (unsigned long) vmalloc_addr;
+	struct page *page = NULL;
+	pgd_t *pgd = pgd_offset_k(addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+
+	if (!pgd_none(*pgd)) {
+		pud = pud_offset(pgd, addr);
+		if (!pud_none(*pud)) {
+			pmd = pmd_offset(pud, addr);
+			if (!pmd_none(*pmd)) {
+				ptep = pte_offset_map(pmd, addr);
+				pte = *ptep;
+				if (pte_present(pte))
+					page = pte_page(pte);
+				pte_unmap(ptep);
+			}
+		}
+	}
+	return page;
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ */
+unsigned long vmalloc_to_pfn(void *vmalloc_addr)
+{
+	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
 static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
 					    unsigned long start, unsigned long end,
 					    int node, gfp_t gfp_mask)
-- 
cgit v1.1


From b3bdda02aa547a0753b4fdbc105e86ef9046b30b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:32 -0800
Subject: vmalloc: add const to void* parameters

Make vmalloc functions work the same way as kfree() and friends that
take a const void * argument.

[akpm@linux-foundation.org: fix consts, coding-style]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h      |  4 ++--
 include/linux/vmalloc.h |  6 +++---
 mm/nommu.c              |  8 ++++----
 mm/vmalloc.c            | 16 ++++++++--------
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1961056..a862a96 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -232,8 +232,8 @@ static inline int get_page_unless_zero(struct page *page)
 }
 
 /* Support for virtually mapped pages */
-struct page *vmalloc_to_page(void *addr);
-unsigned long vmalloc_to_pfn(void *addr);
+struct page *vmalloc_to_page(const void *addr);
+unsigned long vmalloc_to_pfn(const void *addr);
 
 static inline struct page *compound_head(struct page *page)
 {
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 89338b4..ce8e7da 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -45,11 +45,11 @@ extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
 				pgprot_t prot);
-extern void vfree(void *addr);
+extern void vfree(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
-extern void vunmap(void *addr);
+extern void vunmap(const void *addr);
 
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 							unsigned long pgoff);
@@ -71,7 +71,7 @@ extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 extern struct vm_struct *get_vm_area_node(unsigned long size,
 					  unsigned long flags, int node,
 					  gfp_t gfp_mask);
-extern struct vm_struct *remove_vm_area(void *addr);
+extern struct vm_struct *remove_vm_area(const void *addr);
 
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page ***pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index b989cb9..f3bfd01 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(get_user_pages);
 DEFINE_RWLOCK(vmlist_lock);
 struct vm_struct *vmlist;
 
-void vfree(void *addr)
+void vfree(const void *addr)
 {
 	kfree(addr);
 }
@@ -183,13 +183,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-struct page * vmalloc_to_page(void *addr)
+struct page *vmalloc_to_page(const void *addr)
 {
 	return virt_to_page(addr);
 }
 EXPORT_SYMBOL(vmalloc_to_page);
 
-unsigned long vmalloc_to_pfn(void *addr)
+unsigned long vmalloc_to_pfn(const void *addr)
 {
 	return page_to_pfn(virt_to_page(addr));
 }
@@ -267,7 +267,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_
 }
 EXPORT_SYMBOL(vmap);
 
-void vunmap(void *addr)
+void vunmap(const void *addr)
 {
 	BUG();
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e4c59a3..21abac2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(map_vm_area);
 /*
  * Map a vmalloc()-space virtual address to the physical page.
  */
-struct page *vmalloc_to_page(void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
@@ -198,7 +198,7 @@ EXPORT_SYMBOL(vmalloc_to_page);
 /*
  * Map a vmalloc()-space virtual address to the physical page frame number.
  */
-unsigned long vmalloc_to_pfn(void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 {
 	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
 }
@@ -306,7 +306,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
 }
 
 /* Caller must hold vmlist_lock */
-static struct vm_struct *__find_vm_area(void *addr)
+static struct vm_struct *__find_vm_area(const void *addr)
 {
 	struct vm_struct *tmp;
 
@@ -319,7 +319,7 @@ static struct vm_struct *__find_vm_area(void *addr)
 }
 
 /* Caller must hold vmlist_lock */
-static struct vm_struct *__remove_vm_area(void *addr)
+static struct vm_struct *__remove_vm_area(const void *addr)
 {
 	struct vm_struct **p, *tmp;
 
@@ -348,7 +348,7 @@ found:
  *	This function returns the found VM area, but using it is NOT safe
  *	on SMP machines, except for its size or flags.
  */
-struct vm_struct *remove_vm_area(void *addr)
+struct vm_struct *remove_vm_area(const void *addr)
 {
 	struct vm_struct *v;
 	write_lock(&vmlist_lock);
@@ -357,7 +357,7 @@ struct vm_struct *remove_vm_area(void *addr)
 	return v;
 }
 
-static void __vunmap(void *addr, int deallocate_pages)
+static void __vunmap(const void *addr, int deallocate_pages)
 {
 	struct vm_struct *area;
 
@@ -408,7 +408,7 @@ static void __vunmap(void *addr, int deallocate_pages)
  *
  *	Must not be called in interrupt context.
  */
-void vfree(void *addr)
+void vfree(const void *addr)
 {
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 1);
@@ -424,7 +424,7 @@ EXPORT_SYMBOL(vfree);
  *
  *	Must not be called in interrupt context.
  */
-void vunmap(void *addr)
+void vunmap(const void *addr)
 {
 	BUG_ON(in_interrupt());
 	__vunmap(addr, 0);
-- 
cgit v1.1


From 0b7a96114bd5991d355a1f1c1d3d9c0c9d9c1cfc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:33 -0800
Subject: i386: Resolve dependency of asm-i386/pgtable.h on highmem.h

pgtable.h does not include highmem.h but uses various constants from
highmem.h.  We cannot include highmem.h because highmem.h will in turn include
many other include files that also depend on pgtable.h

So move the definitions from highmem.h into pgtable.h.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-x86/highmem.h    | 6 ------
 include/asm-x86/pgtable_32.h | 8 ++++++++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h
index c25cfca..479767c 100644
--- a/include/asm-x86/highmem.h
+++ b/include/asm-x86/highmem.h
@@ -38,11 +38,6 @@ extern pte_t *pkmap_page_table;
  * easily, subsequent pte tables have to be allocated in one physical
  * chunk of RAM.
  */
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
 /*
  * Ordering is:
  *
@@ -58,7 +53,6 @@ extern pte_t *pkmap_page_table;
  * VMALLOC_START
  * high_memory
  */
-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
index 935630d..80dd438 100644
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -66,6 +66,14 @@ void paging_init(void);
 #define VMALLOC_OFFSET	(8*1024*1024)
 #define VMALLOC_START	(((unsigned long) high_memory + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
 #else
-- 
cgit v1.1


From 9e2779fa281cfda13ac060753d674bbcaa23367e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:34 -0800
Subject: is_vmalloc_addr(): Check if an address is within the vmalloc
 boundaries

Checking if an address is a vmalloc address is done in a couple of places.
Define a common version in mm.h and replace the other checks.

Again the include structures suck.  The definition of VMALLOC_START and
VMALLOC_END is not available in vmalloc.h since highmem.c cannot be included
there.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/cxgb3/cxgb3_offload.c |  4 +---
 fs/ntfs/malloc.h                  |  3 +--
 fs/proc/kcore.c                   |  2 +-
 fs/xfs/linux-2.6/kmem.c           |  3 +--
 fs/xfs/linux-2.6/xfs_buf.c        |  3 +--
 include/linux/mm.h                |  8 ++++++++
 mm/sparse.c                       | 10 +---------
 7 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c
index d48c396..901c824 100644
--- a/drivers/net/cxgb3/cxgb3_offload.c
+++ b/drivers/net/cxgb3/cxgb3_offload.c
@@ -1070,9 +1070,7 @@ void *cxgb_alloc_mem(unsigned long size)
  */
 void cxgb_free_mem(void *addr)
 {
-	unsigned long p = (unsigned long)addr;
-
-	if (p >= VMALLOC_START && p < VMALLOC_END)
+	if (is_vmalloc_addr(addr))
 		vfree(addr);
 	else
 		kfree(addr);
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index e38e402..cd0be3f 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,8 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-	if (likely(((unsigned long)addr < VMALLOC_START) ||
-			((unsigned long)addr >= VMALLOC_END ))) {
+	if (!is_vmalloc_addr(addr)) {
 		kfree(addr);
 		/* free_page((unsigned long)addr); */
 		return;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1be7308..7dd26e1 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -325,7 +325,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		if (m == NULL) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
-		} else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) {
+		} else if (is_vmalloc_addr((void *)start)) {
 			char * elf_buf;
 			struct vm_struct *m;
 			unsigned long curstart = start;
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index ed2b16d..e040f1c 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -92,8 +92,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
 void
 kmem_free(void *ptr, size_t size)
 {
-	if (((unsigned long)ptr < VMALLOC_START) ||
-	    ((unsigned long)ptr >= VMALLOC_END)) {
+	if (!is_vmalloc_addr(ptr)) {
 		kfree(ptr);
 	} else {
 		vfree(ptr);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index a49dd8d..0382c19 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -709,8 +709,7 @@ static inline struct page *
 mem_to_page(
 	void			*addr)
 {
-	if (((unsigned long)addr < VMALLOC_START) ||
-	    ((unsigned long)addr >= VMALLOC_END)) {
+	if ((!is_vmalloc_addr(addr))) {
 		return virt_to_page(addr);
 	} else {
 		return vmalloc_to_page(addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a862a96..f28a033 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -235,6 +235,14 @@ static inline int get_page_unless_zero(struct page *page)
 struct page *vmalloc_to_page(const void *addr);
 unsigned long vmalloc_to_pfn(const void *addr);
 
+/* Determine if an address is within the vmalloc range */
+static inline int is_vmalloc_addr(const void *x)
+{
+	unsigned long addr = (unsigned long)x;
+
+	return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+
 static inline struct page *compound_head(struct page *page)
 {
 	if (unlikely(PageTail(page)))
diff --git a/mm/sparse.c b/mm/sparse.c
index a2183cb..7859c80 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
 	return __kmalloc_section_memmap(nr_pages);
 }
 
-static int vaddr_in_vmalloc_area(void *addr)
-{
-	if (addr >= (void *)VMALLOC_START &&
-	    addr < (void *)VMALLOC_END)
-		return 1;
-	return 0;
-}
-
 static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
-	if (vaddr_in_vmalloc_area(memmap))
+	if (is_vmalloc_addr(memmap))
 		vfree(memmap);
 	else
 		free_pages((unsigned long)memmap,
-- 
cgit v1.1


From bf53d6f8fa467397a16de2a2500312ae26528d34 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:34 -0800
Subject: vmalloc: clean up page array indexing

The page array is repeatedly indexed both in vunmap and vmalloc_area_node().
Add a temporary variable to make it easier to read (and easier to patch
later).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 21abac2..83625b6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -384,8 +384,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
 		int i;
 
 		for (i = 0; i < area->nr_pages; i++) {
-			BUG_ON(!area->pages[i]);
-			__free_page(area->pages[i]);
+			struct page *page = area->pages[i];
+
+			BUG_ON(!page);
+			__free_page(page);
 		}
 
 		if (area->flags & VM_VPAGES)
@@ -489,15 +491,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	for (i = 0; i < area->nr_pages; i++) {
+		struct page *page;
+
 		if (node < 0)
-			area->pages[i] = alloc_page(gfp_mask);
+			page = alloc_page(gfp_mask);
 		else
-			area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
-		if (unlikely(!area->pages[i])) {
+			page = alloc_pages_node(node, gfp_mask, 0);
+
+		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
 			area->nr_pages = i;
 			goto fail;
 		}
+		area->pages[i] = page;
 	}
 
 	if (map_vm_area(area, prot, &pages))
-- 
cgit v1.1


From aec2c3ed01ed54d0cdf7f6b7c4be217c045ac5ea Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:35 -0800
Subject: VM: allow get_page_unless_zero on compound pages

Both slab defrag and the large blocksize patches need to ability to take
refcounts on compound pages.  May be useful in other places as well.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28a033..bcbe697 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -227,7 +227,7 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
-	VM_BUG_ON(PageCompound(page));
+	VM_BUG_ON(PageTail(page));
 	return atomic_inc_not_zero(&page->_count);
 }
 
-- 
cgit v1.1


From b98938c373117043598002f197200d7ed08acd49 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:28:36 -0800
Subject: bufferhead: revert constructor removal

The constructor for buffer_head slabs was removed recently.  We need the
constructor back in slab defrag in order to insure that slab objects always
have a definite state even before we allocated them.

I think we mistakenly merged the removal of the constuctor into a cleanup
patch.  You (ie: akpm) had a test that showed that the removal of the
constructor led to a small regression.  The prior state makes things easier
for slab defrag.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 1de9214..826baf4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3153,7 +3153,7 @@ static void recalc_bh_state(void)
 	
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
-	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep,
+	struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
 				set_migrateflags(gfp_flags, __GFP_RECLAIMABLE));
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
@@ -3241,12 +3241,24 @@ int bh_submit_read(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(bh_submit_read);
 
+static void
+init_buffer_head(struct kmem_cache *cachep, void *data)
+{
+	struct buffer_head *bh = data;
+
+	memset(bh, 0, sizeof(*bh));
+	INIT_LIST_HEAD(&bh->b_assoc_buffers);
+}
+
 void __init buffer_init(void)
 {
 	int nrpages;
 
-	bh_cachep = KMEM_CACHE(buffer_head,
-			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+	bh_cachep = kmem_cache_create("buffer_head",
+			sizeof(struct buffer_head), 0,
+				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
+				SLAB_MEM_SPREAD),
+				init_buffer_head);
 
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
-- 
cgit v1.1


From 75897d60a54ccee94253312107f941a83b5077cb Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Mon, 4 Feb 2008 22:28:36 -0800
Subject: hugetlb: allow sticky directory mount option

Allow sticky directory mount option for hugetlbfs.  This allows admin
to create a shared hugetlbfs mount point for multiple users, while
prevent accidental file deletion that users may step on each other.
It is similiar to default tmpfs mount option, or typical option used
on /tmp.

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <hermes@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 09ee07f..3b3cc28 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -768,7 +768,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 		case Opt_mode:
 			if (match_octal(&args[0], &option))
  				goto bad_val;
-			pconfig->mode = option & 0777U;
+			pconfig->mode = option & 01777U;
 			break;
 
 		case Opt_size: {
-- 
cgit v1.1


From c4cc6d07b2f465fbf5efd99bbe772a49c515f3f2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:40 -0800
Subject: swapin_readahead: excise NUMA bogosity

For three years swapin_readahead has been cluttered with fanciful CONFIG_NUMA
code, advancing addr, and stepping on to the next vma at the boundary, to line
up the mempolicy for each page allocation.

It _might_ be a good idea to allocate swap more according to vma layout; but
the fact is, that's not how we do it at all, 2.6 even less than 2.4: swap is
allocated as needed for pages as they sink to the bottom of the inactive LRUs.
 Sometimes that may match vma layout, but not so often that it's worth going
to these misleading vma->vm_next lengths: rip all that out.

Originally I intended to retain the incrementation of addr, but correct its
initial value: valid_swaphandles generally supplies an offset below the target
addr (this is readaround rather than readahead), but addr has not been
adjusted accordingly, so in the interleave case it has usually been allocating
the target page from the "wrong" node (though that may not matter very much).

But look at the equivalent shmem_swapin code: either by oversight or by
design, though it has all the apparatus for choosing a new mempolicy per page,
it uses the same idx throughout, choosing the same mempolicy and interleave
node for each page of the cluster.

Which is actually a much better strategy: each node has its own LRUs and its
own kswapd, so if you're betting on any particular relationship between swap
and node, the best bet is that nearby swap entries belong to pages from the
same node - even when the mempolicy of the target page is to interleave.  And
examining a map of nodes corresponding to swap entries on a numa=fake system
bears this out.  (We could later tweak swap allocation to make it even more
likely, but this patch is merely about removing cruft.)

So, neither adjust nor increment addr in swapin_readahead, and then
shmem_swapin can use it too; the pseudo-vma to pass policy need only be set up
once per cluster, and so few fields of pvma are used, let's skip the memset -
from shmem_alloc_page also.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 47 ++++++++++++++---------------------------------
 mm/shmem.c  | 43 ++++++++++++-------------------------------
 2 files changed, 26 insertions(+), 64 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1b8ca16..1d803c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1998,45 +1998,26 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
  */
 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
 {
-#ifdef CONFIG_NUMA
-	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
-#endif
-	int i, num;
-	struct page *new_page;
+	int nr_pages;
+	struct page *page;
 	unsigned long offset;
+	unsigned long end_offset;
 
 	/*
-	 * Get the number of handles we should do readahead io to.
+	 * Get starting offset for readaround, and number of pages to read.
+	 * Adjust starting address by readbehind (for NUMA interleave case)?
+	 * No, it's very unlikely that swap layout would follow vma layout,
+	 * more likely that neighbouring swap pages came from the same node:
+	 * so use the same "addr" to choose the same node for each swap read.
 	 */
-	num = valid_swaphandles(entry, &offset);
-	for (i = 0; i < num; offset++, i++) {
+	nr_pages = valid_swaphandles(entry, &offset);
+	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
 		/* Ok, do the async read-ahead now */
-		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
-							   offset), vma, addr);
-		if (!new_page)
+		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
+						vma, addr);
+		if (!page)
 			break;
-		page_cache_release(new_page);
-#ifdef CONFIG_NUMA
-		/*
-		 * Find the next applicable VMA for the NUMA policy.
-		 */
-		addr += PAGE_SIZE;
-		if (addr == 0)
-			vma = NULL;
-		if (vma) {
-			if (addr >= vma->vm_end) {
-				vma = next_vma;
-				next_vma = vma ? vma->vm_next : NULL;
-			}
-			if (vma && addr < vma->vm_start)
-				vma = NULL;
-		} else {
-			if (next_vma && addr >= next_vma->vm_start) {
-				vma = next_vma;
-				next_vma = vma->vm_next;
-			}
-		}
-#endif
+		page_cache_release(page);
 	}
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 51b3d6c..88c6685 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1025,53 +1025,34 @@ out:
 	return err;
 }
 
-static struct page *shmem_swapin_async(struct shared_policy *p,
+static struct page *shmem_swapin(struct shmem_inode_info *info,
 				       swp_entry_t entry, unsigned long idx)
 {
-	struct page *page;
 	struct vm_area_struct pvma;
+	struct page *page;
 
 	/* Create a pseudo vma that just contains the policy */
-	memset(&pvma, 0, sizeof(struct vm_area_struct));
-	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_start = 0;
 	pvma.vm_pgoff = idx;
-	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
+	pvma.vm_ops = NULL;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+	swapin_readahead(entry, 0, &pvma);
 	page = read_swap_cache_async(entry, &pvma, 0);
 	mpol_free(pvma.vm_policy);
 	return page;
 }
 
-static struct page *shmem_swapin(struct shmem_inode_info *info,
-				 swp_entry_t entry, unsigned long idx)
-{
-	struct shared_policy *p = &info->policy;
-	int i, num;
-	struct page *page;
-	unsigned long offset;
-
-	num = valid_swaphandles(entry, &offset);
-	for (i = 0; i < num; offset++, i++) {
-		page = shmem_swapin_async(p,
-				swp_entry(swp_type(entry), offset), idx);
-		if (!page)
-			break;
-		page_cache_release(page);
-	}
-	lru_add_drain();	/* Push any new pages onto the LRU now */
-	return shmem_swapin_async(p, entry, idx);
-}
-
-static struct page *
-shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
-		 unsigned long idx)
+static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
+					unsigned long idx)
 {
 	struct vm_area_struct pvma;
 	struct page *page;
 
-	memset(&pvma, 0, sizeof(struct vm_area_struct));
-	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+	/* Create a pseudo vma that just contains the policy */
+	pvma.vm_start = 0;
 	pvma.vm_pgoff = idx;
-	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_ops = NULL;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
 	page = alloc_page_vma(gfp, &pvma, 0);
 	mpol_free(pvma.vm_policy);
 	return page;
-- 
cgit v1.1


From 46017e954826ac59e91df76341a3f76b45467847 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:41 -0800
Subject: swapin_readahead: move and rearrange args

swapin_readahead has never sat well in mm/memory.c: move it to mm/swap_state.c
beside its kindred read_swap_cache_async.  Why were its args in a different
order?  rearrange them.  And since it was always followed by a
read_swap_cache_async of the target page, fold that in and return struct
page*.  Then CONFIG_SWAP=n no longer needs valid_swaphandles and
read_swap_cache_async stubs.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 19 +++++++------------
 mm/memory.c          | 45 +--------------------------------------------
 mm/shmem.c           |  6 ++----
 mm/swap_state.c      | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 60 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4f3838a..9fa1aef 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -158,9 +158,6 @@ struct swap_list_t {
 /* Swap 50% full? Release swapcache more aggressively.. */
 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
 
-/* linux/mm/memory.c */
-extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
@@ -230,9 +227,12 @@ extern int move_from_swap_cache(struct page *, unsigned long,
 		struct address_space *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
-extern struct page * lookup_swap_cache(swp_entry_t);
-extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
-					   unsigned long addr);
+extern struct page *lookup_swap_cache(swp_entry_t);
+extern struct page *read_swap_cache_async(swp_entry_t,
+			struct vm_area_struct *vma, unsigned long addr);
+extern struct page *swapin_readahead(swp_entry_t,
+			struct vm_area_struct *vma, unsigned long addr);
+
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
@@ -306,7 +306,7 @@ static inline void swap_free(swp_entry_t swp)
 {
 }
 
-static inline struct page *read_swap_cache_async(swp_entry_t swp,
+static inline struct page *swapin_readahead(swp_entry_t swp,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	return NULL;
@@ -317,11 +317,6 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp)
 	return NULL;
 }
 
-static inline int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
-	return 0;
-}
-
 #define can_share_swap_page(p)			(page_mapcount(p) == 1)
 
 static inline int move_to_swap_cache(struct page *page, swp_entry_t entry)
diff --git a/mm/memory.c b/mm/memory.c
index 1d803c2..ccc9403 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1980,48 +1980,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	return 0;
 }
 
-/**
- * swapin_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
- * @addr: address to start
- * @vma: user vma this addresses belong to
- *
- * Primitive swap readahead code. We simply read an aligned block of
- * (1 << page_cluster) entries in the swap area. This method is chosen
- * because it doesn't cost us any seek time.  We also make sure to queue
- * the 'original' request together with the readahead ones...
- *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
- */
-void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
-{
-	int nr_pages;
-	struct page *page;
-	unsigned long offset;
-	unsigned long end_offset;
-
-	/*
-	 * Get starting offset for readaround, and number of pages to read.
-	 * Adjust starting address by readbehind (for NUMA interleave case)?
-	 * No, it's very unlikely that swap layout would follow vma layout,
-	 * more likely that neighbouring swap pages came from the same node:
-	 * so use the same "addr" to choose the same node for each swap read.
-	 */
-	nr_pages = valid_swaphandles(entry, &offset);
-	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
-		/* Ok, do the async read-ahead now */
-		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-						vma, addr);
-		if (!page)
-			break;
-		page_cache_release(page);
-	}
-	lru_add_drain();	/* Push any new pages onto the LRU now */
-}
-
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2049,8 +2007,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		grab_swap_token(); /* Contend for token _before_ read-in */
- 		swapin_readahead(entry, address, vma);
- 		page = read_swap_cache_async(entry, vma, address);
+		page = swapin_readahead(entry, vma, address);
 		if (!page) {
 			/*
 			 * Back out if somebody else faulted in this pte
diff --git a/mm/shmem.c b/mm/shmem.c
index 88c6685..3a22a8f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1036,8 +1036,7 @@ static struct page *shmem_swapin(struct shmem_inode_info *info,
 	pvma.vm_pgoff = idx;
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
-	swapin_readahead(entry, 0, &pvma);
-	page = read_swap_cache_async(entry, &pvma, 0);
+	page = swapin_readahead(entry, &pvma, 0);
 	mpol_free(pvma.vm_policy);
 	return page;
 }
@@ -1067,8 +1066,7 @@ static inline int shmem_parse_mpol(char *value, int *policy,
 static inline struct page *
 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 {
-	swapin_readahead(entry, 0, NULL);
-	return read_swap_cache_async(entry, NULL, 0);
+	return swapin_readahead(entry, NULL, 0);
 }
 
 static inline struct page *
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b526356..668a804 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
@@ -368,3 +369,49 @@ struct page *read_swap_cache_async(swp_entry_t entry,
 		page_cache_release(new_page);
 	return found_page;
 }
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @vma: user vma this address belongs to
+ * @addr: target address for mempolicy
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time.  We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ */
+struct page *swapin_readahead(swp_entry_t entry,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	int nr_pages;
+	struct page *page;
+	unsigned long offset;
+	unsigned long end_offset;
+
+	/*
+	 * Get starting offset for readaround, and number of pages to read.
+	 * Adjust starting address by readbehind (for NUMA interleave case)?
+	 * No, it's very unlikely that swap layout would follow vma layout,
+	 * more likely that neighbouring swap pages came from the same node:
+	 * so use the same "addr" to choose the same node for each swap read.
+	 */
+	nr_pages = valid_swaphandles(entry, &offset);
+	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
+		/* Ok, do the async read-ahead now */
+		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
+						vma, addr);
+		if (!page)
+			break;
+		page_cache_release(page);
+	}
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+	return read_swap_cache_async(entry, vma, addr);
+}
-- 
cgit v1.1


From 02098feaa42b2e0087fbbe6c6ab9a23e4653b16a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:42 -0800
Subject: swapin needs gfp_mask for loop on tmpfs

Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.

In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high.  But now, with the new aops, it has
become easy to hang on one machine.

Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).

When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or GFP_FS is unset; but when tmfps swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.

So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  6 +++---
 mm/memory.c          |  3 ++-
 mm/shmem.c           | 28 ++++++++++++++--------------
 mm/swap_state.c      | 18 +++++++++---------
 mm/swapfile.c        |  3 ++-
 5 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 9fa1aef..16fd120 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -228,9 +228,9 @@ extern int move_from_swap_cache(struct page *, unsigned long,
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
-extern struct page *read_swap_cache_async(swp_entry_t,
+extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
-extern struct page *swapin_readahead(swp_entry_t,
+extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
 
 /* linux/mm/swapfile.c */
@@ -306,7 +306,7 @@ static inline void swap_free(swp_entry_t swp)
 {
 }
 
-static inline struct page *swapin_readahead(swp_entry_t swp,
+static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	return NULL;
diff --git a/mm/memory.c b/mm/memory.c
index ccc9403..bc13775 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2007,7 +2007,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		grab_swap_token(); /* Contend for token _before_ read-in */
-		page = swapin_readahead(entry, vma, address);
+		page = swapin_readahead(entry,
+					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
 			/*
 			 * Back out if somebody else faulted in this pte
diff --git a/mm/shmem.c b/mm/shmem.c
index 3a22a8f..55b696a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1025,8 +1025,8 @@ out:
 	return err;
 }
 
-static struct page *shmem_swapin(struct shmem_inode_info *info,
-				       swp_entry_t entry, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+			struct shmem_inode_info *info, unsigned long idx)
 {
 	struct vm_area_struct pvma;
 	struct page *page;
@@ -1036,13 +1036,13 @@ static struct page *shmem_swapin(struct shmem_inode_info *info,
 	pvma.vm_pgoff = idx;
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
-	page = swapin_readahead(entry, &pvma, 0);
+	page = swapin_readahead(entry, gfp, &pvma, 0);
 	mpol_free(pvma.vm_policy);
 	return page;
 }
 
-static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
-					unsigned long idx)
+static struct page *shmem_alloc_page(gfp_t gfp,
+			struct shmem_inode_info *info, unsigned long idx)
 {
 	struct vm_area_struct pvma;
 	struct page *page;
@@ -1063,14 +1063,14 @@ static inline int shmem_parse_mpol(char *value, int *policy,
 	return 1;
 }
 
-static inline struct page *
-shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+			struct shmem_inode_info *info, unsigned long idx)
 {
-	return swapin_readahead(entry, NULL, 0);
+	return swapin_readahead(entry, gfp, NULL, 0);
 }
 
-static inline struct page *
-shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
+static inline struct page *shmem_alloc_page(gfp_t gfp,
+			struct shmem_inode_info *info, unsigned long idx)
 {
 	return alloc_page(gfp);
 }
@@ -1093,6 +1093,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 	struct page *swappage;
 	swp_entry_t *entry;
 	swp_entry_t swap;
+	gfp_t gfp;
 	int error;
 
 	if (idx >= SHMEM_MAX_INDEX)
@@ -1117,6 +1118,7 @@ repeat:
 	error = 0;
 	if (sgp == SGP_QUICK)
 		goto failed;
+	gfp = mapping_gfp_mask(mapping);
 
 	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
@@ -1139,7 +1141,7 @@ repeat:
 				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
-			swappage = shmem_swapin(info, swap, idx);
+			swappage = shmem_swapin(swap, gfp, info, idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
 				entry = shmem_swp_alloc(info, idx, sgp);
@@ -1251,9 +1253,7 @@ repeat:
 
 		if (!filepage) {
 			spin_unlock(&info->lock);
-			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
-						    info,
-						    idx);
+			filepage = shmem_alloc_page(gfp, info, idx);
 			if (!filepage) {
 				shmem_unacct_blocks(info->flags, 1);
 				shmem_free_blocks(inode, 1);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 668a804..e787564 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -96,7 +96,8 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 	return error;
 }
 
-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+static int add_to_swap_cache(struct page *page, swp_entry_t entry,
+				gfp_t gfp_mask)
 {
 	int error;
 
@@ -106,7 +107,7 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry)
 		return -ENOENT;
 	}
 	SetPageLocked(page);
-	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
+	error = __add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL);
 	/*
 	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
 	 */
@@ -318,7 +319,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
  * A failure return means that either the page allocation failed or that
  * the swap entry is no longer in use.
  */
-struct page *read_swap_cache_async(swp_entry_t entry,
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *found_page, *new_page = NULL;
@@ -338,8 +339,7 @@ struct page *read_swap_cache_async(swp_entry_t entry,
 		 * Get a new page to read into from swap.
 		 */
 		if (!new_page) {
-			new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-								vma, addr);
+			new_page = alloc_page_vma(gfp_mask, vma, addr);
 			if (!new_page)
 				break;		/* Out of memory */
 		}
@@ -354,7 +354,7 @@ struct page *read_swap_cache_async(swp_entry_t entry,
 		 * the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
-		err = add_to_swap_cache(new_page, entry);
+		err = add_to_swap_cache(new_page, entry, gfp_mask);
 		if (!err) {
 			/*
 			 * Initiate read into locked page and return.
@@ -388,7 +388,7 @@ struct page *read_swap_cache_async(swp_entry_t entry,
  *
  * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
  */
-struct page *swapin_readahead(swp_entry_t entry,
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	int nr_pages;
@@ -407,11 +407,11 @@ struct page *swapin_readahead(swp_entry_t entry,
 	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-						vma, addr);
+						gfp_mask, vma, addr);
 		if (!page)
 			break;
 		page_cache_release(page);
 	}
 	lru_add_drain();	/* Push any new pages onto the LRU now */
-	return read_swap_cache_async(entry, vma, addr);
+	return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648..ab93505 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -730,7 +730,8 @@ static int try_to_unuse(unsigned int type)
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-		page = read_swap_cache_async(entry, NULL, 0);
+		page = read_swap_cache_async(entry,
+					GFP_HIGHUSER_MOVABLE, NULL, 0);
 		if (!page) {
 			/*
 			 * Either swap_duplicate() failed because entry
-- 
cgit v1.1


From 27d54b398ec0edea0e7417f003171017300e0efc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:43 -0800
Subject: shmem: SGP_QUICK and SGP_FAULT redundant

Remove SGP_QUICK from the sgp_type enum: it was for shmem_populate and has no
users now.  Remove SGP_FAULT from the enum: SGP_CACHE does just as well (and
shmem_getpage is about to return with page always locked).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 55b696a..20cefe1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,11 +78,9 @@
 
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
-	SGP_QUICK,	/* don't try more than file page cache lookup */
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_WRITE,	/* may exceed i_size, may allocate page */
-	SGP_FAULT,	/* same as SGP_CACHE, return with page locked */
 };
 
 static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -1116,8 +1114,6 @@ repeat:
 	if (filepage && PageUptodate(filepage))
 		goto done;
 	error = 0;
-	if (sgp == SGP_QUICK)
-		goto failed;
 	gfp = mapping_gfp_mask(mapping);
 
 	spin_lock(&info->lock);
@@ -1292,7 +1288,7 @@ repeat:
 done:
 	if (*pagep != filepage) {
 		*pagep = filepage;
-		if (sgp != SGP_FAULT)
+		if (sgp != SGP_CACHE)
 			unlock_page(filepage);
 
 	}
@@ -1315,7 +1311,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
+	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
 	if (error)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 
-- 
cgit v1.1


From d3602444e1e3485890eea5f61366e19a287c00c4 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:44 -0800
Subject: shmem_getpage return page locked

In the new aops, write_begin is supposed to return the page locked: though
I've seen no ill effects, that's been overlooked in the case of
shmem_write_begin, and should be fixed.  Then shmem_write_end must unlock the
page: do so _after_ updating i_size, as we found to be important in other
filesystems (though since shmem pages don't go the usual writeback route, they
never suffered from that corruption).

For shmem_write_begin to return the page locked, we need shmem_getpage to
return the page locked in SGP_WRITE case as well as SGP_CACHE case: let's
simplify the interface and return it locked even when SGP_READ.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 20cefe1..43d0719 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -729,6 +729,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 				(void) shmem_getpage(inode,
 					attr->ia_size>>PAGE_CACHE_SHIFT,
 						&page, SGP_READ, NULL);
+				if (page)
+					unlock_page(page);
 			}
 			/*
 			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
@@ -1286,12 +1288,7 @@ repeat:
 		SetPageUptodate(filepage);
 	}
 done:
-	if (*pagep != filepage) {
-		*pagep = filepage;
-		if (sgp != SGP_CACHE)
-			unlock_page(filepage);
-
-	}
+	*pagep = filepage;
 	return 0;
 
 failed:
@@ -1469,12 +1466,13 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+
+	unlock_page(page);
 	set_page_dirty(page);
 	page_cache_release(page);
 
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
-
 	return copied;
 }
 
@@ -1529,6 +1527,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 		if (err)
 			break;
 
+		unlock_page(page);
 		left = bytes;
 		if (PageHighMem(page)) {
 			volatile unsigned char dummy;
@@ -1610,6 +1609,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 				desc->error = 0;
 			break;
 		}
+		if (page)
+			unlock_page(page);
 
 		/*
 		 * We must evaluate after, since reads (unlike writes)
@@ -1899,6 +1900,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 			iput(inode);
 			return error;
 		}
+		unlock_page(page);
 		inode->i_op = &shmem_symlink_inode_operations;
 		kaddr = kmap_atomic(page, KM_USER0);
 		memcpy(kaddr, symname, len);
@@ -1926,6 +1928,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct page *page = NULL;
 	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+	if (page)
+		unlock_page(page);
 	return page;
 }
 
-- 
cgit v1.1


From 5402b976ae0be96b3a32f3508ab7308c380d6477 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:44 -0800
Subject: shmem_file_write is redundant

With the old aops, writing to a tmpfs file had to use its own special method:
the generic method would pass in a fresh page to prepare_write when the right
page was there in swapcache - which was inefficient to handle, even once we'd
concocted the code to handle it.

With the new aops, the generic method uses shmem_write_end, which lets
shmem_getpage find the right page: so now abandon shmem_file_write in favour
of the generic method.  Yes, that does do several things that tmpfs hasn't
really needed (notably balance_dirty_pages_ratelimited, which ramfs also
calls); but more use of common code is preferable.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 109 ++-----------------------------------------------------------
 1 file changed, 3 insertions(+), 106 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 43d0719..5dfe790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1106,7 +1106,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 	 * Normally, filepage is NULL on entry, and either found
 	 * uptodate immediately, or allocated and zeroed, or read
 	 * in under swappage, which is then assigned to filepage.
-	 * But shmem_readpage and shmem_write_begin pass in a locked
+	 * But shmem_readpage (required for splice) passes in a locked
 	 * filepage, which may be found not uptodate by other callers
 	 * too, and may need to be copied from the swappage read in.
 	 */
@@ -1476,110 +1476,6 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
-static ssize_t
-shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-{
-	struct inode	*inode = file->f_path.dentry->d_inode;
-	loff_t		pos;
-	unsigned long	written;
-	ssize_t		err;
-
-	if ((ssize_t) count < 0)
-		return -EINVAL;
-
-	if (!access_ok(VERIFY_READ, buf, count))
-		return -EFAULT;
-
-	mutex_lock(&inode->i_mutex);
-
-	pos = *ppos;
-	written = 0;
-
-	err = generic_write_checks(file, &pos, &count, 0);
-	if (err || !count)
-		goto out;
-
-	err = remove_suid(file->f_path.dentry);
-	if (err)
-		goto out;
-
-	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-
-	do {
-		struct page *page = NULL;
-		unsigned long bytes, index, offset;
-		char *kaddr;
-		int left;
-
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		/*
-		 * We don't hold page lock across copy from user -
-		 * what would it guard against? - so no deadlock here.
-		 * But it still may be a good idea to prefault below.
-		 */
-
-		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
-		if (err)
-			break;
-
-		unlock_page(page);
-		left = bytes;
-		if (PageHighMem(page)) {
-			volatile unsigned char dummy;
-			__get_user(dummy, buf);
-			__get_user(dummy, buf + bytes - 1);
-
-			kaddr = kmap_atomic(page, KM_USER0);
-			left = __copy_from_user_inatomic(kaddr + offset,
-							buf, bytes);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		if (left) {
-			kaddr = kmap(page);
-			left = __copy_from_user(kaddr + offset, buf, bytes);
-			kunmap(page);
-		}
-
-		written += bytes;
-		count -= bytes;
-		pos += bytes;
-		buf += bytes;
-		if (pos > inode->i_size)
-			i_size_write(inode, pos);
-
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-
-		if (left) {
-			pos -= left;
-			written -= left;
-			err = -EFAULT;
-			break;
-		}
-
-		/*
-		 * Our dirty pages are not counted in nr_dirty,
-		 * and we do not attempt to balance dirty pages.
-		 */
-
-		cond_resched();
-	} while (count);
-
-	*ppos = pos;
-	if (written)
-		err = written;
-out:
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -2354,7 +2250,8 @@ static const struct file_operations shmem_file_operations = {
 #ifdef CONFIG_TMPFS
 	.llseek		= generic_file_llseek,
 	.read		= shmem_file_read,
-	.write		= shmem_file_write,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.fsync		= simple_sync_file,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
-- 
cgit v1.1


From 8952898b0d25223f38daf46b86156fd1c4d17ad0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:45 -0800
Subject: swapin: fix valid_swaphandles defect

valid_swaphandles is supposed to do a quick pass over the swap map entries
neigbouring the entry which swapin_readahead is targetting, to determine for
it a range worth reading all together.  But since it always starts its search
from the beginning of the swap "cluster", a reject (free entry) there
immediately curtails the readaround, and every swapin_readahead from that
cluster is for just a single page.  Instead scan forwards and backwards around
the target entry.

Use better names for some variables: a swap_info pointer is usually called
"si" not "swapdev".  And at the end, if only the target page should be read,
return count of 0 to disable readaround, to avoid the unnecessarily repeated
call to read_swap_cache_async.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ab93505..f5ba723 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1769,31 +1769,48 @@ get_swap_info_struct(unsigned type)
  */
 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 {
+	struct swap_info_struct *si;
 	int our_page_cluster = page_cluster;
-	int ret = 0, i = 1 << our_page_cluster;
-	unsigned long toff;
-	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+	pgoff_t target, toff;
+	pgoff_t base, end;
+	int nr_pages = 0;
 
 	if (!our_page_cluster)	/* no readahead */
 		return 0;
-	toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
-	if (!toff)		/* first page is swap header */
-		toff++, i--;
-	*offset = toff;
+
+	si = &swap_info[swp_type(entry)];
+	target = swp_offset(entry);
+	base = (target >> our_page_cluster) << our_page_cluster;
+	end = base + (1 << our_page_cluster);
+	if (!base)		/* first page is swap header */
+		base++;
 
 	spin_lock(&swap_lock);
-	do {
-		/* Don't read-ahead past the end of the swap area */
-		if (toff >= swapdev->max)
+	if (end > si->max)	/* don't go beyond end of map */
+		end = si->max;
+
+	/* Count contiguous allocated slots above our target */
+	for (toff = target; ++toff < end; nr_pages++) {
+		/* Don't read in free or bad pages */
+		if (!si->swap_map[toff])
+			break;
+		if (si->swap_map[toff] == SWAP_MAP_BAD)
 			break;
+	}
+	/* Count contiguous allocated slots below our target */
+	for (toff = target; --toff >= base; nr_pages++) {
 		/* Don't read in free or bad pages */
-		if (!swapdev->swap_map[toff])
+		if (!si->swap_map[toff])
 			break;
-		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+		if (si->swap_map[toff] == SWAP_MAP_BAD)
 			break;
-		toff++;
-		ret++;
-	} while (--i);
+	}
 	spin_unlock(&swap_lock);
-	return ret;
+
+	/*
+	 * Indicate starting offset, and return number of pages to get:
+	 * if only 1, say 0, since there's then no readahead to be done.
+	 */
+	*offset = ++toff;
+	return nr_pages? ++nr_pages: 0;
 }
-- 
cgit v1.1


From 2e441889c38fe1b6ef6b963e6993076aa120176c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:46 -0800
Subject: swapoff: scan ptes preemptibly

Provided that CONFIG_HIGHPTE is not set, unuse_pte_range can reduce latency
in swapoff by scanning the page table preemptibly: so long as unuse_pte is
careful to recheck that entry under pte lock.

(To tell the truth, this patch was not inspired by any cries for lower
latency here: rather, this restructuring permits a future memory controller
patch to allocate with GFP_KERNEL in unuse_pte, where before it could not.
But it would be wrong to tuck this change away inside a memcgroup patch.)

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index f5ba723..14bc4f2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -506,9 +506,19 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	spinlock_t *ptl;
+	pte_t *pte;
+	int found = 1;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
+		found = 0;
+		goto out;
+	}
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +530,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+out:
+	pte_unmap_unlock(pte, ptl);
+	return found;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,22 +541,33 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
-	spinlock_t *ptl;
 	int found = 0;
 
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	/*
+	 * We don't actually need pte lock while scanning for swp_pte: since
+	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+	 * page table while we're scanning; though it could get zapped, and on
+	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+	 * of unmatched parts which look like swp_pte, so unuse_pte must
+	 * recheck under pte lock.  Scanning without pte lock lets it be
+	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+	 */
+	pte = pte_offset_map(pmd, addr);
 	do {
 		/*
 		 * swapoff spends a _lot_ of time in this loop!
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
-			break;
+			pte_unmap(pte);
+			found = unuse_pte(vma, pmd, addr, entry, page);
+			if (found)
+				goto out;
+			pte = pte_offset_map(pmd, addr);
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap(pte - 1);
+out:
 	return found;
 }
 
-- 
cgit v1.1


From 5b04c6890f0dc7ea6c85b9adebc883c55c667d97 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 4 Feb 2008 22:28:47 -0800
Subject: shmem: factor out sbi->free_inodes manipulations

The shmem_sb_info structure has a number of free_inodes. This
value is altered in appropriate places under spinlock and with
the sbi->max_inodes != 0 check.

Consolidate these manipulations into two helpers.

This is minus 42 bytes of shmem.o and minus 4 :) lines of code.

[akpm@linux-foundation.org: fix error return values]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 77 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 5dfe790..ce64b66 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -205,6 +205,31 @@ static void shmem_free_blocks(struct inode *inode, long pages)
 	}
 }
 
+static int shmem_reserve_inode(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	if (sbinfo->max_inodes) {
+		spin_lock(&sbinfo->stat_lock);
+		if (!sbinfo->free_inodes) {
+			spin_unlock(&sbinfo->stat_lock);
+			return -ENOSPC;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+	return 0;
+}
+
+static void shmem_free_inode(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	if (sbinfo->max_inodes) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
 /*
  * shmem_recalc_inode - recalculate the size of an inode
  *
@@ -762,7 +787,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 
 static void shmem_delete_inode(struct inode *inode)
 {
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_op->truncate == shmem_truncate) {
@@ -777,11 +801,7 @@ static void shmem_delete_inode(struct inode *inode)
 		}
 	}
 	BUG_ON(inode->i_blocks);
-	if (sbinfo->max_inodes) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
+	shmem_free_inode(inode->i_sb);
 	clear_inode(inode);
 }
 
@@ -1371,15 +1391,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
-	if (sbinfo->max_inodes) {
-		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return NULL;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
+	if (shmem_reserve_inode(sb))
+		return NULL;
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -1423,11 +1436,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 						NULL);
 			break;
 		}
-	} else if (sbinfo->max_inodes) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
+	} else
+		shmem_free_inode(sb);
 	return inode;
 }
 
@@ -1670,22 +1680,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
-	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	int ret;
 
 	/*
 	 * No ordinary (disk based) filesystem counts links as inodes;
 	 * but each new link needs a new dentry, pinning lowmem, and
 	 * tmpfs dentries cannot be pruned until they are unlinked.
 	 */
-	if (sbinfo->max_inodes) {
-		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return -ENOSPC;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
+	ret = shmem_reserve_inode(inode->i_sb);
+	if (ret)
+		goto out;
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1693,21 +1697,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 	atomic_inc(&inode->i_count);	/* New dentry reference */
 	dget(dentry);		/* Extra pinning count for the created dentry */
 	d_instantiate(dentry, inode);
-	return 0;
+out:
+	return ret;
 }
 
 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 
-	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
-		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-		if (sbinfo->max_inodes) {
-			spin_lock(&sbinfo->stat_lock);
-			sbinfo->free_inodes++;
-			spin_unlock(&sbinfo->stat_lock);
-		}
-	}
+	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+		shmem_free_inode(inode->i_sb);
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-- 
cgit v1.1


From 818db35992c249dc32c1d86daf7d533fb0952f5d Mon Sep 17 00:00:00 2001
From: Michael Marineau <mike@marineau.org>
Date: Mon, 4 Feb 2008 22:28:48 -0800
Subject: tmpfs: fix mounts when size is less than the page size

When tmpfs is mounted with a size less than one page, the number of blocks
is set to 0 which makes the tmpfs mount unlimited.  This can lead to a
quick and surprising death if someone typos a tmpfs mount command and
writes too much.

tmpfs can still be mounted as unlimited if size or nr_blocks is exactly 0,
as Documentation/filesystems/tmpfs.txt says.

Hugh: do this by rounding size up instead of down in all cases: which
slightly expands other odd-sized tmpfs mounts, but in a consistent way.

Signed-off-by: Michael Marineau <mike@marineau.org>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index ce64b66..7be9434 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2012,7 +2012,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 			}
 			if (*rest)
 				goto bad_val;
-			*blocks = size >> PAGE_CACHE_SHIFT;
+			*blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
 		} else if (!strcmp(this_char,"nr_blocks")) {
 			*blocks = memparse(value,&rest);
 			if (*rest)
-- 
cgit v1.1


From bb63be0a091c512fb566ee235eb8320d5831b6e2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:49 -0800
Subject: tmpfs: move swap_state stats update

Both unionfs and memcgroups pose challenges to tmpfs and shmem.  To help fix,
it's best to move the swap swizzling functions from swap_state.c to shmem.c.
As a preliminary to that, move swap stats updating down into
__add_to_swap_cache, which will remain internal to swap_state.c.

Well, actually, just move down the incrementation of add_total: remove
noent_race and exist_race completely, they are relics of my 2.4.11 testing.
Alt-SysRq-m users will be thrilled if 2.6.25 is at last free of "race M+N"s.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index e787564..18fce36 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -52,16 +52,13 @@ static struct {
 	unsigned long del_total;
 	unsigned long find_success;
 	unsigned long find_total;
-	unsigned long noent_race;
-	unsigned long exist_race;
 } swap_cache_info;
 
 void show_swap_cache_info(void)
 {
-	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
+	printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
 		swap_cache_info.add_total, swap_cache_info.del_total,
-		swap_cache_info.find_success, swap_cache_info.find_total,
-		swap_cache_info.noent_race, swap_cache_info.exist_race);
+		swap_cache_info.find_success, swap_cache_info.find_total);
 	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
 	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
@@ -89,6 +86,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			set_page_private(page, entry.val);
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
+			INC_CACHE_INFO(add_total);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
@@ -102,10 +100,9 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry,
 	int error;
 
 	BUG_ON(PageLocked(page));
-	if (!swap_duplicate(entry)) {
-		INC_CACHE_INFO(noent_race);
+	if (!swap_duplicate(entry))
 		return -ENOENT;
-	}
+
 	SetPageLocked(page);
 	error = __add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL);
 	/*
@@ -114,11 +111,8 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry,
 	if (error) {
 		ClearPageLocked(page);
 		swap_free(entry);
-		if (error == -EEXIST)
-			INC_CACHE_INFO(exist_race);
 		return error;
 	}
-	INC_CACHE_INFO(add_total);
 	return 0;
 }
 
@@ -178,11 +172,9 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 		case 0:				/* Success */
 			SetPageUptodate(page);
 			SetPageDirty(page);
-			INC_CACHE_INFO(add_total);
 			return 1;
 		case -EEXIST:
 			/* Raced with "speculative" read_swap_cache_async */
-			INC_CACHE_INFO(exist_race);
 			swap_free(entry);
 			continue;
 		default:
@@ -225,9 +217,7 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 		if (!swap_duplicate(entry))
 			BUG();
 		SetPageDirty(page);
-		INC_CACHE_INFO(add_total);
-	} else if (err == -EEXIST)
-		INC_CACHE_INFO(exist_race);
+	}
 	return err;
 }
 
-- 
cgit v1.1


From f000944d03a5b74ab3c92b2fcdf0e944cc898065 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:49 -0800
Subject: tmpfs: shuffle add_to_swap_caches

add_to_swap_cache doesn't amount to much: merge it into its sole caller
read_swap_cache_async.  But we'll be needing to call __add_to_swap_cache from
shmem.c, so promote it to the new add_to_swap_cache.  Both were static, so
there's no interface confusion to worry about.

And lose that inappropriate "Anon pages are already on the LRU" comment in the
merging: they're not already on the LRU, as Nick Piggin noticed.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
No-problems-with: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 53 +++++++++++++++++++----------------------------------
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 18fce36..c75eda2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -64,10 +64,10 @@ void show_swap_cache_info(void)
 }
 
 /*
- * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
+static int add_to_swap_cache(struct page *page, swp_entry_t entry,
 			       gfp_t gfp_mask)
 {
 	int error;
@@ -94,28 +94,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 	return error;
 }
 
-static int add_to_swap_cache(struct page *page, swp_entry_t entry,
-				gfp_t gfp_mask)
-{
-	int error;
-
-	BUG_ON(PageLocked(page));
-	if (!swap_duplicate(entry))
-		return -ENOENT;
-
-	SetPageLocked(page);
-	error = __add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL);
-	/*
-	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
-	 */
-	if (error) {
-		ClearPageLocked(page);
-		swap_free(entry);
-		return error;
-	}
-	return 0;
-}
-
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
@@ -165,7 +143,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 		/*
 		 * Add it to the swap cache and mark it dirty
 		 */
-		err = __add_to_swap_cache(page, entry,
+		err = add_to_swap_cache(page, entry,
 				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
 		switch (err) {
@@ -210,7 +188,7 @@ void delete_from_swap_cache(struct page *page)
  */
 int move_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
+	int err = add_to_swap_cache(page, entry, GFP_ATOMIC);
 	if (!err) {
 		remove_from_page_cache(page);
 		page_cache_release(page);	/* pagecache ref */
@@ -335,16 +313,21 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		}
 
 		/*
+		 * Swap entry may have been freed since our caller observed it.
+		 */
+		if (!swap_duplicate(entry))
+			break;
+
+		/*
 		 * Associate the page with swap entry in the swap cache.
-		 * May fail (-ENOENT) if swap entry has been freed since
-		 * our caller observed it.  May fail (-EEXIST) if there
-		 * is already a page associated with this entry in the
-		 * swap cache: added by a racing read_swap_cache_async,
-		 * or by try_to_swap_out (or shmem_writepage) re-using
-		 * the just freed swap entry for an existing page.
+		 * May fail (-EEXIST) if there is already a page associated
+		 * with this entry in the swap cache: added by a racing
+		 * read_swap_cache_async, or add_to_swap or shmem_writepage
+		 * re-using the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
-		err = add_to_swap_cache(new_page, entry, gfp_mask);
+		SetPageLocked(new_page);
+		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
 		if (!err) {
 			/*
 			 * Initiate read into locked page and return.
@@ -353,7 +336,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
-	} while (err != -ENOENT && err != -ENOMEM);
+		ClearPageLocked(new_page);
+		swap_free(entry);
+	} while (err != -ENOMEM);
 
 	if (new_page)
 		page_cache_release(new_page);
-- 
cgit v1.1


From 73b1262fa43a778b1e154deea632cdef5009d6a1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:50 -0800
Subject: tmpfs: move swap swizzling into shmem

move_to_swap_cache and move_from_swap_cache functions (which swizzle a page
between tmpfs page cache and swap cache, to avoid page copying) are only used
by shmem.c; and our subsequent fix for unionfs needs different treatments in
the two instances of move_from_swap_cache.  Move them from swap_state.c into
their callsites shmem_writepage, shmem_unuse_inode and shmem_getpage, making
add_to_swap_cache externally visible.

shmem.c likes to say set_page_dirty where swap_state.c liked to say
SetPageDirty: respect that diversity, which __set_page_dirty_no_writeback
makes moot (and implies we should lose that "shift page from clean_pages to
dirty_pages list" comment: it's on neither).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 15 ++++-----------
 mm/shmem.c           | 16 ++++++++++++----
 mm/swap_state.c      | 35 +----------------------------------
 3 files changed, 17 insertions(+), 49 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 16fd120..353153e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -220,11 +220,9 @@ extern struct address_space swapper_space;
 #define total_swapcache_pages  swapper_space.nrpages
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, gfp_t);
+extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
-extern int move_to_swap_cache(struct page *, swp_entry_t);
-extern int move_from_swap_cache(struct page *, unsigned long,
-		struct address_space *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
@@ -319,15 +317,10 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp)
 
 #define can_share_swap_page(p)			(page_mapcount(p) == 1)
 
-static inline int move_to_swap_cache(struct page *page, swp_entry_t entry)
+static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
+							gfp_t gfp_mask)
 {
-	return 1;
-}
-
-static inline int move_from_swap_cache(struct page *page, unsigned long index,
-					struct address_space *mapping)
-{
-	return 1;
+	return -1;
 }
 
 static inline void __delete_from_swap_cache(struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 7be9434..e577adf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -884,7 +884,9 @@ lost2:
 found:
 	idx += offset;
 	inode = &info->vfs_inode;
-	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
+	if (add_to_page_cache(page, inode->i_mapping, idx, GFP_ATOMIC) == 0) {
+		delete_from_swap_cache(page);
+		set_page_dirty(page);
 		info->flags |= SHMEM_PAGEIN;
 		shmem_swp_set(info, ptr + offset, 0);
 	}
@@ -972,7 +974,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(!entry);
 	BUG_ON(entry->val);
 
-	if (move_to_swap_cache(page, swap) == 0) {
+	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+		remove_from_page_cache(page);
 		shmem_swp_set(info, entry, swap.val);
 		shmem_swp_unmap(entry);
 		spin_unlock(&info->lock);
@@ -982,6 +985,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 			list_move_tail(&info->swaplist, &shmem_swaplist);
 			spin_unlock(&shmem_swaplist_lock);
 		}
+		swap_duplicate(swap);
+		page_cache_release(page);	/* pagecache ref */
+		set_page_dirty(page);
 		unlock_page(page);
 		return 0;
 	}
@@ -1217,13 +1223,15 @@ repeat:
 			SetPageUptodate(filepage);
 			set_page_dirty(filepage);
 			swap_free(swap);
-		} else if (!(error = move_from_swap_cache(
-				swappage, idx, mapping))) {
+		} else if (!(error = add_to_page_cache(
+				swappage, mapping, idx, GFP_ATOMIC))) {
 			info->flags |= SHMEM_PAGEIN;
 			shmem_swp_set(info, entry, 0);
 			shmem_swp_unmap(entry);
+			delete_from_swap_cache(swappage);
 			spin_unlock(&info->lock);
 			filepage = swappage;
+			set_page_dirty(filepage);
 			swap_free(swap);
 		} else {
 			shmem_swp_unmap(entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c75eda2..65b81c9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,8 +67,7 @@ void show_swap_cache_info(void)
  * add_to_swap_cache resembles add_to_page_cache on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-static int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			       gfp_t gfp_mask)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 {
 	int error;
 
@@ -183,38 +182,6 @@ void delete_from_swap_cache(struct page *page)
 	page_cache_release(page);
 }
 
-/*
- * Strange swizzling function only for use by shmem_writepage
- */
-int move_to_swap_cache(struct page *page, swp_entry_t entry)
-{
-	int err = add_to_swap_cache(page, entry, GFP_ATOMIC);
-	if (!err) {
-		remove_from_page_cache(page);
-		page_cache_release(page);	/* pagecache ref */
-		if (!swap_duplicate(entry))
-			BUG();
-		SetPageDirty(page);
-	}
-	return err;
-}
-
-/*
- * Strange swizzling function for shmem_getpage (and shmem_unuse)
- */
-int move_from_swap_cache(struct page *page, unsigned long index,
-		struct address_space *mapping)
-{
-	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
-	if (!err) {
-		delete_from_swap_cache(page);
-		/* shift page from clean_pages to dirty_pages list */
-		ClearPageDirty(page);
-		set_page_dirty(page);
-	}
-	return err;
-}
-
 /* 
  * If we are the only user, then try to free up the swap cache. 
  * 
-- 
cgit v1.1


From d9fe526a83b84edc9c5ff217a00c896bfc20b2ce Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:51 -0800
Subject: tmpfs: allow filepage alongside swappage

tmpfs has long allowed for a fresh filepage to be created in pagecache, just
before shmem_getpage gets the chance to match it up with the swappage which
already belongs to that offset.  But unionfs_writepage now does a
find_or_create_page, divorced from shmem_getpage, which leaves conflicting
filepage and swappage outstanding indefinitely, when unionfs is over tmpfs.

Therefore shmem_writepage (where a page is swizzled from file to swap) must
now be on the lookout for existing swap, ready to free it in favour of the
more uptodate filepage, instead of BUGging on that clash.  And when the
add_to_page_cache fails in shmem_unuse_inode, it must defer to an uptodate
filepage, otherwise swapoff would hang.  Whereas when add_to_page_cache fails
in shmem_getpage, it should retry in the same way it already does.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 69 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index e577adf..4ae47f5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -827,6 +827,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 	struct page *subdir;
 	swp_entry_t *ptr;
 	int offset;
+	int error;
 
 	idx = 0;
 	ptr = info->i_direct;
@@ -884,7 +885,20 @@ lost2:
 found:
 	idx += offset;
 	inode = &info->vfs_inode;
-	if (add_to_page_cache(page, inode->i_mapping, idx, GFP_ATOMIC) == 0) {
+	error = add_to_page_cache(page, inode->i_mapping, idx, GFP_ATOMIC);
+	if (error == -EEXIST) {
+		struct page *filepage = find_get_page(inode->i_mapping, idx);
+		if (filepage) {
+			/*
+			 * There might be a more uptodate page coming down
+			 * from a stacked writepage: forget our swappage if so.
+			 */
+			if (PageUptodate(filepage))
+				error = 0;
+			page_cache_release(filepage);
+		}
+	}
+	if (!error) {
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		info->flags |= SHMEM_PAGEIN;
@@ -937,44 +951,45 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode;
 
 	BUG_ON(!PageLocked(page));
-	/*
-	 * shmem_backing_dev_info's capabilities prevent regular writeback or
-	 * sync from ever calling shmem_writepage; but a stacking filesystem
-	 * may use the ->writepage of its underlying filesystem, in which case
-	 * we want to do nothing when that underlying filesystem is tmpfs
-	 * (writing out to swap is useful as a response to memory pressure, but
-	 * of no use to stabilize the data) - just redirty the page, unlock it
-	 * and claim success in this case.  AOP_WRITEPAGE_ACTIVATE, and the
-	 * page_mapped check below, must be avoided unless we're in reclaim.
-	 */
-	if (!wbc->for_reclaim) {
-		set_page_dirty(page);
-		unlock_page(page);
-		return 0;
-	}
-	BUG_ON(page_mapped(page));
-
 	mapping = page->mapping;
 	index = page->index;
 	inode = mapping->host;
 	info = SHMEM_I(inode);
 	if (info->flags & VM_LOCKED)
 		goto redirty;
-	swap = get_swap_page();
-	if (!swap.val)
+	if (!total_swap_pages)
 		goto redirty;
 
+	/*
+	 * shmem_backing_dev_info's capabilities prevent regular writeback or
+	 * sync from ever calling shmem_writepage; but a stacking filesystem
+	 * may use the ->writepage of its underlying filesystem, in which case
+	 * tmpfs should write out to swap only in response to memory pressure,
+	 * and not for pdflush or sync.  However, in those cases, we do still
+	 * want to check if there's a redundant swappage to be discarded.
+	 */
+	if (wbc->for_reclaim)
+		swap = get_swap_page();
+	else
+		swap.val = 0;
+
 	spin_lock(&info->lock);
-	shmem_recalc_inode(inode);
 	if (index >= info->next_index) {
 		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
 		goto unlock;
 	}
 	entry = shmem_swp_entry(info, index, NULL);
-	BUG_ON(!entry);
-	BUG_ON(entry->val);
+	if (entry->val) {
+		/*
+		 * The more uptodate page coming down from a stacked
+		 * writepage should replace our old swappage.
+		 */
+		free_swap_and_cache(*entry);
+		shmem_swp_set(info, entry, 0);
+	}
+	shmem_recalc_inode(inode);
 
-	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+	if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
 		remove_from_page_cache(page);
 		shmem_swp_set(info, entry, swap.val);
 		shmem_swp_unmap(entry);
@@ -986,6 +1001,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 			spin_unlock(&shmem_swaplist_lock);
 		}
 		swap_duplicate(swap);
+		BUG_ON(page_mapped(page));
 		page_cache_release(page);	/* pagecache ref */
 		set_page_dirty(page);
 		unlock_page(page);
@@ -998,7 +1014,10 @@ unlock:
 	swap_free(swap);
 redirty:
 	set_page_dirty(page);
-	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
+	if (wbc->for_reclaim)
+		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
+	unlock_page(page);
+	return 0;
 }
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.1


From a0ee5ec520ede1dc8e2194623bcebfd9fab408f2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:51 -0800
Subject: tmpfs: allocate on read when stacked

tmpfs is expected to limit the memory used (unless mounted with nr_blocks=0 or
size=0).  But if a stacked filesystem such as unionfs gets pages from a sparse
tmpfs file by reading holes, and then writes to them, it can easily exceed any
such limit at present.

So suppress the SGP_READ "don't allocate page" ZERO_PAGE optimization when
reading for the kernel (a KERNEL_DS check, ugh, sorry about that).  Indeed,
pessimistically mark such pages as dirty, so they cannot get reclaimed and
unaccounted by mistake.  The venerable shmem_recalc_inode code (originally to
account for the reclaim of clean pages) suffices to get the accounting right
when swappages are dropped in favour of more uptodate filepages.

This also fixes the NULL shmem_swp_entry BUG or oops in shmem_writepage,
caused by unionfs writing to a very sparse tmpfs file: to minimize memory
allocation in swapout, tmpfs requires the swap vector be allocated upfront,
which wasn't always happening in this stacked case.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 4ae47f5..c919ed5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,6 +80,7 @@
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
+	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
 	SGP_WRITE,	/* may exceed i_size, may allocate page */
 };
 
@@ -1333,6 +1334,8 @@ repeat:
 		clear_highpage(filepage);
 		flush_dcache_page(filepage);
 		SetPageUptodate(filepage);
+		if (sgp == SGP_DIRTY)
+			set_page_dirty(filepage);
 	}
 done:
 	*pagep = filepage;
@@ -1518,6 +1521,15 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index, offset;
+	enum sgp_type sgp = SGP_READ;
+
+	/*
+	 * Might this read be for a stacking filesystem?  Then when reading
+	 * holes of a sparse file, we actually need to allocate those pages,
+	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
+	 */
+	if (segment_eq(get_fs(), KERNEL_DS))
+		sgp = SGP_DIRTY;
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1536,7 +1548,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 				break;
 		}
 
-		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
+		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
 		if (desc->error) {
 			if (desc->error == -EINVAL)
 				desc->error = 0;
-- 
cgit v1.1


From cb5f7b9a47963d9238398cd0c2676473e3c6896d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:52 -0800
Subject: tmpfs: make shmem_unuse more preemptible

shmem_unuse is at present an unbroken search through every swap vector page of
every tmpfs file which might be swapped, all under shmem_swaplist_lock.  This
dates from long ago, when the caller held mmlist_lock over it all too: long
gone, but there's never been much pressure for preemptible swapoff.

Make it a little more preemptible, replacing shmem_swaplist_lock by
shmem_swaplist_mutex, inserting a cond_resched in the main loop, and a
cond_resched_lock (on info->lock) at one convenient point in the
shmem_unuse_inode loop, where it has no outstanding kmap_atomic.

If we're serious about preemptible swapoff, there's much further to go e.g.
I'm stupid to let the kmap_atomics of the decreasingly significant HIGHMEM
case dictate preemptiblility for other configs.  But as in the earlier patch
to make swapoff scan ptes preemptibly, my hidden agenda is really towards
making memcgroups work, hardly about preemptibility at all.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index c919ed5..2e03d60 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -193,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 };
 
 static LIST_HEAD(shmem_swaplist);
-static DEFINE_SPINLOCK(shmem_swaplist_lock);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
 
 static void shmem_free_blocks(struct inode *inode, long pages)
 {
@@ -796,9 +796,9 @@ static void shmem_delete_inode(struct inode *inode)
 		inode->i_size = 0;
 		shmem_truncate(inode);
 		if (!list_empty(&info->swaplist)) {
-			spin_lock(&shmem_swaplist_lock);
+			mutex_lock(&shmem_swaplist_mutex);
 			list_del_init(&info->swaplist);
-			spin_unlock(&shmem_swaplist_lock);
+			mutex_unlock(&shmem_swaplist_mutex);
 		}
 	}
 	BUG_ON(inode->i_blocks);
@@ -851,6 +851,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
 		if (unlikely(idx == stage)) {
 			shmem_dir_unmap(dir-1);
+			if (cond_resched_lock(&info->lock)) {
+				/* check it has not been truncated */
+				if (limit > info->next_index) {
+					limit = info->next_index;
+					if (idx >= limit)
+						goto lost2;
+				}
+			}
 			dir = shmem_dir_map(info->i_indirect) +
 			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 			while (!*dir) {
@@ -924,7 +932,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	struct shmem_inode_info *info;
 	int found = 0;
 
-	spin_lock(&shmem_swaplist_lock);
+	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(p, next, &shmem_swaplist) {
 		info = list_entry(p, struct shmem_inode_info, swaplist);
 		if (!info->swapped)
@@ -935,8 +943,9 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 			found = 1;
 			break;
 		}
+		cond_resched();
 	}
-	spin_unlock(&shmem_swaplist_lock);
+	mutex_unlock(&shmem_swaplist_mutex);
 	return found;
 }
 
@@ -996,10 +1005,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		shmem_swp_unmap(entry);
 		spin_unlock(&info->lock);
 		if (list_empty(&info->swaplist)) {
-			spin_lock(&shmem_swaplist_lock);
+			mutex_lock(&shmem_swaplist_mutex);
 			/* move instead of add in case we're racing */
 			list_move_tail(&info->swaplist, &shmem_swaplist);
-			spin_unlock(&shmem_swaplist_lock);
+			mutex_unlock(&shmem_swaplist_mutex);
 		}
 		swap_duplicate(swap);
 		BUG_ON(page_mapped(page));
-- 
cgit v1.1


From 2e0e26c76a35de8f8bec6b2b917518cfeb52888a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:53 -0800
Subject: tmpfs: open a window in shmem_unuse_inode

There are a couple of reasons (patches follow) why it would be good to open a
window for sleep in shmem_unuse_inode, between its search for a matching swap
entry, and its handling of the entry found.

shmem_unuse_inode must then use igrab to hold the inode against deletion in
that window, and its corresponding iput might result in deletion: so it had
better unlock_page before the iput, and might as well release the page too.

Nor is there any need to hold on to shmem_swaplist_mutex once we know we'll
leave the loop.  So this unwinding moves from try_to_unuse and shmem_unuse
into shmem_unuse_inode, in the case when it finds a match.

Let try_to_unuse break on error in the shmem_unuse case, as it does in the
unuse_mm case: though at this point in the series, no error to break on.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c    | 57 +++++++++++++++++++++++++++++++++++----------------------
 mm/swapfile.c | 23 ++++++++++-------------
 2 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 2e03d60..a0126c4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -838,10 +838,8 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 	if (size > SHMEM_NR_DIRECT)
 		size = SHMEM_NR_DIRECT;
 	offset = shmem_find_swp(entry, ptr, ptr+size);
-	if (offset >= 0) {
-		shmem_swp_balance_unmap();
+	if (offset >= 0)
 		goto found;
-	}
 	if (!info->i_indirect)
 		goto lost2;
 
@@ -879,11 +877,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
 			offset = shmem_find_swp(entry, ptr, ptr+size);
+			shmem_swp_unmap(ptr);
 			if (offset >= 0) {
 				shmem_dir_unmap(dir);
 				goto found;
 			}
-			shmem_swp_unmap(ptr);
 		}
 	}
 lost1:
@@ -893,10 +891,25 @@ lost2:
 	return 0;
 found:
 	idx += offset;
-	inode = &info->vfs_inode;
-	error = add_to_page_cache(page, inode->i_mapping, idx, GFP_ATOMIC);
+	inode = igrab(&info->vfs_inode);
+	spin_unlock(&info->lock);
+
+	/* move head to start search for next from here */
+	list_move_tail(&shmem_swaplist, &info->swaplist);
+	mutex_unlock(&shmem_swaplist_mutex);
+
+	error = 1;
+	if (!inode)
+		goto out;
+
+	spin_lock(&info->lock);
+	ptr = shmem_swp_entry(info, idx, NULL);
+	if (ptr && ptr->val == entry.val)
+		error = add_to_page_cache(page, inode->i_mapping,
+						idx, GFP_ATOMIC);
 	if (error == -EEXIST) {
 		struct page *filepage = find_get_page(inode->i_mapping, idx);
+		error = 1;
 		if (filepage) {
 			/*
 			 * There might be a more uptodate page coming down
@@ -911,16 +924,18 @@ found:
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		info->flags |= SHMEM_PAGEIN;
-		shmem_swp_set(info, ptr + offset, 0);
+		shmem_swp_set(info, ptr, 0);
+		swap_free(entry);
+		error = 1;	/* not an error, but entry was found */
 	}
-	shmem_swp_unmap(ptr);
+	if (ptr)
+		shmem_swp_unmap(ptr);
 	spin_unlock(&info->lock);
-	/*
-	 * Decrement swap count even when the entry is left behind:
-	 * try_to_unuse will skip over mms, then reincrement count.
-	 */
-	swap_free(entry);
-	return 1;
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	iput(inode);		/* allows for NULL */
+	return error;
 }
 
 /*
@@ -935,18 +950,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(p, next, &shmem_swaplist) {
 		info = list_entry(p, struct shmem_inode_info, swaplist);
-		if (!info->swapped)
+		if (info->swapped)
+			found = shmem_unuse_inode(info, entry, page);
+		else
 			list_del_init(&info->swaplist);
-		else if (shmem_unuse_inode(info, entry, page)) {
-			/* move head to start search for next from here */
-			list_move_tail(&shmem_swaplist, &info->swaplist);
-			found = 1;
-			break;
-		}
 		cond_resched();
+		if (found)
+			goto out;
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
-	return found;
+out:	return found;	/* 0 or 1 or -ENOMEM */
 }
 
 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14bc4f2..eade24d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -814,7 +814,7 @@ static int try_to_unuse(unsigned int type)
 			atomic_inc(&new_start_mm->mm_users);
 			atomic_inc(&prev_mm->mm_users);
 			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 && !retval &&
+			while (*swap_map > 1 && !retval && !shmem &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
 				if (!atomic_inc_not_zero(&mm->mm_users))
@@ -846,6 +846,13 @@ static int try_to_unuse(unsigned int type)
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+		if (shmem) {
+			/* page has already been unlocked and released */
+			if (shmem > 0)
+				continue;
+			retval = shmem;
+			break;
+		}
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -884,12 +891,6 @@ static int try_to_unuse(unsigned int type)
 		 * read from disk into another page.  Splitting into two
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
-		 *
-		 * Note shmem_unuse already deleted a swappage from
-		 * the swap cache, unless the move to filepage failed:
-		 * in which case it left swappage in cache, lowered its
-		 * swap count to pass quickly through the loops above,
-		 * and now we must reincrement count to try again later.
 		 */
 		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
 			struct writeback_control wbc = {
@@ -900,12 +901,8 @@ static int try_to_unuse(unsigned int type)
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
-		if (PageSwapCache(page)) {
-			if (shmem)
-				swap_duplicate(entry);
-			else
-				delete_from_swap_cache(page);
-		}
+		if (PageSwapCache(page))
+			delete_from_swap_cache(page);
 
 		/*
 		 * So we could skip searching mms once swap count went
-- 
cgit v1.1


From b409f9fcf04692c0f603d28c73d2e3dfed27bf54 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:54 -0800
Subject: tmpfs: radix_tree_preloading

Nick has observed that shmem.c still uses GFP_ATOMIC when adding to page cache
or swap cache, without any radix tree preload: so tending to deplete emergency
reserves of memory.

GFP_ATOMIC remains appropriate in shmem_writepage's add_to_swap_cache: it's
being called under memory pressure, so must not wait for more memory to become
available.  But shmem_unuse_inode now has a window in which it can and should
preload with GFP_KERNEL, and say GFP_NOWAIT instead of GFP_ATOMIC in its
add_to_page_cache.

shmem_getpage is not so straightforward: its filepage/swappage integrity
relies upon exchanging between caches under spinlock, and it would need a lot
of restructuring to place the preloads correctly.  Instead, follow its pattern
of retrying on races: use GFP_NOWAIT instead of GFP_ATOMIC in
add_to_page_cache, and begin each circuit of the repeat loop with a sleeping
radix_tree_preload, followed immediately by radix_tree_preload_end - that
won't guarantee success in the next add_to_page_cache, but doesn't need to.

And we can then remove that bothersome congestion_wait: when needed, it'll
automatically get done in the course of the radix_tree_preload.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Looks-good-to: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index a0126c4..530c503 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -901,12 +901,16 @@ found:
 	error = 1;
 	if (!inode)
 		goto out;
+	error = radix_tree_preload(GFP_KERNEL);
+	if (error)
+		goto out;
+	error = 1;
 
 	spin_lock(&info->lock);
 	ptr = shmem_swp_entry(info, idx, NULL);
 	if (ptr && ptr->val == entry.val)
 		error = add_to_page_cache(page, inode->i_mapping,
-						idx, GFP_ATOMIC);
+						idx, GFP_NOWAIT);
 	if (error == -EEXIST) {
 		struct page *filepage = find_get_page(inode->i_mapping, idx);
 		error = 1;
@@ -931,6 +935,7 @@ found:
 	if (ptr)
 		shmem_swp_unmap(ptr);
 	spin_unlock(&info->lock);
+	radix_tree_preload_end();
 out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1185,6 +1190,16 @@ repeat:
 		goto done;
 	error = 0;
 	gfp = mapping_gfp_mask(mapping);
+	if (!filepage) {
+		/*
+		 * Try to preload while we can wait, to not make a habit of
+		 * draining atomic reserves; but don't latch on to this cpu.
+		 */
+		error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+		if (error)
+			goto failed;
+		radix_tree_preload_end();
+	}
 
 	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
@@ -1266,7 +1281,7 @@ repeat:
 			set_page_dirty(filepage);
 			swap_free(swap);
 		} else if (!(error = add_to_page_cache(
-				swappage, mapping, idx, GFP_ATOMIC))) {
+				swappage, mapping, idx, GFP_NOWAIT))) {
 			info->flags |= SHMEM_PAGEIN;
 			shmem_swp_set(info, entry, 0);
 			shmem_swp_unmap(entry);
@@ -1280,10 +1295,6 @@ repeat:
 			spin_unlock(&info->lock);
 			unlock_page(swappage);
 			page_cache_release(swappage);
-			if (error == -ENOMEM) {
-				/* let kswapd refresh zone for GFP_ATOMICs */
-				congestion_wait(WRITE, HZ/50);
-			}
 			goto repeat;
 		}
 	} else if (sgp == SGP_READ && !filepage) {
@@ -1338,7 +1349,7 @@ repeat:
 				shmem_swp_unmap(entry);
 			}
 			if (error || swap.val || 0 != add_to_page_cache_lru(
-					filepage, mapping, idx, GFP_ATOMIC)) {
+					filepage, mapping, idx, GFP_NOWAIT)) {
 				spin_unlock(&info->lock);
 				page_cache_release(filepage);
 				shmem_unacct_blocks(info->flags, 1);
-- 
cgit v1.1


From 1b1b32f2c6f6bb32535d2da62075b51c980880eb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:28:55 -0800
Subject: tmpfs: fix shmem_swaplist races

Intensive swapoff testing shows shmem_unuse spinning on an entry in
shmem_swaplist pointing to itself: how does that come about?  Days pass...

First guess is this: shmem_delete_inode tests list_empty without taking the
global mutex (so the swapping case doesn't slow down the common case); but
there's an instant in shmem_unuse_inode's list_move_tail when the list entry
may appear empty (a rare case, because it's actually moving the head not the
the list member).  So there's a danger of leaving the inode on the swaplist
when it's freed, then reinitialized to point to itself when reused.  Fix that
by skipping the list_move_tail when it's a no-op, which happens to plug this.

But this same spinning then surfaces on another machine.  Ah, I'd never
suspected it, but shmem_writepage's swaplist manipulation is unsafe: though we
still hold page lock, which would hold off inode deletion if the page were in
pagecache, it doesn't hold off once it's in swapcache (free_swap_and_cache
doesn't wait on locked pages).  Hmm: we could put the the inode on swaplist
earlier, but then shmem_unuse_inode could never prune unswapped inodes.

Fix this with an igrab before dropping info->lock, as in shmem_unuse_inode;
though I am a little uneasy about the iput which has to follow - it works, and
I see nothing wrong with it, but it is surprising that shmem inode deletion
may now occur below shmem_writepage.  Revisit this fix later?

And while we're looking at these races: the way shmem_unuse tests swapped
without holding info->lock looks unsafe, if we've more than one swap area: a
racing shmem_writepage on another page of the same inode could be putting it
in swapcache, just as we're deciding to remove the inode from swaplist -
there's a danger of going on swap without being listed, so a later swapoff
would hang, being unable to locate the entry.  Move that test and removal down
into shmem_unuse_inode, once info->lock is held.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 530c503..ee90244 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -833,6 +833,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 	idx = 0;
 	ptr = info->i_direct;
 	spin_lock(&info->lock);
+	if (!info->swapped) {
+		list_del_init(&info->swaplist);
+		goto lost2;
+	}
 	limit = info->next_index;
 	size = limit;
 	if (size > SHMEM_NR_DIRECT)
@@ -894,8 +898,15 @@ found:
 	inode = igrab(&info->vfs_inode);
 	spin_unlock(&info->lock);
 
-	/* move head to start search for next from here */
-	list_move_tail(&shmem_swaplist, &info->swaplist);
+	/*
+	 * Move _head_ to start search for next from here.
+	 * But be careful: shmem_delete_inode checks list_empty without taking
+	 * mutex, and there's an instant in list_move_tail when info->swaplist
+	 * would appear empty, if it were the only one on shmem_swaplist.  We
+	 * could avoid doing it if inode NULL; or use this minor optimization.
+	 */
+	if (shmem_swaplist.next != &info->swaplist)
+		list_move_tail(&shmem_swaplist, &info->swaplist);
 	mutex_unlock(&shmem_swaplist_mutex);
 
 	error = 1;
@@ -955,10 +966,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(p, next, &shmem_swaplist) {
 		info = list_entry(p, struct shmem_inode_info, swaplist);
-		if (info->swapped)
-			found = shmem_unuse_inode(info, entry, page);
-		else
-			list_del_init(&info->swaplist);
+		found = shmem_unuse_inode(info, entry, page);
 		cond_resched();
 		if (found)
 			goto out;
@@ -1021,18 +1029,23 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		remove_from_page_cache(page);
 		shmem_swp_set(info, entry, swap.val);
 		shmem_swp_unmap(entry);
+		if (list_empty(&info->swaplist))
+			inode = igrab(inode);
+		else
+			inode = NULL;
 		spin_unlock(&info->lock);
-		if (list_empty(&info->swaplist)) {
-			mutex_lock(&shmem_swaplist_mutex);
-			/* move instead of add in case we're racing */
-			list_move_tail(&info->swaplist, &shmem_swaplist);
-			mutex_unlock(&shmem_swaplist_mutex);
-		}
 		swap_duplicate(swap);
 		BUG_ON(page_mapped(page));
 		page_cache_release(page);	/* pagecache ref */
 		set_page_dirty(page);
 		unlock_page(page);
+		if (inode) {
+			mutex_lock(&shmem_swaplist_mutex);
+			/* move instead of add in case we're racing */
+			list_move_tail(&info->swaplist, &shmem_swaplist);
+			mutex_unlock(&shmem_swaplist_mutex);
+			iput(inode);
+		}
 		return 0;
 	}
 
-- 
cgit v1.1


From 61d5048f149572434daee0cce5e1374a8a7cf3e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Feb 2008 22:28:56 -0800
Subject: clean up vmtruncate

vmtruncate is a twisted maze of gotos, this patch cleans it up to have a
proper if else for the two major cases of extending and truncating truncate
and thus makes it a lot more readable while keeping exactly the same
functinality.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 69 ++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index bc13775..b7cb2e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1909,50 +1909,49 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 int vmtruncate(struct inode * inode, loff_t offset)
 {
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long limit;
+	if (inode->i_size < offset) {
+		unsigned long limit;
+
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && offset > limit)
+			goto out_sig;
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_big;
+		i_size_write(inode, offset);
+	} else {
+		struct address_space *mapping = inode->i_mapping;
 
-	if (inode->i_size < offset)
-		goto do_expand;
-	/*
-	 * truncation of in-use swapfiles is disallowed - it would cause
-	 * subsequent swapout to scribble on the now-freed blocks.
-	 */
-	if (IS_SWAPFILE(inode))
-		goto out_busy;
-	i_size_write(inode, offset);
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+		i_size_write(inode, offset);
+
+		/*
+		 * unmap_mapping_range is called twice, first simply for
+		 * efficiency so that truncate_inode_pages does fewer
+		 * single-page unmaps.  However after this first call, and
+		 * before truncate_inode_pages finishes, it is possible for
+		 * private pages to be COWed, which remain after
+		 * truncate_inode_pages finishes, hence the second
+		 * unmap_mapping_range call must be made for correctness.
+		 */
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, offset);
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+	}
 
-	/*
-	 * unmap_mapping_range is called twice, first simply for efficiency
-	 * so that truncate_inode_pages does fewer single-page unmaps. However
-	 * after this first call, and before truncate_inode_pages finishes,
-	 * it is possible for private pages to be COWed, which remain after
-	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
-	 * call must be made for correctness.
-	 */
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(mapping, offset);
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	goto out_truncate;
-
-do_expand:
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && offset > limit)
-		goto out_sig;
-	if (offset > inode->i_sb->s_maxbytes)
-		goto out_big;
-	i_size_write(inode, offset);
-
-out_truncate:
 	if (inode->i_op && inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return 0;
+
 out_sig:
 	send_sig(SIGXFSZ, current, 0);
 out_big:
 	return -EFBIG;
-out_busy:
-	return -ETXTBSY;
 }
 EXPORT_SYMBOL(vmtruncate);
 
-- 
cgit v1.1


From ec4dd3eb35759f9fbeb5c1abb01403b2fde64cc9 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Mon, 4 Feb 2008 22:28:56 -0800
Subject: maps4: add proportional set size accounting in smaps

The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing
it.  So if a process has 1000 pages all to itself, and 1000 shared with one
other process, its PSS will be 1500.

               - lwn.net: "ELC: How much memory are applications really using?"

The PSS proposed by Matt Mackall is a very nice metic for measuring an
process's memory footprint.  So collect and export it via
/proc/<pid>/smaps.

Matt Mackall's pagemap/kpagemap and John Berthels's exmap can also do the
job.  They are comprehensive tools.  But for PSS, let's do it in the simple
way.

Cc: John Berthels <jjberthels@gmail.com>
Cc: Bernardo Innocenti <bernie@codewiz.org>
Cc: Padraig Brady <P@draigBrady.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8043a3e..8952ce7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -114,6 +114,25 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+/*
+ * Proportional Set Size(PSS): my share of RSS.
+ *
+ * PSS of a process is the count of pages it has in memory, where each
+ * page is divided by the number of processes sharing it.  So if a
+ * process has 1000 pages all to itself, and 1000 shared with one other
+ * process, its PSS will be 1500.
+ *
+ * To keep (accumulated) division errors low, we adopt a 64bit
+ * fixed-point pss counter to minimize division errors. So (pss >>
+ * PSS_SHIFT) would be the real byte count.
+ *
+ * A shift of 12 before division means (assuming 4K page size):
+ * 	- 1M 3-user-pages add up to 8KB errors;
+ * 	- supports mapcount up to 2^24, or 16M;
+ * 	- supports PSS up to 2^52 bytes, or 4PB.
+ */
+#define PSS_SHIFT 12
+
 struct mem_size_stats
 {
 	unsigned long resident;
@@ -122,6 +141,7 @@ struct mem_size_stats
 	unsigned long private_clean;
 	unsigned long private_dirty;
 	unsigned long referenced;
+	u64 pss;
 };
 
 struct pmd_walker {
@@ -195,6 +215,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 		seq_printf(m,
 			   "Size:           %8lu kB\n"
 			   "Rss:            %8lu kB\n"
+			   "Pss:            %8lu kB\n"
 			   "Shared_Clean:   %8lu kB\n"
 			   "Shared_Dirty:   %8lu kB\n"
 			   "Private_Clean:  %8lu kB\n"
@@ -202,6 +223,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 			   "Referenced:     %8lu kB\n",
 			   (vma->vm_end - vma->vm_start) >> 10,
 			   mss->resident >> 10,
+			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
 			   mss->shared_clean  >> 10,
 			   mss->shared_dirty  >> 10,
 			   mss->private_clean >> 10,
@@ -226,6 +248,7 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
+	int mapcount;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -242,16 +265,19 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		/* Accumulate the size in pages that have been accessed. */
 		if (pte_young(ptent) || PageReferenced(page))
 			mss->referenced += PAGE_SIZE;
-		if (page_mapcount(page) >= 2) {
+		mapcount = page_mapcount(page);
+		if (mapcount >= 2) {
 			if (pte_dirty(ptent))
 				mss->shared_dirty += PAGE_SIZE;
 			else
 				mss->shared_clean += PAGE_SIZE;
+			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
 		} else {
 			if (pte_dirty(ptent))
 				mss->private_dirty += PAGE_SIZE;
 			else
 				mss->private_clean += PAGE_SIZE;
+			mss->pss += (PAGE_SIZE << PSS_SHIFT);
 		}
 	}
 	pte_unmap_unlock(pte - 1, ptl);
-- 
cgit v1.1


From 824552574162ac00ae636fa41386b1072379ea4a Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Mon, 4 Feb 2008 22:28:59 -0800
Subject: maps4: rework TASK_SIZE macros

The following replaces the earlier patches sent.  It should address
David Rientjes's comments, and has been compile tested on all the
architectures that it touches, save for parisc.

For the /proc/<pid>/pagemap code[1], we need to able to query how
much virtual address space a particular task has.  The trick is
that we do it through /proc and can't use TASK_SIZE since it
references "current" on some arches.  The process opening the
/proc file might be a 32-bit process opening a 64-bit process's
pagemap file.

x86_64 already has a TASK_SIZE_OF() macro:

#define TASK_SIZE_OF(child)     ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)

I'd like to have that for other architectures.  So, add it
for all the architectures that actually use "current" in
their TASK_SIZE.  For the others, just add a quick #define
in sched.h to use plain old TASK_SIZE.

1. http://www.linuxworld.com/news/2007/042407-kernel.html

- MIPS portion from Ralf Baechle <ralf@linux-mips.org>

[akpm@linux-foundation.org: fix mips build]
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Matt Mackall <mpm@selenic.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-ia64/processor.h    | 3 ++-
 include/asm-mips/processor.h    | 2 ++
 include/asm-parisc/processor.h  | 3 ++-
 include/asm-powerpc/processor.h | 3 ++-
 include/asm-s390/processor.h    | 3 ++-
 include/linux/sched.h           | 4 ++++
 6 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index be3b0ae..666385b 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -31,7 +31,8 @@
  * each (assuming 8KB page size), for a total of 8TB of user virtual
  * address space.
  */
-#define TASK_SIZE		(current->thread.task_size)
+#define TASK_SIZE_OF(tsk)	((tsk)->thread.task_size)
+#define TASK_SIZE       	TASK_SIZE_OF(current)
 
 /*
  * This decides where the kernel will search for a free chunk of vm
diff --git a/include/asm-mips/processor.h b/include/asm-mips/processor.h
index 83bc945..36f42de 100644
--- a/include/asm-mips/processor.h
+++ b/include/asm-mips/processor.h
@@ -65,6 +65,8 @@ extern unsigned int vced_count, vcei_count;
 #define TASK_UNMAPPED_BASE						\
 	(test_thread_flag(TIF_32BIT_ADDR) ?				\
 		PAGE_ALIGN(TASK_SIZE32 / 3) : PAGE_ALIGN(TASK_SIZE / 3))
+#define TASK_SIZE_OF(tsk)						\
+	(test_tsk_thread_flag(tsk, TIF_32BIT_ADDR) ? TASK_SIZE32 : TASK_SIZE)
 #endif
 
 #define NUM_FPU_REGS	32
diff --git a/include/asm-parisc/processor.h b/include/asm-parisc/processor.h
index 6b294fb..3bb06e8 100644
--- a/include/asm-parisc/processor.h
+++ b/include/asm-parisc/processor.h
@@ -32,7 +32,8 @@
 #endif
 #define current_text_addr() ({ void *pc; current_ia(pc); pc; })
 
-#define TASK_SIZE               (current->thread.task_size)
+#define TASK_SIZE_OF(tsk)       ((tsk)->thread.task_size)
+#define TASK_SIZE	        TASK_SIZE_OF(current)
 #define TASK_UNMAPPED_BASE      (current->thread.map_base)
 
 #define DEFAULT_TASK_SIZE32	(0xFFF00000UL)
diff --git a/include/asm-powerpc/processor.h b/include/asm-powerpc/processor.h
index dba7c94..1f4765d 100644
--- a/include/asm-powerpc/processor.h
+++ b/include/asm-powerpc/processor.h
@@ -99,8 +99,9 @@ extern struct task_struct *last_task_used_spe;
  */
 #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
 
-#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \
+#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
 		TASK_SIZE_USER32 : TASK_SIZE_USER64)
+#define TASK_SIZE	  TASK_SIZE_OF(current)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
diff --git a/include/asm-s390/processor.h b/include/asm-s390/processor.h
index c86b982..4f74460 100644
--- a/include/asm-s390/processor.h
+++ b/include/asm-s390/processor.h
@@ -70,8 +70,9 @@ extern int get_cpu_capability(unsigned int *);
 
 #else /* __s390x__ */
 
-# define TASK_SIZE		(test_thread_flag(TIF_31BIT) ? \
+# define TASK_SIZE_OF(tsk)	(test_tsk_thread_flag(tsk, TIF_31BIT) ? \
 					(0x80000000UL) : (0x40000000000UL))
+# define TASK_SIZE		TASK_SIZE_OF(current)
 # define TASK_UNMAPPED_BASE	(TASK_SIZE / 2)
 # define DEFAULT_TASK_SIZE	(0x40000000000UL)
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 483ea4e..c30d174 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2087,6 +2087,10 @@ static inline void migration_init(void)
 }
 #endif
 
+#ifndef TASK_SIZE_OF
+#define TASK_SIZE_OF(tsk)	TASK_SIZE
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
-- 
cgit v1.1


From 698dd4ba6b12e34e1e432c944c01478c0b2cd773 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:00 -0800
Subject: maps4: move is_swap_pte

Move is_swap_pte helper function to swapops.h for use by pagemap code

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 6 ++++++
 mm/migrate.c            | 5 -----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index ceb6cc5..7bf2d14 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -42,6 +42,12 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 	return entry.val & SWP_OFFSET_MASK(entry);
 }
 
+/* check whether a pte points to a swap entry */
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
 /*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
  * arch-independent swp_entry_t.
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a207e8..4ee4cca 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
-static inline int is_swap_pte(pte_t pte)
-{
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
-}
-
 /*
  * Restore a potential migration pte to a working pte entry
  */
-- 
cgit v1.1


From e6473092bd9116583ce9ab8cf1b6570e1aa6fc83 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:01 -0800
Subject: maps4: introduce a generic page walker

Introduce a general page table walker

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  22 +++++++++
 mm/Makefile        |   2 +-
 mm/pagewalk.c      | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 mm/pagewalk.c

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcbe697..89d7c69 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -718,6 +718,28 @@ unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
+
+/**
+ * mm_walk - callbacks for walk_page_range
+ * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
+ * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
+ * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
+ * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
+ * @pte_hole: if set, called for each hole at all levels
+ *
+ * (see walk_page_range for more details)
+ */
+struct mm_walk {
+	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+	int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+	int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+	int (*pte_hole)(unsigned long, unsigned long, void *);
+};
+
+int walk_page_range(const struct mm_struct *, unsigned long addr,
+		    unsigned long end, const struct mm_walk *walk,
+		    void *private);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea..07f1213 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o
+			   vmalloc.o pagewalk.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 0000000..b4f27d2
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,131 @@
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  const struct mm_walk *walk, void *private)
+{
+	pte_t *pte;
+	int err = 0;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+		if (err)
+		       break;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	pte_unmap(pte);
+	return err;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			  const struct mm_walk *walk, void *private)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err = 0;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, private);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry)
+			err = walk->pmd_entry(pmd, addr, next, private);
+		if (!err && walk->pte_entry)
+			err = walk_pte_range(pmd, addr, next, walk, private);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+
+	return err;
+}
+
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  const struct mm_walk *walk, void *private)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err = 0;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, private);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pud_entry)
+			err = walk->pud_entry(pud, addr, next, private);
+		if (!err && (walk->pmd_entry || walk->pte_entry))
+			err = walk_pmd_range(pud, addr, next, walk, private);
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+
+	return err;
+}
+
+/**
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
+ * @walk - set of callbacks to invoke for each level of the tree
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA,
+ * calling supplied callbacks. Callbacks are called in-order (first
+ * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
+ * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ *
+ * Each callback receives an entry pointer, the start and end of the
+ * associated range, and a caller-supplied private data pointer.
+ *
+ * No locks are taken, but the bottom level iterator will map PTE
+ * directories from highmem if necessary.
+ *
+ * If any callback returns a non-zero value, the walk is aborted and
+ * the return value is propagated back to the caller. Otherwise 0 is returned.
+ */
+int walk_page_range(const struct mm_struct *mm,
+		    unsigned long addr, unsigned long end,
+		    const struct mm_walk *walk, void *private)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+
+	if (addr >= end)
+		return err;
+
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, private);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pgd_entry)
+			err = walk->pgd_entry(pgd, addr, next, private);
+		if (!err &&
+		    (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
+			err = walk_pud_range(pgd, addr, next, walk, private);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
-- 
cgit v1.1


From b3ae5acbbb98d95c1300c8ced56d15f97d09c506 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:01 -0800
Subject: maps4: use pagewalker in clear_refs and smaps

Use the generic pagewalker for smaps and clear_refs

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 95 ++++++++++--------------------------------------------
 1 file changed, 17 insertions(+), 78 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8952ce7..791b2d4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -135,6 +135,7 @@ static void pad_len_spaces(struct seq_file *m, int len)
 
 struct mem_size_stats
 {
+	struct vm_area_struct *vma;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -144,13 +145,6 @@ struct mem_size_stats
 	u64 pss;
 };
 
-struct pmd_walker {
-	struct vm_area_struct *vma;
-	void *private;
-	void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
-		       unsigned long, void *);
-};
-
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -240,11 +234,11 @@ static int show_map(struct seq_file *m, void *v)
 	return show_map_internal(m, v, NULL);
 }
 
-static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-			    unsigned long addr, unsigned long end,
-			    void *private)
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			   void *private)
 {
 	struct mem_size_stats *mss = private;
+	struct vm_area_struct *vma = mss->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -282,12 +276,13 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
+	return 0;
 }
 
-static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				 unsigned long addr, unsigned long end,
-				 void *private)
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, void *private)
 {
+	struct vm_area_struct *vma = private;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -308,71 +303,10 @@ static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
+	return 0;
 }
 
-static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
-				  unsigned long addr, unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	for (pmd = pmd_offset(pud, addr); addr != end;
-	     pmd++, addr = next) {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		walker->action(walker->vma, pmd, addr, next, walker->private);
-	}
-}
-
-static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
-				  unsigned long addr, unsigned long end)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	for (pud = pud_offset(pgd, addr); addr != end;
-	     pud++, addr = next) {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		walk_pmd_range(walker, pud, addr, next);
-	}
-}
-
-/*
- * walk_page_range - walk the page tables of a VMA with a callback
- * @vma - VMA to walk
- * @action - callback invoked for every bottom-level (PTE) page table
- * @private - private data passed to the callback function
- *
- * Recursively walk the page table for the memory area in a VMA, calling
- * a callback for every bottom-level (PTE) page table.
- */
-static inline void walk_page_range(struct vm_area_struct *vma,
-				   void (*action)(struct vm_area_struct *,
-						  pmd_t *, unsigned long,
-						  unsigned long, void *),
-				   void *private)
-{
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	struct pmd_walker walker = {
-		.vma		= vma,
-		.private	= private,
-		.action		= action,
-	};
-	pgd_t *pgd;
-	unsigned long next;
-
-	for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
-	     pgd++, addr = next) {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		walk_pud_range(&walker, pgd, addr, next);
-	}
-}
+static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
 
 static int show_smap(struct seq_file *m, void *v)
 {
@@ -380,11 +314,15 @@ static int show_smap(struct seq_file *m, void *v)
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
+	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		walk_page_range(vma, smaps_pte_range, &mss);
+		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+				&smaps_walk, &mss);
 	return show_map_internal(m, v, &mss);
 }
 
+static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
+
 void clear_refs_smap(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -392,7 +330,8 @@ void clear_refs_smap(struct mm_struct *mm)
 	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
 		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-			walk_page_range(vma, clear_refs_pte_range, NULL);
+			walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
+					&clear_refs_walk, vma);
 	flush_tlb_mm(mm);
 	up_read(&mm->mmap_sem);
 }
-- 
cgit v1.1


From 4752c369789250eafcd7813e11c8fb689235b0d2 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:02 -0800
Subject: maps4: simplify interdependence of maps and smaps

This pulls the shared map display code out of show_map and puts it in
show_smap where it belongs.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 791b2d4..abc44f3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -145,7 +145,7 @@ struct mem_size_stats
 	u64 pss;
 };
 
-static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
+static int show_map(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct task_struct *task = priv->task;
@@ -205,35 +205,11 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 	}
 	seq_putc(m, '\n');
 
-	if (mss)
-		seq_printf(m,
-			   "Size:           %8lu kB\n"
-			   "Rss:            %8lu kB\n"
-			   "Pss:            %8lu kB\n"
-			   "Shared_Clean:   %8lu kB\n"
-			   "Shared_Dirty:   %8lu kB\n"
-			   "Private_Clean:  %8lu kB\n"
-			   "Private_Dirty:  %8lu kB\n"
-			   "Referenced:     %8lu kB\n",
-			   (vma->vm_end - vma->vm_start) >> 10,
-			   mss->resident >> 10,
-			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
-			   mss->shared_clean  >> 10,
-			   mss->shared_dirty  >> 10,
-			   mss->private_clean >> 10,
-			   mss->private_dirty >> 10,
-			   mss->referenced >> 10);
-
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
 	return 0;
 }
 
-static int show_map(struct seq_file *m, void *v)
-{
-	return show_map_internal(m, v, NULL);
-}
-
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   void *private)
 {
@@ -312,13 +288,37 @@ static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
+	int ret;
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
 				&smaps_walk, &mss);
-	return show_map_internal(m, v, &mss);
+
+	ret = show_map(m, v);
+	if (ret)
+		return ret;
+
+	seq_printf(m,
+		   "Size:           %8lu kB\n"
+		   "Rss:            %8lu kB\n"
+		   "Pss:            %8lu kB\n"
+		   "Shared_Clean:   %8lu kB\n"
+		   "Shared_Dirty:   %8lu kB\n"
+		   "Private_Clean:  %8lu kB\n"
+		   "Private_Dirty:  %8lu kB\n"
+		   "Referenced:     %8lu kB\n",
+		   (vma->vm_end - vma->vm_start) >> 10,
+		   mss.resident >> 10,
+		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
+		   mss.shared_clean  >> 10,
+		   mss.shared_dirty  >> 10,
+		   mss.private_clean >> 10,
+		   mss.private_dirty >> 10,
+		   mss.referenced >> 10);
+
+	return ret;
 }
 
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
-- 
cgit v1.1


From f248dcb34d7b7ac255db70071a20be9d9c6ad491 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:03 -0800
Subject: maps4: move clear_refs code to task_mmu.c

This puts all the clear_refs code where it belongs and probably lets things
compile on MMU-less systems as well.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c          | 40 ----------------------------------------
 fs/proc/internal.h      |  6 +-----
 fs/proc/task_mmu.c      | 44 ++++++++++++++++++++++++++++++++++++--------
 include/linux/proc_fs.h |  3 ++-
 4 files changed, 39 insertions(+), 54 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3353748..1bd646d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -88,10 +88,6 @@
  *	in /proc for a task before it execs a suid executable.
  */
 
-
-/* Worst case buffer size needed for holding an integer. */
-#define PROC_NUMBUF 13
-
 struct pid_entry {
 	char *name;
 	int len;
@@ -935,42 +931,6 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.write		= oom_adjust_write,
 };
 
-#ifdef CONFIG_MMU
-static ssize_t clear_refs_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	struct task_struct *task;
-	char buffer[PROC_NUMBUF], *end;
-	struct mm_struct *mm;
-
-	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-	if (!simple_strtol(buffer, &end, 0))
-		return -EINVAL;
-	if (*end == '\n')
-		end++;
-	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task)
-		return -ESRCH;
-	mm = get_task_mm(task);
-	if (mm) {
-		clear_refs_smap(mm);
-		mmput(mm);
-	}
-	put_task_struct(task);
-	if (end - buffer == 0)
-		return -EIO;
-	return end - buffer;
-}
-
-static struct file_operations proc_clear_refs_operations = {
-	.write		= clear_refs_write,
-};
-#endif
-
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 05b3e90..ddfaeec3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -56,11 +56,7 @@ extern int proc_pid_statm(struct task_struct *, char *);
 extern const struct file_operations proc_maps_operations;
 extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
-
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
-
+extern const struct file_operations proc_clear_refs_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index abc44f3..fcdbd23 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -323,19 +323,47 @@ static int show_smap(struct seq_file *m, void *v)
 
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
 
-void clear_refs_smap(struct mm_struct *mm)
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
 {
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF], *end;
+	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-			walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
-					&clear_refs_walk, vma);
-	flush_tlb_mm(mm);
-	up_read(&mm->mmap_sem);
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	if (!simple_strtol(buffer, &end, 0))
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	mm = get_task_mm(task);
+	if (mm) {
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if (!is_vm_hugetlb_page(vma))
+				walk_page_range(mm, vma->vm_start, vma->vm_end,
+						&clear_refs_walk, vma);
+		flush_tlb_mm(mm);
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+	put_task_struct(task);
+	if (end - buffer == 0)
+		return -EIO;
+	return end - buffer;
 }
 
+const struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+};
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 8f92546..e435515 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -19,6 +19,8 @@ struct completion;
  */
 #define FIRST_PROCESS_ENTRY 256
 
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
 
 /*
  * We always define these enumerators
@@ -117,7 +119,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 char *task_mem(struct mm_struct *, char *);
-void clear_refs_smap(struct mm_struct *mm);
 
 struct proc_dir_entry *de_get(struct proc_dir_entry *de);
 void de_put(struct proc_dir_entry *de);
-- 
cgit v1.1


From a6198797cc3fd659b2d81cdf6bb6b9bba9cd93e9 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:03 -0800
Subject: maps4: regroup task_mmu by interface

Reorder source so that all the code and data for each interface is together.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 417 +++++++++++++++++++++++++++--------------------------
 1 file changed, 210 insertions(+), 207 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fcdbd23..308fc54 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -114,36 +114,122 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
-/*
- * Proportional Set Size(PSS): my share of RSS.
- *
- * PSS of a process is the count of pages it has in memory, where each
- * page is divided by the number of processes sharing it.  So if a
- * process has 1000 pages all to itself, and 1000 shared with one other
- * process, its PSS will be 1500.
- *
- * To keep (accumulated) division errors low, we adopt a 64bit
- * fixed-point pss counter to minimize division errors. So (pss >>
- * PSS_SHIFT) would be the real byte count.
- *
- * A shift of 12 before division means (assuming 4K page size):
- * 	- 1M 3-user-pages add up to 8KB errors;
- * 	- supports mapcount up to 2^24, or 16M;
- * 	- supports PSS up to 2^52 bytes, or 4PB.
- */
-#define PSS_SHIFT 12
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
+{
+	if (vma && vma != priv->tail_vma) {
+		struct mm_struct *mm = vma->vm_mm;
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
 
-struct mem_size_stats
+static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct vm_area_struct *vma;
-	unsigned long resident;
-	unsigned long shared_clean;
-	unsigned long shared_dirty;
-	unsigned long private_clean;
-	unsigned long private_dirty;
-	unsigned long referenced;
-	u64 pss;
-};
+	struct proc_maps_private *priv = m->private;
+	unsigned long last_addr = m->version;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma, *tail_vma = NULL;
+	loff_t l = *pos;
+
+	/* Clear the per syscall fields in priv */
+	priv->task = NULL;
+	priv->tail_vma = NULL;
+
+	/*
+	 * We remember last_addr rather than next_addr to hit with
+	 * mmap_cache most of the time. We have zero last_addr at
+	 * the beginning and also after lseek. We will have -1 last_addr
+	 * after the end of the vmas.
+	 */
+
+	if (last_addr == -1UL)
+		return NULL;
+
+	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+	if (!priv->task)
+		return NULL;
+
+	mm = mm_for_maps(priv->task);
+	if (!mm)
+		return NULL;
+
+	tail_vma = get_gate_vma(priv->task);
+	priv->tail_vma = tail_vma;
+
+	/* Start with last addr hint */
+	vma = find_vma(mm, last_addr);
+	if (last_addr && vma) {
+		vma = vma->vm_next;
+		goto out;
+	}
+
+	/*
+	 * Check the vma index is within the range and do
+	 * sequential scan until m_index.
+	 */
+	vma = NULL;
+	if ((unsigned long)l < mm->map_count) {
+		vma = mm->mmap;
+		while (l-- && vma)
+			vma = vma->vm_next;
+		goto out;
+	}
+
+	if (l != mm->map_count)
+		tail_vma = NULL; /* After gate vma */
+
+out:
+	if (vma)
+		return vma;
+
+	/* End of vmas has been reached */
+	m->version = (tail_vma != NULL)? 0: -1UL;
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return tail_vma;
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+	struct vm_area_struct *tail_vma = priv->tail_vma;
+
+	(*pos)++;
+	if (vma && (vma != tail_vma) && vma->vm_next)
+		return vma->vm_next;
+	vma_stop(priv, vma);
+	return (vma != tail_vma)? tail_vma: NULL;
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+
+	vma_stop(priv, vma);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
+static int do_maps_open(struct inode *inode, struct file *file,
+			struct seq_operations *ops)
+{
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->pid = proc_pid(inode);
+		ret = seq_open(file, ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
+	}
+	return ret;
+}
 
 static int show_map(struct seq_file *m, void *v)
 {
@@ -210,6 +296,56 @@ static int show_map(struct seq_file *m, void *v)
 	return 0;
 }
 
+static struct seq_operations proc_pid_maps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_map
+};
+
+static int maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+const struct file_operations proc_maps_operations = {
+	.open		= maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+/*
+ * Proportional Set Size(PSS): my share of RSS.
+ *
+ * PSS of a process is the count of pages it has in memory, where each
+ * page is divided by the number of processes sharing it.  So if a
+ * process has 1000 pages all to itself, and 1000 shared with one other
+ * process, its PSS will be 1500.
+ *
+ * To keep (accumulated) division errors low, we adopt a 64bit
+ * fixed-point pss counter to minimize division errors. So (pss >>
+ * PSS_SHIFT) would be the real byte count.
+ *
+ * A shift of 12 before division means (assuming 4K page size):
+ * 	- 1M 3-user-pages add up to 8KB errors;
+ * 	- supports mapcount up to 2^24, or 16M;
+ * 	- supports PSS up to 2^52 bytes, or 4PB.
+ */
+#define PSS_SHIFT 12
+
+struct mem_size_stats
+{
+	struct vm_area_struct *vma;
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+	unsigned long referenced;
+	u64 pss;
+};
+
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   void *private)
 {
@@ -255,33 +391,6 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return 0;
 }
 
-static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
-				unsigned long end, void *private)
-{
-	struct vm_area_struct *vma = private;
-	pte_t *pte, ptent;
-	spinlock_t *ptl;
-	struct page *page;
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
-		ptent = *pte;
-		if (!pte_present(ptent))
-			continue;
-
-		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
-			continue;
-
-		/* Clear accessed and referenced bits. */
-		ptep_test_and_clear_young(vma, addr, pte);
-		ClearPageReferenced(page);
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-	return 0;
-}
-
 static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
 
 static int show_smap(struct seq_file *m, void *v)
@@ -321,6 +430,52 @@ static int show_smap(struct seq_file *m, void *v)
 	return ret;
 }
 
+static struct seq_operations proc_pid_smaps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_smap
+};
+
+static int smaps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+const struct file_operations proc_smaps_operations = {
+	.open		= smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, void *private)
+{
+	struct vm_area_struct *vma = private;
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* Clear accessed and referenced bits. */
+		ptep_test_and_clear_young(vma, addr, pte);
+		ClearPageReferenced(page);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
+}
+
 static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
 
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -364,147 +519,6 @@ const struct file_operations proc_clear_refs_operations = {
 	.write		= clear_refs_write,
 };
 
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
-	struct proc_maps_private *priv = m->private;
-	unsigned long last_addr = m->version;
-	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma = NULL;
-	loff_t l = *pos;
-
-	/* Clear the per syscall fields in priv */
-	priv->task = NULL;
-	priv->tail_vma = NULL;
-
-	/*
-	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
-	 * the beginning and also after lseek. We will have -1 last_addr
-	 * after the end of the vmas.
-	 */
-
-	if (last_addr == -1UL)
-		return NULL;
-
-	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
-	if (!priv->task)
-		return NULL;
-
-	mm = mm_for_maps(priv->task);
-	if (!mm)
-		return NULL;
-
-	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
-
-	/* Start with last addr hint */
-	if (last_addr && (vma = find_vma(mm, last_addr))) {
-		vma = vma->vm_next;
-		goto out;
-	}
-
-	/*
-	 * Check the vma index is within the range and do
-	 * sequential scan until m_index.
-	 */
-	vma = NULL;
-	if ((unsigned long)l < mm->map_count) {
-		vma = mm->mmap;
-		while (l-- && vma)
-			vma = vma->vm_next;
-		goto out;
-	}
-
-	if (l != mm->map_count)
-		tail_vma = NULL; /* After gate vma */
-
-out:
-	if (vma)
-		return vma;
-
-	/* End of vmas has been reached */
-	m->version = (tail_vma != NULL)? 0: -1UL;
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return tail_vma;
-}
-
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
-	if (vma && vma != priv->tail_vma) {
-		struct mm_struct *mm = vma->vm_mm;
-		up_read(&mm->mmap_sem);
-		mmput(mm);
-	}
-}
-
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = priv->tail_vma;
-
-	(*pos)++;
-	if (vma && (vma != tail_vma) && vma->vm_next)
-		return vma->vm_next;
-	vma_stop(priv, vma);
-	return (vma != tail_vma)? tail_vma: NULL;
-}
-
-static void m_stop(struct seq_file *m, void *v)
-{
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-
-	vma_stop(priv, vma);
-	if (priv->task)
-		put_task_struct(priv->task);
-}
-
-static struct seq_operations proc_pid_maps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_map
-};
-
-static struct seq_operations proc_pid_smaps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_smap
-};
-
-static int do_maps_open(struct inode *inode, struct file *file,
-			struct seq_operations *ops)
-{
-	struct proc_maps_private *priv;
-	int ret = -ENOMEM;
-	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (priv) {
-		priv->pid = proc_pid(inode);
-		ret = seq_open(file, ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = priv;
-		} else {
-			kfree(priv);
-		}
-	}
-	return ret;
-}
-
-static int maps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_pid_maps_op);
-}
-
-const struct file_operations proc_maps_operations = {
-	.open		= maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
@@ -539,14 +553,3 @@ const struct file_operations proc_numa_maps_operations = {
 };
 #endif
 
-static int smaps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_pid_smaps_op);
-}
-
-const struct file_operations proc_smaps_operations = {
-	.open		= smaps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-- 
cgit v1.1


From 85863e475e59afb027b0113290e3796ee6020b7d Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:04 -0800
Subject: maps4: add /proc/pid/pagemap interface

This interface provides a mapping for each page in an address space to its
physical page frame number, allowing precise determination of what pages are
mapped and what pages are shared between processes.

New in this version:

- headers gone again (as recommended by Dave Hansen and Alan Cox)
- 64-bit entries (as per discussion with Andi Kleen)
- swap pte information exported (from Dave Hansen)
- page walker callback for holes (from Dave Hansen)
- direct put_user I/O (as suggested by Rusty Russell)

This patch folds in cleanups and swap PTE support from Dave Hansen
<haveblue@us.ibm.com>.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c     |   4 +-
 fs/proc/internal.h |   2 +
 fs/proc/task_mmu.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 204 insertions(+), 2 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1bd646d..9004db0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -783,7 +783,7 @@ out_no_task:
 }
 #endif
 
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
 	switch (orig) {
 	case 0:
@@ -2252,6 +2252,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",      S_IRUGO, smaps),
+	REG("pagemap",    S_IRUSR, pagemap),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
@@ -2580,6 +2581,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
+	REG("pagemap",    S_IRUSR, pagemap),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ddfaeec3..7d57e80 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,11 +52,13 @@ extern int proc_tid_stat(struct task_struct *,  char *);
 extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
 
 extern const struct file_operations proc_maps_operations;
 extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 308fc54..bbd9b14 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -5,7 +5,10 @@
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
 #include <linux/pagemap.h>
+#include <linux/ptrace.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = {
 	.write		= clear_refs_write,
 };
 
+struct pagemapread {
+	char __user *out, *end;
+};
+
+#define PM_ENTRY_BYTES sizeof(u64)
+#define PM_RESERVED_BITS    3
+#define PM_RESERVED_OFFSET  (64 - PM_RESERVED_BITS)
+#define PM_RESERVED_MASK    (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
+#define PM_SPECIAL(nr)      (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
+#define PM_NOT_PRESENT      PM_SPECIAL(1LL)
+#define PM_SWAP             PM_SPECIAL(2LL)
+#define PM_END_OF_BUFFER    1
+
+static int add_to_pagemap(unsigned long addr, u64 pfn,
+			  struct pagemapread *pm)
+{
+	/*
+	 * Make sure there's room in the buffer for an
+	 * entire entry.  Otherwise, only copy part of
+	 * the pfn.
+	 */
+	if (pm->out + PM_ENTRY_BYTES >= pm->end) {
+		if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
+			return -EFAULT;
+		pm->out = pm->end;
+		return PM_END_OF_BUFFER;
+	}
+
+	if (put_user(pfn, pm->out))
+		return -EFAULT;
+	pm->out += PM_ENTRY_BYTES;
+	return 0;
+}
+
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+				void *private)
+{
+	struct pagemapread *pm = private;
+	unsigned long addr;
+	int err = 0;
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+u64 swap_pte_to_pagemap_entry(pte_t pte)
+{
+	swp_entry_t e = pte_to_swp_entry(pte);
+	return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+}
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			     void *private)
+{
+	struct pagemapread *pm = private;
+	pte_t *pte;
+	int err = 0;
+
+	for (; addr != end; addr += PAGE_SIZE) {
+		u64 pfn = PM_NOT_PRESENT;
+		pte = pte_offset_map(pmd, addr);
+		if (is_swap_pte(*pte))
+			pfn = swap_pte_to_pagemap_entry(*pte);
+		else if (pte_present(*pte))
+			pfn = pte_pfn(*pte);
+		/* unmap so we're not in atomic when we copy to userspace */
+		pte_unmap(pte);
+		err = add_to_pagemap(addr, pfn, pm);
+		if (err)
+			return err;
+	}
+
+	cond_resched();
+
+	return err;
+}
+
+static struct mm_walk pagemap_walk = {
+	.pmd_entry = pagemap_pte_range,
+	.pte_hole = pagemap_pte_hole
+};
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one 64-bit
+ * entry representing the corresponding physical page frame number
+ * (PFN) if the page is present. If there is a swap entry for the
+ * physical page, then an encoding of the swap file number and the
+ * page's offset into the swap file are returned. If no page is
+ * present at all, PM_NOT_PRESENT is returned. This allows determining
+ * precisely which pages are mapped (or in swap) and comparing mapped
+ * pages between processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct page **pages, *page;
+	unsigned long uaddr, uend;
+	struct mm_struct *mm;
+	struct pagemapread pm;
+	int pagecount;
+	int ret = -ESRCH;
+
+	if (!task)
+		goto out;
+
+	ret = -EACCES;
+	if (!ptrace_may_attach(task))
+		goto out;
+
+	ret = -EINVAL;
+	/* file position must be aligned */
+	if (*ppos % PM_ENTRY_BYTES)
+		goto out;
+
+	ret = 0;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	ret = -ENOMEM;
+	uaddr = (unsigned long)buf & PAGE_MASK;
+	uend = (unsigned long)(buf + count);
+	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
+	pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out_task;
+
+	down_read(&current->mm->mmap_sem);
+	ret = get_user_pages(current, current->mm, uaddr, pagecount,
+			     1, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (ret < 0)
+		goto out_free;
+
+	pm.out = buf;
+	pm.end = buf + count;
+
+	if (!ptrace_may_attach(task)) {
+		ret = -EIO;
+	} else {
+		unsigned long src = *ppos;
+		unsigned long svpfn = src / PM_ENTRY_BYTES;
+		unsigned long start_vaddr = svpfn << PAGE_SHIFT;
+		unsigned long end_vaddr = TASK_SIZE_OF(task);
+
+		/* watch out for wraparound */
+		if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+			start_vaddr = end_vaddr;
+
+		/*
+		 * The odds are that this will stop walking way
+		 * before end_vaddr, because the length of the
+		 * user buffer is tracked in "pm", and the walk
+		 * will stop when we hit the end of the buffer.
+		 */
+		ret = walk_page_range(mm, start_vaddr, end_vaddr,
+					&pagemap_walk, &pm);
+		if (ret == PM_END_OF_BUFFER)
+			ret = 0;
+		/* don't need mmap_sem for these, but this looks cleaner */
+		*ppos += pm.out - buf;
+		if (!ret)
+			ret = pm.out - buf;
+	}
+
+	for (; pagecount; pagecount--) {
+		page = pages[pagecount-1];
+		if (!PageReserved(page))
+			SetPageDirty(page);
+		page_cache_release(page);
+	}
+	mmput(mm);
+out_free:
+	kfree(pages);
+out_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+const struct file_operations proc_pagemap_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= pagemap_read,
+};
+
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
@@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = {
 	.release	= seq_release_private,
 };
 #endif
-
-- 
cgit v1.1


From 161f47bf41c5ece90ac53cbb6a4cb9bf74ce0ef6 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:05 -0800
Subject: maps4: add /proc/kpagecount interface

This makes physical page map counts available to userspace. Together
with /proc/pid/pagemap and /proc/pid/clear_refs, this can be used to
monitor memory usage on a per-page basis.

[akpm@linux-foundation.org: remove unneeded access_ok()]
[bunk@stusta.de: make struct proc_kpagemap static]
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 3462bfd..19b69f9 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -46,6 +46,7 @@
 #include <linux/vmalloc.h>
 #include <linux/crash_dump.h>
 #include <linux/pid_namespace.h>
+#include <linux/bootmem.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
@@ -675,6 +676,54 @@ static const struct file_operations proc_sysrq_trigger_operations = {
 };
 #endif
 
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 pcount;
+
+	pfn = src / KPMSIZE;
+	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EIO;
+
+	while (count > 0) {
+		ppage = pfn_to_page(pfn++);
+		if (!ppage)
+			pcount = 0;
+		else
+			pcount = atomic_read(&ppage->_count);
+
+		if (put_user(pcount, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static struct file_operations proc_kpagecount_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecount_read,
+};
+
 struct proc_dir_entry *proc_root_kcore;
 
 void create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
@@ -755,6 +804,7 @@ void __init proc_misc_init(void)
 				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
 	}
 #endif
+	create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations);
 #ifdef CONFIG_PROC_VMCORE
 	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
 	if (proc_vmcore)
-- 
cgit v1.1


From 304daa8132a95e998b6716d4b7bd8bd76aa152b2 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:06 -0800
Subject: maps4: add /proc/kpageflags interface

This makes a subset of physical page flags available to userspace. Together
with /proc/pid/kpagemap, this allows tracking of a wide variety of VM behaviors.

Exported flags are decoupled from the kernel's internal flags. This
allows us to reorder flag bits, and synthesize any bits that get
redefined in terms of other bits.

[akpm@linux-foundation.org: remove unneeded access_ok()]
[akpm@linux-foundation.org: s/0/NULL/]
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 19b69f9..fd751ea 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -699,7 +699,10 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 		return -EIO;
 
 	while (count > 0) {
-		ppage = pfn_to_page(pfn++);
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
 		if (!ppage)
 			pcount = 0;
 		else
@@ -724,6 +727,84 @@ static struct file_operations proc_kpagecount_operations = {
 	.read = kpagecount_read,
 };
 
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
+
+/* These macros are used to decouple internal flags from exported ones */
+
+#define KPF_LOCKED     0
+#define KPF_ERROR      1
+#define KPF_REFERENCED 2
+#define KPF_UPTODATE   3
+#define KPF_DIRTY      4
+#define KPF_LRU        5
+#define KPF_ACTIVE     6
+#define KPF_SLAB       7
+#define KPF_WRITEBACK  8
+#define KPF_RECLAIM    9
+#define KPF_BUDDY     10
+
+#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+
+static ssize_t kpageflags_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 kflags, uflags;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EIO;
+
+	while (count > 0) {
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
+		if (!ppage)
+			kflags = 0;
+		else
+			kflags = ppage->flags;
+
+		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
+			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
+			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
+			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
+			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
+			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
+			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
+			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
+			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
+			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
+
+		if (put_user(uflags, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static struct file_operations proc_kpageflags_operations = {
+	.llseek = mem_lseek,
+	.read = kpageflags_read,
+};
+
 struct proc_dir_entry *proc_root_kcore;
 
 void create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
@@ -805,6 +886,7 @@ void __init proc_misc_init(void)
 	}
 #endif
 	create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations);
+	create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations);
 #ifdef CONFIG_PROC_VMCORE
 	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
 	if (proc_vmcore)
-- 
cgit v1.1


From 1e88328111aae3ea408f346763ba9f9bad71f876 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:07 -0800
Subject: maps4: make page monitoring /proc file optional

Make /proc/ page monitoring configurable

This puts the following files under an embedded config option:

/proc/pid/clear_refs
/proc/pid/smaps
/proc/pid/pagemap
/proc/kpagecount
/proc/kpageflags

[akpm@linux-foundation.org: Kconfig fix]
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c      |  4 ++--
 fs/proc/proc_misc.c |  4 ++++
 fs/proc/task_mmu.c  |  2 ++
 init/Kconfig        | 10 ++++++++++
 mm/Makefile         |  3 ++-
 5 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9004db0..cd9f84c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2249,7 +2249,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	LNK("exe",        exe),
 	REG("mounts",     S_IRUGO, mounts),
 	REG("mountstats", S_IRUSR, mountstats),
-#ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",      S_IRUGO, smaps),
 	REG("pagemap",    S_IRUSR, pagemap),
@@ -2578,7 +2578,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	LNK("root",      root),
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
-#ifdef CONFIG_MMU
+#ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
 	REG("pagemap",    S_IRUSR, pagemap),
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fd751ea..51288db 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -676,6 +676,7 @@ static const struct file_operations proc_sysrq_trigger_operations = {
 };
 #endif
 
+#ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
 /* /proc/kpagecount - an array exposing page counts
@@ -804,6 +805,7 @@ static struct file_operations proc_kpageflags_operations = {
 	.llseek = mem_lseek,
 	.read = kpageflags_read,
 };
+#endif /* CONFIG_PROC_PAGE_MONITOR */
 
 struct proc_dir_entry *proc_root_kcore;
 
@@ -885,8 +887,10 @@ void __init proc_misc_init(void)
 				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
 	}
 #endif
+#ifdef CONFIG_PROC_PAGE_MONITOR
 	create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations);
 	create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations);
+#endif
 #ifdef CONFIG_PROC_VMCORE
 	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
 	if (proc_vmcore)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bbd9b14..38338ed 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -337,6 +337,7 @@ const struct file_operations proc_maps_operations = {
  */
 #define PSS_SHIFT 12
 
+#ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats
 {
 	struct vm_area_struct *vma;
@@ -717,6 +718,7 @@ const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 };
+#endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
diff --git a/init/Kconfig b/init/Kconfig
index dc28836..aaa7e7d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -678,6 +678,16 @@ config MARKERS
 
 source "arch/Kconfig"
 
+config PROC_PAGE_MONITOR
+ 	default y
+	depends on PROC_FS && MMU
+	bool "Enable /proc page monitoring" if EMBEDDED
+ 	help
+	  Various /proc files exist to monitor process memory utilization:
+	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+	  /proc/kpagecount, and /proc/kpageflags. Disabling these
+          interfaces will reduce the size of the kernel by approximately 4kb.
+
 endmenu		# General setup
 
 config SLABINFO
diff --git a/mm/Makefile b/mm/Makefile
index 07f1213..44e2528 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o
+			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
@@ -13,6 +13,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o $(mmu-y)
 
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
-- 
cgit v1.1


From f61eaf9fc58f3b2d9e3ad424496620f3381ccd1e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:29:08 -0800
Subject: mm/page-writeback.c: make a function static

task_dirty_limit() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3d3848f..8137482 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -219,7 +219,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  *
  *   dirty -= (dirty/8) * p_{t}
  */
-void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
 {
 	long numerator, denominator;
 	long dirty = *pdirty;
-- 
cgit v1.1


From 625d9573d0f905146efd15169a35ea9c5a841198 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Mon, 4 Feb 2008 22:29:08 -0800
Subject: Remove unused code from mm/tiny-shmem.c

This code in mm/tiny-shmem.c is under #if 0 - remove it.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/tiny-shmem.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d436a9c..7020836 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	return 0;
 }
 
-#if 0
-int shmem_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	file_accessed(file);
-#ifndef CONFIG_MMU
-	return ramfs_nommu_mmap(file, vma);
-#else
-	return 0;
-#endif
-}
-#endif  /*  0  */
-
 #ifndef CONFIG_MMU
 unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long addr,
-- 
cgit v1.1


From e31d9eb5c17ae3b80f9e9403f8a5eaf6dba879c9 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:29:09 -0800
Subject: make __vmalloc_area_node() static

__vmalloc_area_node() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 83625b6..4efc41a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -463,8 +463,8 @@ void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
-				pgprot_t prot, int node)
+static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+				 pgprot_t prot, int node)
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
-- 
cgit v1.1


From e2848a0efedef4dad52d1334d37f8719cd6268fd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 4 Feb 2008 22:29:10 -0800
Subject: radix-tree: avoid atomic allocations for preloaded insertions

Most pagecache (and some other) radix tree insertions have the great
opportunity to preallocate a few nodes with relaxed gfp flags.  But the
preallocation is squandered when it comes time to allocate a node, we
default to first attempting a GFP_ATOMIC allocation -- that doesn't
normally fail, but it can eat into atomic memory reserves that we don't
need to be using.

Another upshot of this is that it removes the sometimes highly contended
zone->lock from underneath tree_lock.  Pagecache insertions are always
performed with a radix tree preload, and after this change, such a
situation will never fall back to kmem_cache_alloc within
radix_tree_node_alloc.

David Miller reports seeing this allocation fail on a highly threaded
sparc64 system:

[527319.459981] dd: page allocation failure. order:0, mode:0x20
[527319.460403] Call Trace:
[527319.460568]  [00000000004b71e0] __slab_alloc+0x1b0/0x6a8
[527319.460636]  [00000000004b7bbc] kmem_cache_alloc+0x4c/0xa8
[527319.460698]  [000000000055309c] radix_tree_node_alloc+0x20/0x90
[527319.460763]  [0000000000553238] radix_tree_insert+0x12c/0x260
[527319.460830]  [0000000000495cd0] add_to_page_cache+0x38/0xb0
[527319.460893]  [00000000004e4794] mpage_readpages+0x6c/0x134
[527319.460955]  [000000000049c7fc] __do_page_cache_readahead+0x170/0x280
[527319.461028]  [000000000049cc88] ondemand_readahead+0x208/0x214
[527319.461094]  [0000000000496018] do_generic_mapping_read+0xe8/0x428
[527319.461152]  [0000000000497948] generic_file_aio_read+0x108/0x170
[527319.461217]  [00000000004badac] do_sync_read+0x88/0xd0
[527319.461292]  [00000000004bb5cc] vfs_read+0x78/0x10c
[527319.461361]  [00000000004bb920] sys_read+0x34/0x60
[527319.461424]  [0000000000406294] linux_sparc_syscall32+0x3c/0x40

The calltrace is significant: __do_page_cache_readahead allocates a number
of pages with GFP_KERNEL, and hence it should have reclaimed sufficient
memory to satisfy GFP_ATOMIC allocations.  However after the list of pages
goes to mpage_readpages, there can be significant intervals (including disk
IO) before all the pages are inserted into the radix-tree.  So the reserves
can easily be depleted at that point.  The patch is confirmed to fix the
problem.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/radix-tree.c | 15 +++++++++++----
 mm/filemap.c     |  1 -
 mm/rmap.c        |  1 -
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 48c250f..65f0e75 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -95,14 +95,17 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 static struct radix_tree_node *
 radix_tree_node_alloc(struct radix_tree_root *root)
 {
-	struct radix_tree_node *ret;
+	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	ret = kmem_cache_alloc(radix_tree_node_cachep,
-				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
-	if (ret == NULL && !(gfp_mask & __GFP_WAIT)) {
+	if (!(gfp_mask & __GFP_WAIT)) {
 		struct radix_tree_preload *rtp;
 
+		/*
+		 * Provided the caller has preloaded here, we will always
+		 * succeed in getting a node here (and never reach
+		 * kmem_cache_alloc)
+		 */
 		rtp = &__get_cpu_var(radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes[rtp->nr - 1];
@@ -110,6 +113,10 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 			rtp->nr--;
 		}
 	}
+	if (ret == NULL)
+		ret = kmem_cache_alloc(radix_tree_node_cachep,
+				set_migrateflags(gfp_mask, __GFP_RECLAIMABLE));
+
 	BUG_ON(radix_tree_is_indirect_ptr(ret));
 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88..96920f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
- *          ->zone.lock
  *
  *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2..0334c8f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within inode_lock in __sync_single_inode)
- *                   zone->lock (within radix tree node alloc)
  */
 
 #include <linux/mm.h>
-- 
cgit v1.1


From 9f8f2172537de7af0b0fbd33502d18d52b1339bc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:29:11 -0800
Subject: Page allocator: clean up pcp draining functions

- Add comments explaing how drain_pages() works.

- Eliminate useless functions

- Rename drain_all_local_pages to drain_all_pages(). It does drain
  all pages not only those of the local processor.

- Eliminate useless interrupt off / on sequences. drain_pages()
  disables interrupts on its own. The execution thread is
  pinned to processor by the caller. So there is no need to
  disable interrupts.

- Put drain_all_pages() declaration in gfp.h and remove the
  declarations from suspend.h and from mm/memory_hotplug.c

- Make software suspend call drain_all_pages(). The draining
  of processor local pages is may not the right approach if
  software suspend wants to support SMP. If they call drain_all_pages
  then we can make drain_pages() static.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h     |  2 ++
 include/linux/suspend.h |  1 -
 kernel/power/snapshot.c |  4 +--
 mm/memory_hotplug.c     |  6 ++--
 mm/page_alloc.c         | 79 ++++++++++++++++++++++++++-----------------------
 5 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7e93a9a..0c6ce51 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -228,5 +228,7 @@ extern void FASTCALL(free_cold_page(struct page *page));
 
 void page_alloc_init(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_pages(void);
+void drain_local_pages(void *dummy);
 
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 646ce2d..1d7d4c5 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -130,7 +130,6 @@ struct pbe {
 };
 
 /* mm/page_alloc.c */
-extern void drain_local_pages(void);
 extern void mark_free_pages(struct zone *zone);
 
 /**
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f6a5df9..95250d7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1203,7 +1203,7 @@ asmlinkage int swsusp_save(void)
 
 	printk(KERN_INFO "PM: Creating hibernation image: \n");
 
-	drain_local_pages();
+	drain_local_pages(NULL);
 	nr_pages = count_data_pages();
 	nr_highmem = count_highmem_pages();
 	printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
@@ -1221,7 +1221,7 @@ asmlinkage int swsusp_save(void)
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
-	drain_local_pages();
+	drain_local_pages(NULL);
 	copy_data_pages(&copy_bm, &orig_bm);
 
 	/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9512a54..7469c50 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	return offlined;
 }
 
-extern void drain_all_local_pages(void);
-
 int offline_pages(unsigned long start_pfn,
 		  unsigned long end_pfn, unsigned long timeout)
 {
@@ -540,7 +538,7 @@ repeat:
 		lru_add_drain_all();
 		flush_scheduled_work();
 		cond_resched();
-		drain_all_local_pages();
+		drain_all_pages();
 	}
 
 	pfn = scan_lru_pages(start_pfn, end_pfn);
@@ -563,7 +561,7 @@ repeat:
 	flush_scheduled_work();
 	yield();
 	/* drain pcp pages , this is synchrouns. */
-	drain_all_local_pages();
+	drain_all_pages();
 	/* check again */
 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
 	if (offlined_pages < 0) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b2838c2..5c7de8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -890,7 +890,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 }
 #endif
 
-static void __drain_pages(unsigned int cpu)
+/*
+ * Drain pages of the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
 	struct zone *zone;
@@ -915,6 +922,22 @@ static void __drain_pages(unsigned int cpu)
 	}
 }
 
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void *arg)
+{
+	drain_pages(smp_processor_id());
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ */
+void drain_all_pages(void)
+{
+	on_each_cpu(drain_local_pages, NULL, 0, 1);
+}
+
 #ifdef CONFIG_HIBERNATION
 
 void mark_free_pages(struct zone *zone)
@@ -952,37 +975,6 @@ void mark_free_pages(struct zone *zone)
 #endif /* CONFIG_PM */
 
 /*
- * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- */
-void drain_local_pages(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);	
-	__drain_pages(smp_processor_id());
-	local_irq_restore(flags);	
-}
-
-void smp_drain_local_pages(void *arg)
-{
-	drain_local_pages();
-}
-
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
- */
-void drain_all_local_pages(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__drain_pages(smp_processor_id());
-	local_irq_restore(flags);
-
-	smp_call_function(smp_drain_local_pages, NULL, 0, 1);
-}
-
-/*
  * Free a 0-order page
  */
 static void fastcall free_hot_cold_page(struct page *page, int cold)
@@ -1569,7 +1561,7 @@ nofail_alloc:
 	cond_resched();
 
 	if (order != 0)
-		drain_all_local_pages();
+		drain_all_pages();
 
 	if (likely(did_some_progress)) {
 		page = get_page_from_freelist(gfp_mask, order,
@@ -3978,10 +3970,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	int cpu = (unsigned long)hcpu;
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		local_irq_disable();
-		__drain_pages(cpu);
+		drain_pages(cpu);
+
+		/*
+		 * Spill the event counters of the dead processor
+		 * into the current processors event counters.
+		 * This artificially elevates the count of the current
+		 * processor.
+		 */
 		vm_events_fold_cpu(cpu);
-		local_irq_enable();
+
+		/*
+		 * Zero the differential counters of the dead processor
+		 * so that the vm statistics are consistent.
+		 *
+		 * This is only okay since the processor is dead and cannot
+		 * race with what we are doing.
+		 */
 		refresh_cpu_vm_stats(cpu);
 	}
 	return NOTIFY_OK;
@@ -4480,7 +4485,7 @@ int set_migratetype_isolate(struct page *page)
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 	if (!ret)
-		drain_all_local_pages();
+		drain_all_pages();
 	return ret;
 }
 
-- 
cgit v1.1


From 5e5419734c8719cbc01af959ad9c0844002c0df5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 4 Feb 2008 22:29:14 -0800
Subject: add mm argument to pte/pmd/pud/pgd_free

(with Martin Schwidefsky <schwidefsky@de.ibm.com>)

The pgd/pud/pmd/pte page table allocation functions get a mm_struct pointer as
first argument.  The free functions do not get the mm_struct argument.  This
is 1) asymmetrical and 2) to do mm related page table allocations the mm
argument is needed on the free function as well.

[kamalesh@linux.vnet.ibm.com: i386 fix]
[akpm@linux-foundation.org: coding-syle fixes]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/smp.c               |  2 +-
 arch/arm/mm/ioremap.c               |  2 +-
 arch/arm/mm/pgd.c                   |  8 ++++----
 arch/frv/mm/pgalloc.c               |  2 +-
 arch/powerpc/mm/pgtable_32.c        |  6 +++---
 arch/ppc/mm/pgtable.c               |  6 +++---
 arch/um/kernel/mem.c                |  2 +-
 arch/um/kernel/skas/mmu.c           |  8 ++++----
 arch/x86/mm/pgtable_32.c            | 12 ++++++------
 include/asm-alpha/pgalloc.h         |  8 ++++----
 include/asm-alpha/tlb.h             |  4 ++--
 include/asm-arm/pgalloc.h           | 10 +++++-----
 include/asm-arm/tlb.h               |  4 ++--
 include/asm-avr32/pgalloc.h         |  6 +++---
 include/asm-cris/pgalloc.h          |  6 +++---
 include/asm-frv/pgalloc.h           |  8 ++++----
 include/asm-frv/pgtable.h           |  2 +-
 include/asm-generic/4level-fixup.h  |  2 +-
 include/asm-generic/pgtable-nopmd.h |  2 +-
 include/asm-generic/pgtable-nopud.h |  2 +-
 include/asm-ia64/pgalloc.h          | 16 ++++++++--------
 include/asm-m32r/pgalloc.h          | 10 +++++-----
 include/asm-m68k/motorola_pgalloc.h | 10 +++++-----
 include/asm-m68k/sun3_pgalloc.h     |  8 ++++----
 include/asm-mips/pgalloc.h          | 12 ++++++------
 include/asm-parisc/pgalloc.h        | 10 +++++-----
 include/asm-parisc/tlb.h            |  4 ++--
 include/asm-powerpc/pgalloc-32.h    | 10 +++++-----
 include/asm-powerpc/pgalloc-64.h    | 10 +++++-----
 include/asm-ppc/pgalloc.h           | 10 +++++-----
 include/asm-s390/pgalloc.h          | 14 +++++++-------
 include/asm-s390/tlb.h              |  8 ++++----
 include/asm-sh/pgalloc.h            |  8 ++++----
 include/asm-sparc/pgalloc.h         | 12 ++++++------
 include/asm-sparc64/pgalloc.h       |  8 ++++----
 include/asm-sparc64/tlb.h           |  4 ++--
 include/asm-um/pgalloc.h            |  8 ++++----
 include/asm-x86/pgalloc_32.h        |  8 ++++----
 include/asm-x86/pgalloc_64.h        | 10 +++++-----
 include/asm-xtensa/pgalloc.h        |  6 +++---
 include/asm-xtensa/tlb.h            |  2 +-
 kernel/fork.c                       |  2 +-
 mm/memory.c                         | 10 +++++-----
 43 files changed, 151 insertions(+), 151 deletions(-)

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index e9dfbab..eefae1d 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -150,7 +150,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	secondary_data.pgdir = 0;
 
 	*pmd_offset(pgd, PHYS_OFFSET) = __pmd(0);
-	pgd_free(pgd);
+	pgd_free(&init_mm, pgd);
 
 	if (ret) {
 		printk(KERN_CRIT "CPU%u: processor failed to boot\n", cpu);
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 7595277..303a7ff 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -162,7 +162,7 @@ static void unmap_area_sections(unsigned long virt, unsigned long size)
 			 * Free the page table, if there was one.
 			 */
 			if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE)
-				pte_free_kernel(pmd_page_vaddr(pmd));
+				pte_free_kernel(&init_mm, pmd_page_vaddr(pmd));
 		}
 
 		addr += PGDIR_SIZE;
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 50b9aed..500c961 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -65,14 +65,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	return new_pgd;
 
 no_pte:
-	pmd_free(new_pmd);
+	pmd_free(mm, new_pmd);
 no_pmd:
 	free_pages((unsigned long)new_pgd, 2);
 no_pgd:
 	return NULL;
 }
 
-void free_pgd_slow(pgd_t *pgd)
+void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
 {
 	pmd_t *pmd;
 	struct page *pte;
@@ -94,8 +94,8 @@ void free_pgd_slow(pgd_t *pgd)
 	pmd_clear(pmd);
 	dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE);
 	pte_lock_deinit(pte);
-	pte_free(pte);
-	pmd_free(pmd);
+	pte_free(mm, pte);
+	pmd_free(mm, pmd);
 free:
 	free_pages((unsigned long) pgd, 2);
 }
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 7787c3c..1a2e5c8 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -140,7 +140,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
  	quicklist_free(0, pgd_dtor, pgd);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6448872..f80f90c 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -86,7 +86,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_pages((unsigned long)pgd, PGDIR_ORDER);
 }
@@ -123,7 +123,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return ptepage;
 }
 
-void pte_free_kernel(pte_t *pte)
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 #ifdef CONFIG_SMP
 	hash_page_sync();
@@ -131,7 +131,7 @@ void pte_free_kernel(pte_t *pte)
 	free_page((unsigned long)pte);
 }
 
-void pte_free(struct page *ptepage)
+void pte_free(struct mm_struct *mm, struct page *ptepage)
 {
 #ifdef CONFIG_SMP
 	hash_page_sync();
diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c
index fadacfd..409fcaa 100644
--- a/arch/ppc/mm/pgtable.c
+++ b/arch/ppc/mm/pgtable.c
@@ -74,7 +74,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_pages((unsigned long)pgd, PGDIR_ORDER);
 }
@@ -111,7 +111,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return ptepage;
 }
 
-void pte_free_kernel(pte_t *pte)
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 #ifdef CONFIG_SMP
 	hash_page_sync();
@@ -119,7 +119,7 @@ void pte_free_kernel(pte_t *pte)
 	free_page((unsigned long)pte);
 }
 
-void pte_free(struct page *ptepage)
+void pte_free(struct mm_struct *mm, struct page *ptepage)
 {
 #ifdef CONFIG_SMP
 	hash_page_sync();
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 59822dee..e3e7241 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -348,7 +348,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_page((unsigned long) pgd);
 }
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index f859ec3..b56fe8b 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -58,9 +58,9 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	return 0;
 
  out_pmd:
-	pud_free(pud);
+	pud_free(mm, pud);
  out_pte:
-	pmd_free(pmd);
+	pmd_free(mm, pmd);
  out:
 	return -ENOMEM;
 }
@@ -144,10 +144,10 @@ void destroy_context(struct mm_struct *mm)
 	if (!proc_mm || !ptrace_faultinfo) {
 		free_page(mmu->id.stack);
 		pte_lock_deinit(virt_to_page(mmu->last_page_table));
-		pte_free_kernel((pte_t *) mmu->last_page_table);
+		pte_free_kernel(mm, (pte_t *) mmu->last_page_table);
 		dec_zone_page_state(virt_to_page(mmu->last_page_table), NR_PAGETABLE);
 #ifdef CONFIG_3_LEVEL_PGTABLES
-		pmd_free((pmd_t *) mmu->last_pmd);
+		pmd_free(mm, (pmd_t *) mmu->last_pmd);
 #endif
 	}
 
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index c7db504..6c19146 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -272,7 +272,7 @@ static void pgd_dtor(void *pgd)
  * preallocate which never got a corresponding vma will need to be
  * freed manually.
  */
-static void pgd_mop_up_pmds(pgd_t *pgdp)
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 	int i;
 
@@ -285,7 +285,7 @@ static void pgd_mop_up_pmds(pgd_t *pgdp)
 			pgdp[i] = native_make_pgd(0);
 
 			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
-			pmd_free(pmd);
+			pmd_free(mm, pmd);
 		}
 	}
 }
@@ -313,7 +313,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 		pmd_t *pmd = pmd_alloc_one(mm, addr);
 
 		if (!pmd) {
-			pgd_mop_up_pmds(pgd);
+			pgd_mop_up_pmds(mm, pgd);
 			return 0;
 		}
 
@@ -333,7 +333,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 	return 1;
 }
 
-static void pgd_mop_up_pmds(pgd_t *pgd)
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 }
 #endif	/* CONFIG_X86_PAE */
@@ -352,9 +352,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	pgd_mop_up_pmds(pgd);
+	pgd_mop_up_pmds(mm, pgd);
 	quicklist_free(0, pgd_dtor, pgd);
 }
 
diff --git a/include/asm-alpha/pgalloc.h b/include/asm-alpha/pgalloc.h
index 471864e..fdbedac 100644
--- a/include/asm-alpha/pgalloc.h
+++ b/include/asm-alpha/pgalloc.h
@@ -31,7 +31,7 @@ pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd)
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 static inline void
-pgd_free(pgd_t *pgd)
+pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
@@ -44,7 +44,7 @@ pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 }
 
 static inline void
-pmd_free(pmd_t *pmd)
+pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	free_page((unsigned long)pmd);
 }
@@ -52,7 +52,7 @@ pmd_free(pmd_t *pmd)
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 
 static inline void
-pte_free_kernel(pte_t *pte)
+pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
@@ -67,7 +67,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
 }
 
 static inline void
-pte_free(struct page *page)
+pte_free(struct mm_struct *mm, struct page *page)
 {
 	__free_page(page);
 }
diff --git a/include/asm-alpha/tlb.h b/include/asm-alpha/tlb.h
index aa91335..c136365 100644
--- a/include/asm-alpha/tlb.h
+++ b/include/asm-alpha/tlb.h
@@ -9,7 +9,7 @@
 
 #include <asm-generic/tlb.h>
 
-#define __pte_free_tlb(tlb,pte)			pte_free(pte)
-#define __pmd_free_tlb(tlb,pmd)			pmd_free(pmd)
+#define __pte_free_tlb(tlb, pte)			pte_free((tlb)->mm, pte)
+#define __pmd_free_tlb(tlb, pmd)			pmd_free((tlb)->mm, pmd)
  
 #endif
diff --git a/include/asm-arm/pgalloc.h b/include/asm-arm/pgalloc.h
index 4d43945..fb6c6e3 100644
--- a/include/asm-arm/pgalloc.h
+++ b/include/asm-arm/pgalloc.h
@@ -27,14 +27,14 @@
  * Since we have only two-level page tables, these are trivial
  */
 #define pmd_alloc_one(mm,addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free(pmd)			do { } while (0)
+#define pmd_free(mm, pmd)		do { } while (0)
 #define pgd_populate(mm,pmd,pte)	BUG()
 
 extern pgd_t *get_pgd_slow(struct mm_struct *mm);
-extern void free_pgd_slow(pgd_t *pgd);
+extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
 
 #define pgd_alloc(mm)			get_pgd_slow(mm)
-#define pgd_free(pgd)			free_pgd_slow(pgd)
+#define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
 
 /*
  * Allocate one PTE table.
@@ -83,7 +83,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
 /*
  * Free one PTE table.
  */
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	if (pte) {
 		pte -= PTRS_PER_PTE;
@@ -91,7 +91,7 @@ static inline void pte_free_kernel(pte_t *pte)
 	}
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index cb74002..36bd402 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -85,8 +85,8 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 }
 
 #define tlb_remove_page(tlb,page)	free_page_and_swap_cache(page)
-#define pte_free_tlb(tlb,ptep)		pte_free(ptep)
-#define pmd_free_tlb(tlb,pmdp)		pmd_free(pmdp)
+#define pte_free_tlb(tlb, ptep)		pte_free((tlb)->mm, ptep)
+#define pmd_free_tlb(tlb, pmdp)		pmd_free((tlb)->mm, pmdp)
 
 #define tlb_migrate_finish(mm)		do { } while (0)
 
diff --git a/include/asm-avr32/pgalloc.h b/include/asm-avr32/pgalloc.h
index 0e680f4..b77e364 100644
--- a/include/asm-avr32/pgalloc.h
+++ b/include/asm-avr32/pgalloc.h
@@ -30,7 +30,7 @@ static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return kcalloc(USER_PTRS_PER_PGD, sizeof(pgd_t), GFP_KERNEL);
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	kfree(pgd);
 }
@@ -55,12 +55,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return pte;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
diff --git a/include/asm-cris/pgalloc.h b/include/asm-cris/pgalloc.h
index deaddfe..8ddd66f 100644
--- a/include/asm-cris/pgalloc.h
+++ b/include/asm-cris/pgalloc.h
@@ -16,7 +16,7 @@ static inline pgd_t *pgd_alloc (struct mm_struct *mm)
 	return (pgd_t *)get_zeroed_page(GFP_KERNEL);
 }
 
-static inline void pgd_free (pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
@@ -34,12 +34,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add
 	return pte;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
diff --git a/include/asm-frv/pgalloc.h b/include/asm-frv/pgalloc.h
index ce982a6..e89620e 100644
--- a/include/asm-frv/pgalloc.h
+++ b/include/asm-frv/pgalloc.h
@@ -31,18 +31,18 @@ do {										\
  */
 
 extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(pgd_t *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
 
 extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
@@ -55,7 +55,7 @@ static inline void pte_free(struct page *pte)
  * (In the PAE case we free the pmds as part of the pgd.)
  */
 #define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *) 2); })
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define __pmd_free_tlb(tlb,x)		do { } while (0)
 
 #endif /* CONFIG_MMU */
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index 3c402af..6c0682e 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -226,7 +226,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
  * inside the pgd, so has no extra memory associated with it.
  */
 #define pud_alloc_one(mm, address)		NULL
-#define pud_free(x)				do { } while (0)
+#define pud_free(mm, x)				do { } while (0)
 #define __pud_free_tlb(tlb, x)			do { } while (0)
 
 /*
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index 7b88d39..9d40e87 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -28,7 +28,7 @@
 
 #undef pud_free_tlb
 #define pud_free_tlb(tlb, x)            do { } while (0)
-#define pud_free(x)			do { } while (0)
+#define pud_free(mm, x)			do { } while (0)
 #define __pud_free_tlb(tlb, x)		do { } while (0)
 
 #undef  pud_addr_end
diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h
index 29ff5d8..087325e 100644
--- a/include/asm-generic/pgtable-nopmd.h
+++ b/include/asm-generic/pgtable-nopmd.h
@@ -54,7 +54,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
  * inside the pud, so has no extra memory associated with it.
  */
 #define pmd_alloc_one(mm, address)		NULL
-#define pmd_free(x)				do { } while (0)
+#define pmd_free(mm, x)				do { } while (0)
 #define __pmd_free_tlb(tlb, x)			do { } while (0)
 
 #undef  pmd_addr_end
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 5664645..87cf449 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -51,7 +51,7 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address)
  * inside the pgd, so has no extra memory associated with it.
  */
 #define pud_alloc_one(mm, address)		NULL
-#define pud_free(x)				do { } while (0)
+#define pud_free(mm, x)				do { } while (0)
 #define __pud_free_tlb(tlb, x)			do { } while (0)
 
 #undef  pud_addr_end
diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
index 67552ca..556d988 100644
--- a/include/asm-ia64/pgalloc.h
+++ b/include/asm-ia64/pgalloc.h
@@ -27,7 +27,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pgd_free(pgd_t * pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	quicklist_free(0, NULL, pgd);
 }
@@ -44,11 +44,11 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pud_free(pud_t * pud)
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
 	quicklist_free(0, NULL, pud);
 }
-#define __pud_free_tlb(tlb, pud)	pud_free(pud)
+#define __pud_free_tlb(tlb, pud)	pud_free((tlb)->mm, pud)
 #endif /* CONFIG_PGTABLE_4 */
 
 static inline void
@@ -62,12 +62,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pmd_free(pmd_t * pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	quicklist_free(0, NULL, pmd);
 }
 
-#define __pmd_free_tlb(tlb, pmd)	pmd_free(pmd)
+#define __pmd_free_tlb(tlb, pmd)	pmd_free((tlb)->mm, pmd)
 
 static inline void
 pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
@@ -94,12 +94,12 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	quicklist_free_page(0, NULL, pte);
 }
 
-static inline void pte_free_kernel(pte_t * pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	quicklist_free(0, NULL, pte);
 }
@@ -109,6 +109,6 @@ static inline void check_pgt_cache(void)
 	quicklist_trim(0, NULL, 25, 16);
 }
 
-#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, pte)
 
 #endif				/* _ASM_IA64_PGALLOC_H */
diff --git a/include/asm-m32r/pgalloc.h b/include/asm-m32r/pgalloc.h
index 943ba63..e5921ad 100644
--- a/include/asm-m32r/pgalloc.h
+++ b/include/asm-m32r/pgalloc.h
@@ -24,7 +24,7 @@ static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-static __inline__ void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
@@ -46,17 +46,17 @@ static __inline__ struct page *pte_alloc_one(struct mm_struct *mm,
 	return pte;
 }
 
-static __inline__ void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static __inline__ void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
 
-#define __pte_free_tlb(tlb, pte)	pte_free((pte))
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, (pte))
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
@@ -65,7 +65,7 @@ static __inline__ void pte_free(struct page *pte)
  */
 
 #define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define __pmd_free_tlb(tlb, x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)	BUG()
 
diff --git a/include/asm-m68k/motorola_pgalloc.h b/include/asm-m68k/motorola_pgalloc.h
index 5158412..500ec9b 100644
--- a/include/asm-m68k/motorola_pgalloc.h
+++ b/include/asm-m68k/motorola_pgalloc.h
@@ -22,7 +22,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long ad
 	return pte;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	cache_page(pte);
 	free_page((unsigned long) pte);
@@ -47,7 +47,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add
 	return page;
 }
 
-static inline void pte_free(struct page *page)
+static inline void pte_free(struct mm_struct *mm, struct page *page)
 {
 	cache_page(kmap(page));
 	kunmap(page);
@@ -67,7 +67,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 	return get_pointer_table();
 }
 
-static inline int pmd_free(pmd_t *pmd)
+static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	return free_pointer_table(pmd);
 }
@@ -78,9 +78,9 @@ static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 }
 
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	pmd_free((pmd_t *)pgd);
+	pmd_free(mm, (pmd_t *)pgd);
 }
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/include/asm-m68k/sun3_pgalloc.h b/include/asm-m68k/sun3_pgalloc.h
index fd82411..a5a91e7 100644
--- a/include/asm-m68k/sun3_pgalloc.h
+++ b/include/asm-m68k/sun3_pgalloc.h
@@ -21,12 +21,12 @@ extern const char bad_pmd_string[];
 #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
 
 
-static inline void pte_free_kernel(pte_t * pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
         free_page((unsigned long) pte);
 }
 
-static inline void pte_free(struct page *page)
+static inline void pte_free(struct mm_struct *mm, struct page *page)
 {
         __free_page(page);
 }
@@ -72,10 +72,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
  */
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define __pmd_free_tlb(tlb, x)		do { } while (0)
 
-static inline void pgd_free(pgd_t * pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
         free_page((unsigned long) pgd);
 }
diff --git a/include/asm-mips/pgalloc.h b/include/asm-mips/pgalloc.h
index 81b7212..c4efece 100644
--- a/include/asm-mips/pgalloc.h
+++ b/include/asm-mips/pgalloc.h
@@ -58,7 +58,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_pages((unsigned long)pgd, PGD_ORDER);
 }
@@ -85,12 +85,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return pte;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_pages((unsigned long)pte, PTE_ORDER);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_pages(pte, PTE_ORDER);
 }
@@ -103,7 +103,7 @@ static inline void pte_free(struct page *pte)
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
  */
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define __pmd_free_tlb(tlb, x)		do { } while (0)
 
 #endif
@@ -120,12 +120,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pmd;
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	free_pages((unsigned long)pmd, PMD_ORDER);
 }
 
-#define __pmd_free_tlb(tlb, x)	pmd_free(x)
+#define __pmd_free_tlb(tlb, x)	pmd_free((tlb)->mm, x)
 
 #endif
 
diff --git a/include/asm-parisc/pgalloc.h b/include/asm-parisc/pgalloc.h
index 1af1a41..aab66f1 100644
--- a/include/asm-parisc/pgalloc.h
+++ b/include/asm-parisc/pgalloc.h
@@ -43,7 +43,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return actual_pgd;
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_64BIT
 	pgd -= PTRS_PER_PGD;
@@ -70,7 +70,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pmd;
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 #ifdef CONFIG_64BIT
 	if(pmd_flag(*pmd) & PxD_FLAG_ATTACHED)
@@ -91,7 +91,7 @@ static inline void pmd_free(pmd_t *pmd)
  */
 
 #define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define pgd_populate(mm, pmd, pte)	BUG()
 
 #endif
@@ -130,12 +130,12 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-#define pte_free(page)	pte_free_kernel(page_address(page))
+#define pte_free(mm, page) pte_free_kernel(page_address(page))
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/include/asm-parisc/tlb.h b/include/asm-parisc/tlb.h
index 33107a2..383b1db 100644
--- a/include/asm-parisc/tlb.h
+++ b/include/asm-parisc/tlb.h
@@ -21,7 +21,7 @@ do {	if (!(tlb)->fullmm)	\
 
 #include <asm-generic/tlb.h>
 
-#define __pmd_free_tlb(tlb, pmd)	pmd_free(pmd)
-#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+#define __pmd_free_tlb(tlb, pmd)	pmd_free((tlb)->mm, pmd)
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, pte)
 
 #endif
diff --git a/include/asm-powerpc/pgalloc-32.h b/include/asm-powerpc/pgalloc-32.h
index e130743..c162a4c 100644
--- a/include/asm-powerpc/pgalloc-32.h
+++ b/include/asm-powerpc/pgalloc-32.h
@@ -6,14 +6,14 @@
 extern void __bad_pte(pmd_t *pmd);
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 /*
  * We don't have any real pmd's, and this code never triggers because
  * the pgd will always be present..
  */
 /* #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); }) */
-#define pmd_free(x)                     do { } while (0)
+#define pmd_free(mm, x) 		do { } while (0)
 #define __pmd_free_tlb(tlb,x)		do { } while (0)
 /* #define pgd_populate(mm, pmd, pte)      BUG() */
 
@@ -31,10 +31,10 @@ extern void pgd_free(pgd_t *pgd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-extern void pte_free_kernel(pte_t *pte);
-extern void pte_free(struct page *pte);
+extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
+extern void pte_free(struct mm_struct *mm, struct page *pte);
 
-#define __pte_free_tlb(tlb, pte)	pte_free((pte))
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, (pte))
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/include/asm-powerpc/pgalloc-64.h b/include/asm-powerpc/pgalloc-64.h
index 43214c8..5afae85 100644
--- a/include/asm-powerpc/pgalloc-64.h
+++ b/include/asm-powerpc/pgalloc-64.h
@@ -29,7 +29,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	subpage_prot_free(pgd);
 	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
@@ -45,7 +45,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline void pud_free(pud_t *pud)
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
 	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
 }
@@ -81,7 +81,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
 }
@@ -99,12 +99,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return pte ? virt_to_page(pte) : NULL;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *ptepage)
+static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
 {
 	__free_page(ptepage);
 }
diff --git a/include/asm-ppc/pgalloc.h b/include/asm-ppc/pgalloc.h
index 44d88a9..7c39a95 100644
--- a/include/asm-ppc/pgalloc.h
+++ b/include/asm-ppc/pgalloc.h
@@ -7,14 +7,14 @@
 extern void __bad_pte(pmd_t *pmd);
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 /*
  * We don't have any real pmd's, and this code never triggers because
  * the pgd will always be present..
  */
 #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)                     do { } while (0)
+#define pmd_free(mm, x) 		do { } while (0)
 #define __pmd_free_tlb(tlb,x)		do { } while (0)
 #define pgd_populate(mm, pmd, pte)      BUG()
 
@@ -32,10 +32,10 @@ extern void pgd_free(pgd_t *pgd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-extern void pte_free_kernel(pte_t *pte);
-extern void pte_free(struct page *pte);
+extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
+extern void pte_free(struct mm_struct *mm, struct page *pte);
 
-#define __pte_free_tlb(tlb, pte)	pte_free((pte))
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, (pte))
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index 709dd17..6f6619b 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -57,10 +57,10 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
 }
 
 #define pud_alloc_one(mm,address)		({ BUG(); ((pud_t *)2); })
-#define pud_free(x)				do { } while (0)
+#define pud_free(mm, x)				do { } while (0)
 
 #define pmd_alloc_one(mm,address)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)				do { } while (0)
+#define pmd_free(mm, x)				do { } while (0)
 
 #define pgd_populate(mm, pgd, pud)		BUG()
 #define pgd_populate_kernel(mm, pgd, pud)	BUG()
@@ -76,7 +76,7 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
 }
 
 #define pud_alloc_one(mm,address)		({ BUG(); ((pud_t *)2); })
-#define pud_free(x)				do { } while (0)
+#define pud_free(mm, x)				do { } while (0)
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
 {
@@ -85,7 +85,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
 		crst_table_init(crst, _SEGMENT_ENTRY_EMPTY);
 	return (pmd_t *) crst;
 }
-#define pmd_free(pmd) crst_table_free((unsigned long *) pmd)
+#define pmd_free(mm, pmd) crst_table_free((unsigned long *)pmd)
 
 #define pgd_populate(mm, pgd, pud)		BUG()
 #define pgd_populate_kernel(mm, pgd, pud)	BUG()
@@ -115,7 +115,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 		crst_table_init(crst, pgd_entry_type(mm));
 	return (pgd_t *) crst;
 }
-#define pgd_free(pgd) crst_table_free((unsigned long *) pgd)
+#define pgd_free(mm, pgd) crst_table_free((unsigned long *) pgd)
 
 static inline void 
 pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
@@ -151,9 +151,9 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page)
 #define pte_alloc_one(mm, vmaddr) \
 	virt_to_page(page_table_alloc(s390_noexec))
 
-#define pte_free_kernel(pte) \
+#define pte_free_kernel(mm, pte) \
 	page_table_free((unsigned long *) pte)
-#define pte_free(pte) \
+#define pte_free(mm, pte) \
 	page_table_free((unsigned long *) page_to_phys((struct page *) pte))
 
 #endif /* _S390_PGALLOC_H */
diff --git a/include/asm-s390/tlb.h b/include/asm-s390/tlb.h
index 618693cf..985de2b 100644
--- a/include/asm-s390/tlb.h
+++ b/include/asm-s390/tlb.h
@@ -65,9 +65,9 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb,
 	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pmds < TLB_NR_PTRS))
 		__tlb_flush_mm(tlb->mm);
 	while (tlb->nr_ptes > 0)
-		pte_free(tlb->array[--tlb->nr_ptes]);
+		pte_free(tlb->mm, tlb->array[--tlb->nr_ptes]);
 	while (tlb->nr_pmds < TLB_NR_PTRS)
-		pmd_free((pmd_t *) tlb->array[tlb->nr_pmds++]);
+		pmd_free(tlb->mm, (pmd_t *) tlb->array[tlb->nr_pmds++]);
 }
 
 static inline void tlb_finish_mmu(struct mmu_gather *tlb,
@@ -102,7 +102,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page)
 		if (tlb->nr_ptes >= tlb->nr_pmds)
 			tlb_flush_mmu(tlb, 0, 0);
 	} else
-		pte_free(page);
+		pte_free(tlb->mm, page);
 }
 
 /*
@@ -117,7 +117,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 		if (tlb->nr_ptes >= tlb->nr_pmds)
 			tlb_flush_mmu(tlb, 0, 0);
 	} else
-		pmd_free(pmd);
+		pmd_free(tlb->mm, pmd);
 #endif
 }
 
diff --git a/include/asm-sh/pgalloc.h b/include/asm-sh/pgalloc.h
index 18b613c..59ca16d 100644
--- a/include/asm-sh/pgalloc.h
+++ b/include/asm-sh/pgalloc.h
@@ -36,7 +36,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return quicklist_alloc(QUICK_PGD, GFP_KERNEL | __GFP_REPEAT, pgd_ctor);
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	quicklist_free(QUICK_PGD, NULL, pgd);
 }
@@ -54,12 +54,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return pg ? virt_to_page(pg) : NULL;
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	quicklist_free(QUICK_PT, NULL, pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	quicklist_free_page(QUICK_PT, NULL, pte);
 }
@@ -71,7 +71,7 @@ static inline void pte_free(struct page *pte)
  * inside the pgd, so has no extra memory associated with it.
  */
 
-#define pmd_free(x)			do { } while (0)
+#define pmd_free(mm, x)			do { } while (0)
 #define __pmd_free_tlb(tlb,x)		do { } while (0)
 
 static inline void check_pgt_cache(void)
diff --git a/include/asm-sparc/pgalloc.h b/include/asm-sparc/pgalloc.h
index a449cd4..b5fbdd3 100644
--- a/include/asm-sparc/pgalloc.h
+++ b/include/asm-sparc/pgalloc.h
@@ -32,7 +32,7 @@ BTFIXUPDEF_CALL(pgd_t *, get_pgd_fast, void)
 BTFIXUPDEF_CALL(void, free_pgd_fast, pgd_t *)
 #define free_pgd_fast(pgd)	BTFIXUP_CALL(free_pgd_fast)(pgd)
 
-#define pgd_free(pgd)	free_pgd_fast(pgd)
+#define pgd_free(mm, pgd)	free_pgd_fast(pgd)
 #define pgd_alloc(mm)	get_pgd_fast()
 
 BTFIXUPDEF_CALL(void, pgd_set, pgd_t *, pmd_t *)
@@ -45,8 +45,8 @@ BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long)
 BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *)
 #define free_pmd_fast(pmd)	BTFIXUP_CALL(free_pmd_fast)(pmd)
 
-#define pmd_free(pmd)           free_pmd_fast(pmd)
-#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd)
+#define pmd_free(mm, pmd)	free_pmd_fast(pmd)
+#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd)
 
 BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *)
 #define pmd_populate(MM, PMD, PTE)        BTFIXUP_CALL(pmd_populate)(PMD, PTE)
@@ -59,10 +59,10 @@ BTFIXUPDEF_CALL(pte_t *, pte_alloc_one_kernel, struct mm_struct *, unsigned long
 #define pte_alloc_one_kernel(mm, addr)	BTFIXUP_CALL(pte_alloc_one_kernel)(mm, addr)
 
 BTFIXUPDEF_CALL(void, free_pte_fast, pte_t *)
-#define pte_free_kernel(pte)	BTFIXUP_CALL(free_pte_fast)(pte)
+#define pte_free_kernel(mm, pte)	BTFIXUP_CALL(free_pte_fast)(pte)
 
 BTFIXUPDEF_CALL(void, pte_free, struct page *)
-#define pte_free(pte)		BTFIXUP_CALL(pte_free)(pte)
-#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+#define pte_free(mm, pte)	BTFIXUP_CALL(pte_free)(pte)
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, pte)
 
 #endif /* _SPARC_PGALLOC_H */
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index 5d66b85..b48f73c 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -20,7 +20,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	quicklist_free(0, NULL, pgd);
 }
@@ -32,7 +32,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return quicklist_alloc(0, GFP_KERNEL, NULL);
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	quicklist_free(0, NULL, pmd);
 }
@@ -50,12 +50,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return pg ? virt_to_page(pg) : NULL;
 }
 		
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	quicklist_free(0, NULL, pte);
 }
 
-static inline void pte_free(struct page *ptepage)
+static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
 {
 	quicklist_free_page(0, NULL, ptepage);
 }
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 349d1d3..ec81cde 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -100,8 +100,8 @@ static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
 }
 
 #define tlb_remove_tlb_entry(mp,ptep,addr) do { } while (0)
-#define pte_free_tlb(mp,ptepage) pte_free(ptepage)
-#define pmd_free_tlb(mp,pmdp) pmd_free(pmdp)
+#define pte_free_tlb(mp, ptepage) pte_free((mp)->mm, ptepage)
+#define pmd_free_tlb(mp, pmdp) pmd_free((mp)->mm, pmdp)
 #define pud_free_tlb(tlb,pudp) __pud_free_tlb(tlb,pudp)
 
 #define tlb_migrate_finish(mm)	do { } while (0)
diff --git a/include/asm-um/pgalloc.h b/include/asm-um/pgalloc.h
index 1490487..4f3e62b 100644
--- a/include/asm-um/pgalloc.h
+++ b/include/asm-um/pgalloc.h
@@ -23,17 +23,17 @@
  * Allocate and free page tables.
  */
 extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
 extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long) pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
@@ -42,7 +42,7 @@ static inline void pte_free(struct page *pte)
 
 #ifdef CONFIG_3_LEVEL_PGTABLES
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	free_page((unsigned long)pmd);
 }
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
index 6c21ef9..bab1271 100644
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -36,17 +36,17 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p
  * Allocate and free page tables.
  */
 extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
 extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 }
@@ -63,7 +63,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
 	free_page((unsigned long)pmd);
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
index 8bb5646..315314c 100644
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -17,7 +17,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p
 	set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
 }
 
-static inline void pmd_free(pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
 	free_page((unsigned long)pmd);
@@ -33,7 +33,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline void pud_free (pud_t *pud)
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
 	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
 	free_page((unsigned long)pud);
@@ -77,7 +77,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
 	pgd_list_del(pgd);
@@ -100,13 +100,13 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long add
 /* Should really implement gc for free page table pages. This could be
    done with a reference count in struct page. */
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
 	free_page((unsigned long)pte); 
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
 	__free_page(pte);
 } 
diff --git a/include/asm-xtensa/pgalloc.h b/include/asm-xtensa/pgalloc.h
index 3e5b565..1d51ba5 100644
--- a/include/asm-xtensa/pgalloc.h
+++ b/include/asm-xtensa/pgalloc.h
@@ -31,7 +31,7 @@ pgd_alloc(struct mm_struct *mm)
 	return (pgd_t*) __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
 }
 
-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
@@ -52,12 +52,12 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm,
 	return virt_to_page(pte_alloc_one_kernel(mm, addr));
 }
 
-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	kmem_cache_free(pgtable_cache, pte);
 }
 
-static inline void pte_free(struct page *page)
+static inline void pte_free(struct mm_struct *mm, struct page *page)
 {
 	kmem_cache_free(pgtable_cache, page_address(page));
 }
diff --git a/include/asm-xtensa/tlb.h b/include/asm-xtensa/tlb.h
index 4830232..31c220f 100644
--- a/include/asm-xtensa/tlb.h
+++ b/include/asm-xtensa/tlb.h
@@ -42,6 +42,6 @@
 
 #include <asm-generic/tlb.h>
 
-#define __pte_free_tlb(tlb,pte)			pte_free(pte)
+#define __pte_free_tlb(tlb, pte)		pte_free((tlb)->mm, pte)
 
 #endif	/* _XTENSA_TLB_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 6caf4f2..1160f87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -325,7 +325,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
 
 static inline void mm_free_pgd(struct mm_struct * mm)
 {
-	pgd_free(mm->pgd);
+	pgd_free(mm, mm->pgd);
 }
 #else
 #define dup_mmap(mm, oldmm)	(0)
diff --git a/mm/memory.c b/mm/memory.c
index b7cb2e0..1c81fc2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,7 +305,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	spin_lock(&mm->page_table_lock);
 	if (pmd_present(*pmd)) {	/* Another has populated it */
 		pte_lock_deinit(new);
-		pte_free(new);
+		pte_free(mm, new);
 	} else {
 		mm->nr_ptes++;
 		inc_zone_page_state(new, NR_PAGETABLE);
@@ -323,7 +323,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 
 	spin_lock(&init_mm.page_table_lock);
 	if (pmd_present(*pmd))		/* Another has populated it */
-		pte_free_kernel(new);
+		pte_free_kernel(&init_mm, new);
 	else
 		pmd_populate_kernel(&init_mm, pmd, new);
 	spin_unlock(&init_mm.page_table_lock);
@@ -2501,7 +2501,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
-		pud_free(new);
+		pud_free(mm, new);
 	else
 		pgd_populate(mm, pgd, new);
 	spin_unlock(&mm->page_table_lock);
@@ -2523,12 +2523,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */
-		pmd_free(new);
+		pmd_free(mm, new);
 	else
 		pud_populate(mm, pud, new);
 #else
 	if (pgd_present(*pud))		/* Another has populated it */
-		pmd_free(new);
+		pmd_free(mm, new);
 	else
 		pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-- 
cgit v1.1


From 08e7d9b557299ba6ce57165ce8df310780bd681c Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 4 Feb 2008 22:29:16 -0800
Subject: arch_rebalance_pgtables call

In order to change the layout of the page tables after an mmap has crossed the
adress space limit of the current page table layout a architecture hook in
get_unmapped_area is needed.  The arguments are the address of the new mapping
and the length of it.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 8295577..bb4c963 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -36,6 +36,10 @@
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
 
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len)		(addr)
+#endif
+
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
@@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (addr & ~PAGE_MASK)
 		return -EINVAL;
 
-	return addr;
+	return arch_rebalance_pgtables(addr, len);
 }
 
 EXPORT_SYMBOL(get_unmapped_area);
-- 
cgit v1.1


From a7f75e25860ac0a7b70cf6e14c37618d2d2bb890 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:29:16 -0800
Subject: vmstat: small revisions to refresh_cpu_vm_stats()

1. Add comments explaining how the function can be called.

2. Collect global diffs in a local array and only spill
   them once into the global counters when the zone scan
   is finished. This means that we only touch each global
   counter once instead of each time we fold cpu counters
   into zone counters.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index e8d846f..9ffc573 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -284,6 +284,10 @@ EXPORT_SYMBOL(dec_zone_page_state);
 /*
  * Update the zone counters for one cpu.
  *
+ * The cpu specified must be either the current cpu or a processor that
+ * is not online. If it is the current cpu then the execution thread must
+ * be pinned to the current cpu.
+ *
  * Note that refresh_cpu_vm_stats strives to only access
  * node local memory. The per cpu pagesets on remote zones are placed
  * in the memory local to the processor using that pageset. So the
@@ -299,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu)
 {
 	struct zone *zone;
 	int i;
-	unsigned long flags;
+	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *p;
@@ -311,15 +315,19 @@ void refresh_cpu_vm_stats(int cpu)
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 			if (p->vm_stat_diff[i]) {
+				unsigned long flags;
+				int v;
+
 				local_irq_save(flags);
-				zone_page_state_add(p->vm_stat_diff[i],
-					zone, i);
+				v = p->vm_stat_diff[i];
 				p->vm_stat_diff[i] = 0;
+				local_irq_restore(flags);
+				atomic_long_add(v, &zone->vm_stat[i]);
+				global_diff[i] += v;
 #ifdef CONFIG_NUMA
 				/* 3 seconds idle till flush */
 				p->expire = 3;
 #endif
-				local_irq_restore(flags);
 			}
 #ifdef CONFIG_NUMA
 		/*
@@ -351,6 +359,10 @@ void refresh_cpu_vm_stats(int cpu)
 			drain_zone_pages(zone, p->pcp + 1);
 #endif
 	}
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (global_diff[i])
+			atomic_long_add(global_diff[i], &vm_stat[i]);
 }
 
 #endif
-- 
cgit v1.1


From 5dc331852848a38ca00a2817e5b98a1d0561b116 Mon Sep 17 00:00:00 2001
From: Robert Bragg <robert@sixbynine.org>
Date: Mon, 4 Feb 2008 22:29:18 -0800
Subject: mm: don't allow ioremapping of ranges larger than vmalloc space

When running with a 16M IOREMAP_MAX_ORDER (on armv7) we found that the
vmlist search routine in __get_vm_area_node can mistakenly allow a driver
to ioremap a range larger than vmalloc space.

If at the time of the ioremap all existing vmlist areas sit below the
determined alignment then the search routine continues past all entries and
exits the for loop - straight into the found: label - without ever testing
for integer wrapping or that the requested size fits.

We were seeing a driver successfully ioremap 128M of flash even though
there was only 120M of vmalloc space.  From that point the system was left
with the remainder of the first 16M of space to vmalloc/ioremap within.

Signed-off-by: Robert Bragg <robert@sixbynine.org>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4efc41a..0536dde 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -254,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
 		if (addr > end - size)
 			goto out;
 	}
+	if ((size + addr) < addr)
+		goto out;
+	if (addr > end - size)
+		goto out;
 
 found:
 	area->next = *p;
-- 
cgit v1.1


From 3dfa5721f12c3d5a441448086bee156887daa961 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:29:19 -0800
Subject: Page allocator: get rid of the list of cold pages

We have repeatedly discussed if the cold pages still have a point. There is
one way to join the two lists: Use a single list and put the cold pages at the
end and the hot pages at the beginning. That way a single list can serve for
both types of allocations.

The discussion of the RFC for this and Mel's measurements indicate that
there may not be too much of a point left to having separate lists for
hot and cold pages (see http://marc.info/?t=119492914200001&r=1&w=2).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Martin Bligh <mbligh@mbligh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 +-
 mm/page_alloc.c        | 57 ++++++++++++++++++++++++--------------------------
 mm/vmstat.c            | 30 +++++++++++---------------
 3 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4c4522a..8d8d197 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -113,7 +113,7 @@ struct per_cpu_pages {
 };
 
 struct per_cpu_pageset {
-	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+	struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
 	s8 expire;
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c7de8e..144c0967 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -901,24 +901,21 @@ static void drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
 	struct zone *zone;
-	int i;
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
+		struct per_cpu_pages *pcp;
 
 		if (!populated_zone(zone))
 			continue;
 
 		pset = zone_pcp(zone, cpu);
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			local_irq_save(flags);
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
-			local_irq_restore(flags);
-		}
+
+		pcp = &pset->pcp;
+		local_irq_save(flags);
+		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		pcp->count = 0;
+		local_irq_restore(flags);
 	}
 }
 
@@ -993,10 +990,13 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp;
 	local_irq_save(flags);
 	__count_vm_event(PGFREE);
-	list_add(&page->lru, &pcp->list);
+	if (cold)
+		list_add_tail(&page->lru, &pcp->list);
+	else
+		list_add(&page->lru, &pcp->list);
 	set_page_private(page, get_pageblock_migratetype(page));
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
@@ -1054,7 +1054,7 @@ again:
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone_pcp(zone, cpu)->pcp[cold];
+		pcp = &zone_pcp(zone, cpu)->pcp;
 		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
@@ -1064,9 +1064,15 @@ again:
 		}
 
 		/* Find a page of the appropriate migrate type */
-		list_for_each_entry(page, &pcp->list, lru)
-			if (page_private(page) == migratetype)
-				break;
+		if (cold) {
+			list_for_each_entry_reverse(page, &pcp->list, lru)
+				if (page_private(page) == migratetype)
+					break;
+		} else {
+			list_for_each_entry(page, &pcp->list, lru)
+				if (page_private(page) == migratetype)
+					break;
+		}
 
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
@@ -1793,12 +1799,9 @@ void show_free_areas(void)
 
 			pageset = zone_pcp(zone, cpu);
 
-			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
-			       "Cold: hi:%5d, btch:%4d usd:%4d\n",
-			       cpu, pageset->pcp[0].high,
-			       pageset->pcp[0].batch, pageset->pcp[0].count,
-			       pageset->pcp[1].high, pageset->pcp[1].batch,
-			       pageset->pcp[1].count);
+			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
+			       cpu, pageset->pcp.high,
+			       pageset->pcp.batch, pageset->pcp.count);
 		}
 	}
 
@@ -2596,17 +2599,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	memset(p, 0, sizeof(*p));
 
-	pcp = &p->pcp[0];		/* hot */
+	pcp = &p->pcp;
 	pcp->count = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
-
-	pcp = &p->pcp[1];		/* cold*/
-	pcp->count = 0;
-	pcp->high = 2 * batch;
-	pcp->batch = max(1UL, batch/2);
-	INIT_LIST_HEAD(&pcp->list);
 }
 
 /*
@@ -2619,7 +2616,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 {
 	struct per_cpu_pages *pcp;
 
-	pcp = &p->pcp[0]; /* hot list */
+	pcp = &p->pcp;
 	pcp->high = high;
 	pcp->batch = max(1UL, high/4);
 	if ((high/4) > (PAGE_SHIFT * 8))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9ffc573..888668e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -337,7 +337,7 @@ void refresh_cpu_vm_stats(int cpu)
 		 * Check if there are pages remaining in this pageset
 		 * if not then there is nothing to expire.
 		 */
-		if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+		if (!p->expire || !p->pcp.count)
 			continue;
 
 		/*
@@ -352,11 +352,8 @@ void refresh_cpu_vm_stats(int cpu)
 		if (p->expire)
 			continue;
 
-		if (p->pcp[0].count)
-			drain_zone_pages(zone, p->pcp + 0);
-
-		if (p->pcp[1].count)
-			drain_zone_pages(zone, p->pcp + 1);
+		if (p->pcp.count)
+			drain_zone_pages(zone, &p->pcp);
 #endif
 	}
 
@@ -693,20 +690,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  pagesets");
 	for_each_online_cpu(i) {
 		struct per_cpu_pageset *pageset;
-		int j;
 
 		pageset = zone_pcp(zone, i);
-		for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
-			seq_printf(m,
-				   "\n    cpu: %i pcp: %i"
-				   "\n              count: %i"
-				   "\n              high:  %i"
-				   "\n              batch: %i",
-				   i, j,
-				   pageset->pcp[j].count,
-				   pageset->pcp[j].high,
-				   pageset->pcp[j].batch);
-			}
+		seq_printf(m,
+			   "\n    cpu: %i"
+			   "\n              count: %i"
+			   "\n              high:  %i"
+			   "\n              batch: %i",
+			   i,
+			   pageset->pcp.count,
+			   pageset->pcp.high,
+			   pageset->pcp.batch);
 #ifdef CONFIG_SMP
 		seq_printf(m, "\n  vm stats threshold: %d",
 				pageset->stat_threshold);
-- 
cgit v1.1


From 195cf453d2c3d789cbe80e3735755f860c2fb222 Mon Sep 17 00:00:00 2001
From: Bron Gondwana <brong@fastmail.fm>
Date: Mon, 4 Feb 2008 22:29:20 -0800
Subject: mm/page-writeback: highmem_is_dirtyable option

Add vm.highmem_is_dirtyable toggle

A 32 bit machine with HIGHMEM64 enabled running DCC has an MMAPed file of
approximately 2Gb size which contains a hash format that is written
randomly by the dbclean process.  On 2.6.16 this process took a few
minutes.  With lowmem only accounting of dirty ratios, this takes about 12
hours of 100% disk IO, all random writes.

Include a toggle in /proc/sys/vm/highmem_is_dirtyable which can be set to 1 to
add the highmem back to the total available memory count.

[akpm@linux-foundation.org: Fix the CONFIG_DETECT_SOFTLOCKUP=y build]
Signed-off-by: Bron Gondwana <brong@fastmail.fm>
Cc: Ethan Solomita <solo@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: WU Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 15 +++++++++++++++
 Documentation/sysctl/vm.txt        |  7 ++++---
 include/linux/writeback.h          |  1 +
 kernel/sysctl.c                    | 18 +++++++++++++++++-
 mm/page-writeback.c                | 11 ++++++++++-
 5 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0b1b0c0..07231af 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1315,6 +1315,21 @@ for writeout by the pdflush daemons.  It is expressed in 100'ths of a second.
 Data which has been dirty in-memory for longer than this interval will be
 written out next time a pdflush daemon wakes up.
 
+highmem_is_dirtyable
+--------------------
+
+Only present if CONFIG_HIGHMEM is set.
+
+This defaults to 0 (false), meaning that the ratios set above are calculated
+as a percentage of lowmem only.  This protects against excessive scanning
+in page reclaim, swapping and general VM distress.
+
+Setting this to 1 can be useful on 32 bit machines where you want to make
+random changes within an MMAPed file that is larger than your available
+lowmem without causing large quantities of random IO.  Is is safe if the
+behavior of all programs running on the machine is known and memory will
+not be otherwise stressed.
+
 legacy_va_layout
 ----------------
 
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6f31f0a..24eac1b 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_background_ratio
 - dirty_expire_centisecs
 - dirty_writeback_centisecs
+- highmem_is_dirtyable   (only if CONFIG_HIGHMEM set)
 - max_map_count
 - min_free_kbytes
 - laptop_mode
@@ -40,9 +41,9 @@ Currently, these files are in /proc/sys/vm:
 ==============================================================
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
-dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout, drop-caches,
-hugepages_treat_as_movable:
+dirty_writeback_centisecs, highmem_is_dirtyable,
+vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout,
+drop-caches, hugepages_treat_as_movable:
 
 See Documentation/filesystems/proc.txt
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c6148bb..b2cd826 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -100,6 +100,7 @@ extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
 extern int dirty_writeback_interval;
 extern int dirty_expire_interval;
+extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7cb1ac3..d0b47b8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -84,8 +84,11 @@ extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 
 /* Constants used for minimum and  maximum */
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
 static int one = 1;
+#endif
+
+#ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
 #endif
 
@@ -1150,6 +1153,19 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+#ifdef CONFIG_HIGHMEM
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "highmem_is_dirtyable",
+		.data		= &vm_highmem_is_dirtyable,
+		.maxlen		= sizeof(vm_highmem_is_dirtyable),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8137482..c689b60 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
 int dirty_background_ratio = 5;
 
 /*
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
+ */
+int vm_highmem_is_dirtyable;
+
+/*
  * The generator of dirty data starts writeback at this percentage
  */
 int vm_dirty_ratio = 10;
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void)
 	x = global_page_state(NR_FREE_PAGES)
 		+ global_page_state(NR_INACTIVE)
 		+ global_page_state(NR_ACTIVE);
-	x -= highmem_dirtyable_memory(x);
+
+	if (!vm_highmem_is_dirtyable)
+		x -= highmem_dirtyable_memory(x);
+
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-- 
cgit v1.1


From 7766755a2f249e7e0dabc5255a0a3d151ff79821 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@suse.de>
Date: Mon, 4 Feb 2008 22:29:21 -0800
Subject: Fix /proc dcache deadlock in do_exit

This patch fixes a sles9 system hang in start_this_handle from a customer
with some heavy workload where all tasks are waiting on kjournald to commit
the transaction, but kjournald waits on t_updates to go down to zero (it
never does).

This was reported as a lowmem shortage deadlock but when checking the debug
data I noticed the VM wasn't under pressure at all (well it was really
under vm pressure, because lots of tasks hanged in the VM prune_dcache
methods trying to flush dirty inodes, but no task was hanging in GFP_NOFS
mode, the holder of the journal handle should have if this was a vm issue
in the first place).

No task was apparently holding the leftover handle in the committing
transaction, so I deduced t_updates was stuck to 1 because a journal_stop
was never run by some path (this turned out to be correct).  With a debug
patch adding proper reverse links and stack trace logging in ext3 deployed
in production, I found journal_stop is never run because
mark_inode_dirty_sync is called inside release_task called by do_exit.
(that was quite fun because I would have never thought about this
subtleness, I thought a regular path in ext3 had a bug and it forgot to
call journal_stop)

do_exit->release_task->mark_inode_dirty_sync->schedule() (will never
come back to run journal_stop)

The reason is that shrink_dcache_parent is racy by design (feature not
a bug) and it can do blocking I/O in some case, but the point is that
calling shrink_dcache_parent at the last stage of do_exit isn't safe
for self-reaping tasks.

I guess the memory pressure of the unbalanced highmem system allowed
to trigger this more easily.

Now mainline doesn't have this line in iput (like sles9 has):

    	     if (inode->i_state & I_DIRTY_DELAYED)
	     			mark_inode_dirty_sync(inode);

so it will probably not crash with ext3, but for example ext2 implements an
I/O-blocking ext2_put_inode that will lead to similar screwups with
ext2_free_blocks never coming back and it's definitely wrong to call
blocking-IO paths inside do_exit.  So this should fix a subtle bug in
mainline too (not verified in practice though).  The equivalent fix for
ext3 is also not verified yet to fix the problem in sles9 but I don't have
doubt it will (it usually takes days to crash, so it'll take weeks to be
sure).

An alternate fix would be to offload that work to a kernel thread, but I
don't think a reschedule for this is worth it, the vm should be able to
collect those entries for the synchronous release_task.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Cc: Jan Kara <jack@ucw.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd9f84c..c59852b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2321,7 +2321,8 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 	name.len = snprintf(buf, sizeof(buf), "%d", pid);
 	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
 	if (dentry) {
-		shrink_dcache_parent(dentry);
+		if (!(current->flags & PF_EXITING))
+			shrink_dcache_parent(dentry);
 		d_drop(dentry);
 		dput(dentry);
 	}
-- 
cgit v1.1


From 9eccf2a816ed0aad82b577de6a40cd098ad41944 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 4 Feb 2008 22:29:22 -0800
Subject: vmstat: remove prefetch

Remove the prefetch logic in order to avoid touching impossible per cpu
areas.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 888668e..422d960 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states);
 
 static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
 {
-	int cpu = 0;
+	int cpu;
 	int i;
 
 	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
 
-	cpu = first_cpu(*cpumask);
-	while (cpu < NR_CPUS) {
+	for_each_cpu_mask(cpu, *cpumask) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 
-		cpu = next_cpu(cpu, *cpumask);
-
-		if (cpu < NR_CPUS)
-			prefetch(&per_cpu(vm_event_states, cpu));
-
-
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 			ret[i] += this->event[i];
 	}
-- 
cgit v1.1


From 5a9bbdcd29adbb786c53eba1dfc3c2d256020d6b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 4 Feb 2008 22:29:23 -0800
Subject: mm: don't waste swap on locked pages

try_to_unmap always fails on a page found in a VM_LOCKED vma (unless
migrating), and recycles it back to the active list.  But if it's an
anonymous page, we've already allocated swap to it: just wasting swap.
Spot locked pages in page_referenced_one and treat them as referenced.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Tested-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ethan Solomita <solo@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 0334c8f..57ad276 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -283,7 +283,10 @@ static int page_referenced_one(struct page *page,
 	if (!pte)
 		goto out;
 
-	if (ptep_clear_flush_young(vma, address, pte))
+	if (vma->vm_flags & VM_LOCKED) {
+		referenced++;
+		*mapcount = 1;	/* break early from loop */
+	} else if (ptep_clear_flush_young(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
-- 
cgit v1.1


From 2d544564f9954860235db97df2e549a66c61f557 Mon Sep 17 00:00:00 2001
From: Qi Yong <qiyong@fc-cn.com>
Date: Mon, 4 Feb 2008 22:29:23 -0800
Subject: skip writing data pages when inode is under I_SYNC

Since I_SYNC was split out from I_LOCK, the concern in commit
4b89eed93e0fa40a63e3d7b1796ec1337ea7a3aa ("Write back inode data pages
even when the inode itself is locked") is not longer valid.

We should revert to the original behavior: in __writeback_single_inode(),
when we find an I_SYNC-ed inode and we're not doing a data-integrity sync,
skip writing entirely.  Otherwise, we are double calling do_writepages()

Signed-off-by: Qi Yong <qiyong@fc-cn.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Joern Engel <joern@wohnheim.fh-wedel.de>
Cc: WU Fengguang <wfg@mail.ustc.edu.cn>
Cc: Michael Rubin <mrubin@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 300324b..3fe782d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -334,9 +334,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		WARN_ON(inode->i_state & I_WILL_FREE);
 
 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
-		struct address_space *mapping = inode->i_mapping;
-		int ret;
-
 		/*
 		 * We're skipping this inode because it's locked, and we're not
 		 * doing writeback-for-data-integrity.  Move it to s_more_io so
@@ -345,15 +342,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of s_io.
 		 */
 		requeue_io(inode);
-
-		/*
-		 * Even if we don't actually write the inode itself here,
-		 * we can at least start some of the data writeout..
-		 */
-		spin_unlock(&inode_lock);
-		ret = do_writepages(mapping, wbc);
-		spin_lock(&inode_lock);
-		return ret;
+		return 0;
 	}
 
 	/*
-- 
cgit v1.1


From 1e548deb5d1630ca14ba04da04e3b6b3766178c7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 4 Feb 2008 22:29:26 -0800
Subject: page allocator: remove unused arguments in zone_init_free_lists()

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 144c0967..55fe57c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2546,8 +2546,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	}
 }
 
-static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
-				struct zone *zone, unsigned long size)
+static void __meminit zone_init_free_lists(struct zone *zone)
 {
 	int order, t;
 	for_each_migratetype_order(order, t) {
@@ -2820,7 +2819,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
-	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	zone_init_free_lists(zone);
 
 	return 0;
 }
-- 
cgit v1.1


From 920c7a5d0c94b8ce740f1d76fa06422f2a95a757 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 4 Feb 2008 22:29:26 -0800
Subject: mm: remove fastcall from mm/

fastcall is always defined to be empty, remove it

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c        | 10 +++++-----
 mm/highmem.c        |  4 ++--
 mm/internal.h       |  2 +-
 mm/memory.c         |  3 ++-
 mm/page-writeback.c |  2 +-
 mm/page_alloc.c     | 16 ++++++++--------
 mm/swap.c           | 10 +++++-----
 7 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 96920f8..81fb9bf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -527,7 +527,7 @@ static inline void wake_up_page(struct page *page, int bit)
 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
 }
 
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
@@ -551,7 +551,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  * parallel wait_on_page_locked()).
  */
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
@@ -585,7 +585,7 @@ EXPORT_SYMBOL(end_page_writeback);
  * chances are that on the second loop, the block layer's plug list is empty,
  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
@@ -606,7 +606,7 @@ int fastcall __lock_page_killable(struct page *page)
  * Variant of lock_page that does not require the caller to hold a reference
  * on the page's mapping.
  */
-void fastcall __lock_page_nosync(struct page *page)
+void __lock_page_nosync(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -1276,7 +1276,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int fastcall page_cache_read(struct file * file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page; 
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc..35d4773 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
 	return vaddr;
 }
 
-void fastcall *kmap_high(struct page *page)
+void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
 
 EXPORT_SYMBOL(kmap_high);
 
-void fastcall kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
 {
 	unsigned long vaddr;
 	unsigned long nr;
diff --git a/mm/internal.h b/mm/internal.h
index 953f941..1e34d24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
-extern void fastcall __init __free_pages_bootmem(struct page *page,
+extern void __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 1c81fc2..6a9c048 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1109,7 +1109,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages);
 
-pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+			spinlock_t **ptl)
 {
 	pgd_t * pgd = pgd_offset(mm, addr);
 	pud_t * pud = pud_alloc(mm, pgd, addr);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c689b60..a4ca162 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1073,7 +1073,7 @@ static int __set_page_dirty(struct page *page)
 	return 0;
 }
 
-int fastcall set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
 	int ret = __set_page_dirty(page);
 	if (ret)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 55fe57c..d73c133 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,7 +537,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 /*
  * permit the bootmem allocator to evade page validation on high-order frees
  */
-void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __init __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	if (order == 0) {
 		__ClearPageReserved(page);
@@ -974,7 +974,7 @@ void mark_free_pages(struct zone *zone)
 /*
  * Free a 0-order page
  */
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+static void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
@@ -1007,12 +1007,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	put_cpu();
 }
 
-void fastcall free_hot_page(struct page *page)
+void free_hot_page(struct page *page)
 {
 	free_hot_cold_page(page, 0);
 }
 	
-void fastcall free_cold_page(struct page *page)
+void free_cold_page(struct page *page)
 {
 	free_hot_cold_page(page, 1);
 }
@@ -1641,7 +1641,7 @@ EXPORT_SYMBOL(__alloc_pages);
 /*
  * Common helper functions.
  */
-fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page * page;
 	page = alloc_pages(gfp_mask, order);
@@ -1652,7 +1652,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 
 EXPORT_SYMBOL(__get_free_pages);
 
-fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
+unsigned long get_zeroed_page(gfp_t gfp_mask)
 {
 	struct page * page;
 
@@ -1678,7 +1678,7 @@ void __pagevec_free(struct pagevec *pvec)
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
 }
 
-fastcall void __free_pages(struct page *page, unsigned int order)
+void __free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
 		if (order == 0)
@@ -1690,7 +1690,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
 
 EXPORT_SYMBOL(__free_pages);
 
-fastcall void free_pages(unsigned long addr, unsigned int order)
+void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {
 		VM_BUG_ON(!virt_addr_valid((void *)addr));
diff --git a/mm/swap.c b/mm/swap.c
index 9ac8832..57b7e25 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -41,7 +41,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
  */
-static void fastcall __page_cache_release(struct page *page)
+static void __page_cache_release(struct page *page)
 {
 	if (PageLRU(page)) {
 		unsigned long flags;
@@ -165,7 +165,7 @@ int rotate_reclaimable_page(struct page *page)
 /*
  * FIXME: speed this up?
  */
-void fastcall activate_page(struct page *page)
+void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
 
@@ -186,7 +186,7 @@ void fastcall activate_page(struct page *page)
  * inactive,referenced		->	active,unreferenced
  * active,unreferenced		->	active,referenced
  */
-void fastcall mark_page_accessed(struct page *page)
+void mark_page_accessed(struct page *page)
 {
 	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
@@ -202,7 +202,7 @@ EXPORT_SYMBOL(mark_page_accessed);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-void fastcall lru_cache_add(struct page *page)
+void lru_cache_add(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 
@@ -212,7 +212,7 @@ void fastcall lru_cache_add(struct page *page)
 	put_cpu_var(lru_add_pvecs);
 }
 
-void fastcall lru_cache_add_active(struct page *page)
+void lru_cache_add_active(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
 
-- 
cgit v1.1


From ae1276b9349a2fd9c3afb4651e25a77ac03299d9 Mon Sep 17 00:00:00 2001
From: Qi Yong <qiyong@fc-cn.com>
Date: Mon, 4 Feb 2008 22:29:27 -0800
Subject: set_page_refcounted() VM_BUG_ON fix

The current PageTail semantic is that a PageTail page is first a
PageCompound page.  So remove the redundant PageCompound test in
set_page_refcounted().

Signed-off-by: Qi Yong <qiyong@fc-cn.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1e34d24..5a9a620 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	VM_BUG_ON(PageCompound(page) && PageTail(page));
+	VM_BUG_ON(PageTail(page));
 	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
-- 
cgit v1.1


From a2b345642f530054a92b8d2b5108436225a8093e Mon Sep 17 00:00:00 2001
From: Bjorn Steinbrink <B.Steinbrink@gmx.de>
Date: Mon, 4 Feb 2008 22:29:28 -0800
Subject: Fix dirty page accounting leak with ext3 data=journal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In 46d2277c796f9f4937bfa668c40b2e3f43e93dd0 ("Clean up and make
try_to_free_buffers() not race with dirty pages"), try_to_free_buffers
was changed to bail out if the page was dirty.

That in turn caused truncate_complete_page to leak massive amounts of
memory, because the dirty bit was only cleared after the call to
try_to_free_buffers.

So the call to cancel_dirty_page was moved up to have the dirty bit
cleared early in 3e67c0987d7567ad666641164a153dca9a43b11d ("truncate:
clear page dirtiness before running try_to_free_buffers()").

The problem with that fix is, that the page can be redirtied after
cancel_dirty_page was called, eg. like this:

truncate_complete_page()
  cancel_dirty_page() // PG_dirty cleared, decr. dirty pages
  do_invalidatepage()
    ext3_invalidatepage()
      journal_invalidatepage()
        journal_unmap_buffer()
          __dispose_buffer()
            __journal_unfile_buffer()
              __journal_temp_unlink_buffer()
                mark_buffer_dirty(); // PG_dirty set, incr. dirty pages

And then we end up with dirty pages being wrongly accounted.

As a result, in ecdfc9787fe527491baefc22dce8b2dbd5b2908d ("Resurrect
'try_to_free_buffers()' VM hackery") the changes to try_to_free_buffers
were reverted, so the original reason for the massive memory leak is
gone, and we can also revert the move of the call to cancel_dirty_page
from truncate_complete_page and get the accounting right again.

I'm not sure if it matters, but opposed to the final check in
__remove_from_page_cache, this one also cares about the task io
accounting, so maybe we want to use this instead, although it's not
quite the clean fix either.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Cc: Jan Kara <jack@ucw.cz>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Osterried <osterried@jesse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/truncate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 3855492..9838c05 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return;
 
-	cancel_dirty_page(page, PAGE_CACHE_SIZE);
-
 	if (PagePrivate(page))
 		do_invalidatepage(page, 0);
 
+	cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
 	remove_from_page_cache(page);
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
-- 
cgit v1.1


From e6f3602d2c58815775b2a293cec6650622236169 Mon Sep 17 00:00:00 2001
From: Larry Woodman <lwoodman@redhat.com>
Date: Mon, 4 Feb 2008 22:29:30 -0800
Subject: Include count of pagecache pages in show_mem() output

The show_mem() output does not include the total number of pagecache
pages.  This would be helpful when analyzing the debug information in
the /var/log/messages file after OOM kills occur.

This patch includes the total pagecache pages in that output.

Signed-off-by: Larry Woodman <lwoodman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d73c133..37576b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1874,6 +1874,8 @@ void show_free_areas(void)
 		printk("= %lukB\n", K(total));
 	}
 
+	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+
 	show_swap_cache_info();
 }
 
-- 
cgit v1.1


From b5beb1caff4ce063e6e2dc13f23b80eeef4e9782 Mon Sep 17 00:00:00 2001
From: Masatake YAMATO <yamato@redhat.com>
Date: Mon, 4 Feb 2008 22:29:31 -0800
Subject: check ADVICE of fadvise64_64 even if get_xip_page is given

I've written some test programs in ltp project.  During writing I met an
problem which I cannot solve in user land.  So I wrote a patch for linux
kernel.  Please, include this patch if acceptable.

The test program tests the 4th parameter of fadvise64_64:

    long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);

My test case calls fadvise64_64 with invalid advice value and checks errno is
set to EINVAL.  About the advice parameter man page says:

    ...
    Permissible values for advice include:

	   POSIX_FADV_NORMAL
                  ...
	   POSIX_FADV_SEQUENTIAL
                  ...
	   POSIX_FADV_RANDOM
		  ...
	   POSIX_FADV_NOREUSE
                  ...
	   POSIX_FADV_WILLNEED
                  ...
	   POSIX_FADV_DONTNEED
		  ...
    ERRORS
           ...
	   EINVAL An invalid value was specified for advice.

However, I got a bug report that the system call invocations
in my test case returned 0 unexpectedly.

I've inspected the kernel code:

    asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
    {
	    struct file *file = fget(fd);
	    struct address_space *mapping;
	    struct backing_dev_info *bdi;
	    loff_t endbyte;			/* inclusive */
	    pgoff_t start_index;
	    pgoff_t end_index;
	    unsigned long nrpages;
	    int ret = 0;

	    if (!file)
		    return -EBADF;

	    if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
		    ret = -ESPIPE;
		    goto out;
	    }

	    mapping = file->f_mapping;
	    if (!mapping || len < 0) {
		    ret = -EINVAL;
		    goto out;
	    }

	    if (mapping->a_ops->get_xip_page)
		    /* no bad return value, but ignore advice */
		    goto out;
    ...
    out:
	    fput(file);
	    return ret;
    }

I found the advice parameter is just ignored in the case
mapping->a_ops->get_xip_page is given. This behavior is different from
what is written on the man page. Is this o.k.?

get_xip_page is given if CONFIG_EXT2_FS_XIP is true.
Anyway I cannot find the easy way to detect get_xip_page
field is given or CONFIG_EXT2_FS_XIP is true from the
user space.

I propose the following patch which checks the advice parameter
even if get_xip_page is given.

Signed-off-by: Masatake YAMATO <yamato@redhat.com>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/fadvise.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c89..3c0f1e9 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
-	if (mapping->a_ops->get_xip_page)
-		/* no bad return value, but ignore advice */
+	if (mapping->a_ops->get_xip_page) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			ret = -EINVAL;
+		}
 		goto out;
+	}
 
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
-- 
cgit v1.1


From 7786fa9ac5366214fb942a9e62c6e46b4272c22c Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Mon, 4 Feb 2008 22:29:32 -0800
Subject: Document lowmem_reserve_ratio

Though the lower_zone_protection was changed to lowmem_reserve_ratio, the
document has been not changed.  The lowmem_reserve_ratio seems quite hard
to estimate, but there is no guidance.  This patch is to change document
for it.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Andrea Arcangeli <andrea@cpushare.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 80 ++++++++++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 07231af..e2799b5 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1336,7 +1336,7 @@ legacy_va_layout
 If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel
 will use the legacy (2.4) layout for all processes.
 
-lower_zone_protection
+lowmem_reserve_ratio
 ---------------------
 
 For some specialised workloads on highmem machines it is dangerous for
@@ -1356,25 +1356,71 @@ captured into pinned user memory.
 mechanism will also defend that region from allocations which could use
 highmem or lowmem).
 
-The `lower_zone_protection' tunable determines how aggressive the kernel is
-in defending these lower zones.  The default value is zero - no
-protection at all.
+The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
+in defending these lower zones.
 
 If you have a machine which uses highmem or ISA DMA and your
 applications are using mlock(), or if you are running with no swap then
-you probably should increase the lower_zone_protection setting.
-
-The units of this tunable are fairly vague.  It is approximately equal
-to "megabytes," so setting lower_zone_protection=100 will protect around 100
-megabytes of the lowmem zone from user allocations.  It will also make
-those 100 megabytes unavailable for use by applications and by
-pagecache, so there is a cost.
-
-The effects of this tunable may be observed by monitoring
-/proc/meminfo:LowFree.  Write a single huge file and observe the point
-at which LowFree ceases to fall.
-
-A reasonable value for lower_zone_protection is 100.
+you probably should change the lowmem_reserve_ratio setting.
+
+The lowmem_reserve_ratio is an array. You can see them by reading this file.
+-
+% cat /proc/sys/vm/lowmem_reserve_ratio
+256     256     32
+-
+Note: # of this elements is one fewer than number of zones. Because the highest
+      zone's value is not necessary for following calculation.
+
+But, these values are not used directly. The kernel calculates # of protection
+pages for each zones from them. These are shown as array of protection pages
+in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+Each zone has an array of protection pages like this.
+
+-
+Node 0, zone      DMA
+  pages free     1355
+        min      3
+        low      3
+        high     4
+	:
+	:
+    numa_other   0
+        protection: (0, 2004, 2004, 2004)
+	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  pagesets
+    cpu: 0 pcp: 0
+        :
+-
+These protections are added to score to judge whether this zone should be used
+for page allocation or should be reclaimed.
+
+In this example, if normal pages (index=2) are required to this DMA zone and
+pages_high is used for watermark, the kernel judges this zone should not be
+used because pages_free(1355) is smaller than watermark + protection[2]
+(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
+normal page requirement. If requirement is DMA zone(index=0), protection[0]
+(=0) is used.
+
+zone[i]'s protection[j] is calculated by following exprssion.
+
+(i < j):
+  zone[i]->protection[j]
+  = (total sums of present_pages from zone[i+1] to zone[j] on the node)
+    / lowmem_reserve_ratio[i];
+(i = j):
+   (should not be protected. = 0;
+(i > j):
+   (not necessary, but looks 0)
+
+The default values of lowmem_reserve_ratio[i] are
+    256 (if zone[i] means DMA or DMA32 zone)
+    32  (others).
+As above expression, they are reciprocal number of ratio.
+256 means 1/256. # of protection pages becomes about "0.39%" of total present
+pages of higher zones on the node.
+
+If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%).
 
 page-cluster
 ------------
-- 
cgit v1.1


From 62e1c55300f306e06478f460a7eefba085206e0b Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 4 Feb 2008 22:29:33 -0800
Subject: page migraton: handle orphaned pages

Orphaned page might have fs-private metadata, the page is truncated.  As
the page hasn't mapping, page migration refuse to migrate the page.  It
appears the page is only freed in page reclaim and if zone watermark is
low, the page is never freed, as a result migration always fail.  I thought
we could free the metadata so such page can be freed in migration and make
migration more reliable.

[akpm@linux-foundation.org: go direct to try_to_free_buffers()]
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c  | 30 ++++++++++++++++++++++++------
 mm/truncate.c |  2 +-
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 4ee4cca..857a987 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -640,15 +640,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		rcu_read_lock();
 		rcu_locked = 1;
 	}
+
 	/*
-	 * This is a corner case handling.
-	 * When a new swap-cache is read into, it is linked to LRU
-	 * and treated as swapcache but has no rmap yet.
-	 * Calling try_to_unmap() against a page->mapping==NULL page is
-	 * BUG. So handle it here.
+	 * Corner case handling:
+	 * 1. When a new swap-cache page is read into, it is added to the LRU
+	 * and treated as swapcache but it has no rmap yet.
+	 * Calling try_to_unmap() against a page->mapping==NULL page will
+	 * trigger a BUG.  So handle it here.
+	 * 2. An orphaned page (see truncate_complete_page) might have
+	 * fs-private metadata. The page can be picked up due to memory
+	 * offlining.  Everywhere else except page reclaim, the page is
+	 * invisible to the vm, so the page can not be migrated.  So try to
+	 * free the metadata, so the page can be freed.
 	 */
-	if (!page->mapping)
+	if (!page->mapping) {
+		if (!PageAnon(page) && PagePrivate(page)) {
+			/*
+			 * Go direct to try_to_free_buffers() here because
+			 * a) that's what try_to_release_page() would do anyway
+			 * b) we may be under rcu_read_lock() here, so we can't
+			 *    use GFP_KERNEL which is what try_to_release_page()
+			 *    needs to be effective.
+			 */
+			try_to_free_buffers(page);
+		}
 		goto rcu_unlock;
+	}
+
 	/* Establish migration ptes or remove ptes */
 	try_to_unmap(page, 1);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 9838c05..c35c49e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
 
 /*
  * If truncate cannot remove the fs-private metadata from the page, the page
- * becomes anonymous.  It will be left on the LRU and may even be mapped into
+ * becomes orphaned.  It will be left on the LRU and may even be mapped into
  * user pagetables if we're racing with filemap_fault().
  *
  * We need to bale out if page->mapping is no longer equal to the original
-- 
cgit v1.1


From 0ed361dec36945f3116ee1338638ada9a8920905 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 4 Feb 2008 22:29:34 -0800
Subject: mm: fix PageUptodate data race

After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.

Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.

Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.

Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked.  Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.

Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such.  Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem.  Both file and anonymous pages are
handled with the same barriers.

FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.

Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.

Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h    |  4 ----
 include/linux/page-flags.h | 42 +++++++++++++++++++++++++++++++++++++++---
 mm/hugetlb.c               |  2 ++
 mm/memory.c                |  9 +++++----
 mm/page_io.c               |  2 +-
 mm/swap_state.c            |  2 +-
 6 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 61a5e5e..7dcbc82 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -68,8 +68,6 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 	void *addr = kmap_atomic(page, KM_USER0);
 	clear_user_page(addr, vaddr, page);
 	kunmap_atomic(addr, KM_USER0);
-	/* Make sure this page is cleared on other CPU's too before using it */
-	smp_wmb();
 }
 
 #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
@@ -172,8 +170,6 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 	copy_user_page(vto, vfrom, vaddr, to);
 	kunmap_atomic(vfrom, KM_USER0);
 	kunmap_atomic(vto, KM_USER1);
-	/* Make sure this page is cleared on other CPU's too before using it */
-	smp_wmb();
 }
 
 #endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 209d3a4..bbad43f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,16 +131,52 @@
 #define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
 #define TestClearPageReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags)
 
-#define PageUptodate(page)	test_bit(PG_uptodate, &(page)->flags)
+static inline int PageUptodate(struct page *page)
+{
+	int ret = test_bit(PG_uptodate, &(page)->flags);
+
+	/*
+	 * Must ensure that the data we read out of the page is loaded
+	 * _after_ we've loaded page->flags to check for PageUptodate.
+	 * We can skip the barrier if the page is not uptodate, because
+	 * we wouldn't be reading anything from it.
+	 *
+	 * See SetPageUptodate() for the other side of the story.
+	 */
+	if (ret)
+		smp_rmb();
+
+	return ret;
+}
+
+static inline void __SetPageUptodate(struct page *page)
+{
+	smp_wmb();
+	__set_bit(PG_uptodate, &(page)->flags);
 #ifdef CONFIG_S390
+	page_clear_dirty(page);
+#endif
+}
+
 static inline void SetPageUptodate(struct page *page)
 {
+#ifdef CONFIG_S390
 	if (!test_and_set_bit(PG_uptodate, &page->flags))
 		page_clear_dirty(page);
-}
 #else
-#define SetPageUptodate(page)	set_bit(PG_uptodate, &(page)->flags)
+	/*
+	 * Memory barrier must be issued before setting the PG_uptodate bit,
+	 * so that all previous stores issued in order to bring the page
+	 * uptodate are actually visible before PageUptodate becomes true.
+	 *
+	 * s390 doesn't need an explicit smp_wmb here because the test and
+	 * set bit already provides full barriers.
+	 */
+	smp_wmb();
+	set_bit(PG_uptodate, &(page)->flags);
 #endif
+}
+
 #define ClearPageUptodate(page)	clear_bit(PG_uptodate, &(page)->flags)
 
 #define PageDirty(page)		test_bit(PG_dirty, &(page)->flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8..1a56420 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
 			goto out;
 		}
 		clear_huge_page(page, address);
+		__SetPageUptodate(page);
 
 		if (vma->vm_flags & VM_SHARED) {
 			int err;
diff --git a/mm/memory.c b/mm/memory.c
index 6a9c048..7bb7072 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1518,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 			memset(kaddr, 0, PAGE_SIZE);
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
-		return;
-
-	}
-	copy_user_highpage(dst, src, va, vma);
+	} else
+		copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1630,6 +1628,7 @@ gotten:
 	if (!new_page)
 		goto oom;
 	cow_user_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2102,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = alloc_zeroed_user_highpage_movable(vma, address);
 	if (!page)
 		goto oom;
+	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2202,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				goto out;
 			}
 			copy_user_highpage(page, vmf.page, address, vma);
+			__SetPageUptodate(page);
 		} else {
 			/*
 			 * If the page will be shareable, see if the backing
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f68..065c448 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
-	ClearPageUptodate(page);
+	BUG_ON(PageUptodate(page));
 	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
 				end_swap_bio_read);
 	if (bio == NULL) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 65b81c9..ec42f01 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -125,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 	int err;
 
 	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageUptodate(page));
 
 	for (;;) {
 		entry = get_swap_page();
@@ -147,7 +148,6 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 
 		switch (err) {
 		case 0:				/* Success */
-			SetPageUptodate(page);
 			SetPageDirty(page);
 			return 1;
 		case -EEXIST:
-- 
cgit v1.1


From a322f8ab66f50b6c0dcdb59abae84fede7a5fded Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Mon, 4 Feb 2008 22:29:35 -0800
Subject: mm: fix section mismatch warning in sparse.c

Fix following warning:
WARNING: mm/built-in.o(.text+0x22069): Section mismatch in reference from the function sparse_early_usemap_alloc() to the function .init.text:__alloc_bootmem_node()

static sparse_early_usemap_alloc() were used only by sparse_init()
and with sparse_init() annotated _init it is safe to
annotate sparse_early_usemap_alloc with __init too.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/sparse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index 7859c80..f6a43c0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-static unsigned long *sparse_early_usemap_alloc(unsigned long pnum)
+static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
 {
 	unsigned long *usemap;
 	struct mem_section *ms = __nr_to_section(pnum);
-- 
cgit v1.1


From 8bc3be2751b4f74ab90a446da1912fd8204d53f7 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Mon, 4 Feb 2008 22:29:36 -0800
Subject: writeback: speed up writeback of big dirty files

After making dirty a 100M file, the normal behavior is to start the
writeback for all data after 30s delays.  But sometimes the following
happens instead:

	- after 30s:    ~4M
	- after 5s:     ~4M
	- after 5s:     all remaining 92M

Some analyze shows that the internal io dispatch queues goes like this:

		s_io            s_more_io
		-------------------------
	1)	100M,1K         0
	2)	1K              96M
	3)	0               96M
1) initial state with a 100M file and a 1K file

2) 4M written, nr_to_write <= 0, so write more

3) 1K written, nr_to_write > 0, no more writes(BUG)

nr_to_write > 0 in (3) fools the upper layer to think that data have all
been written out.  The big dirty file is actually still sitting in
s_more_io.  We cannot simply splice s_more_io back to s_io as soon as s_io
becomes empty, and let the loop in generic_sync_sb_inodes() continue: this
may starve newly expired inodes in s_dirty.  It is also not an option to
draw inodes from both s_more_io and s_dirty, an let the loop go on: this
might lead to live locks, and might also starve other superblocks in sync
time(well kupdate may still starve some superblocks, that's another bug).

We have to return when a full scan of s_io completes.  So nr_to_write > 0
does not necessarily mean that "all data are written".  This patch
introduces a flag writeback_control.more_io to indicate that more io should
be done.  With it the big dirty file no longer has to wait for the next
kupdate invokation 5s later.

In sync_sb_inodes() we only set more_io on super_blocks we actually
visited.  This avoids the interaction between two pdflush deamons.

Also in __sync_single_inode() we don't blindly keep requeuing the io if the
filesystem cannot progress.  Failing to do so may lead to 100% iowait.

Tested-by: Mike Snitzer <snitzer@gmail.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Michael Rubin <mrubin@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c         | 18 ++++++++++++++++--
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       |  9 ++++++---
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3fe782d..0b30640 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -284,7 +284,17 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
-				requeue_io(inode);
+				if (wbc->nr_to_write <= 0) {
+					/*
+					 * slice used up: queue for next turn
+					 */
+					requeue_io(inode);
+				} else {
+					/*
+					 * somehow blocked: retry later
+					 */
+					redirty_tail(inode);
+				}
 			} else {
 				/*
 				 * Otherwise fully redirty the inode so that
@@ -468,8 +478,12 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		iput(inode);
 		cond_resched();
 		spin_lock(&inode_lock);
-		if (wbc->nr_to_write <= 0)
+		if (wbc->nr_to_write <= 0) {
+			wbc->more_io = 1;
 			break;
+		}
+		if (!list_empty(&sb->s_more_io))
+			wbc->more_io = 1;
 	}
 	return;		/* Leave any unwritten inodes on s_io */
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2cd826..b7b3362 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,6 +62,7 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
+	unsigned more_io:1;		/* more io to be dispatched */
 };
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a4ca162..5e00f17 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -567,6 +567,7 @@ static void background_writeout(unsigned long _min_pages)
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
 			break;
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
@@ -574,8 +575,9 @@ static void background_writeout(unsigned long _min_pages)
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			congestion_wait(WRITE, HZ/10);
-			if (!wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
+				congestion_wait(WRITE, HZ/10);
+			else
 				break;
 		}
 	}
@@ -640,11 +642,12 @@ static void wb_kupdate(unsigned long arg)
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
 				congestion_wait(WRITE, HZ/10);
 			else
 				break;	/* All the old data is written */
-- 
cgit v1.1


From 679299b32dbf9bac4bdaedc850fb95d0f81b4963 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:37 -0800
Subject: slob: fix free block merging at head of subpage

We weren't merging freed blocks at the beginning of the free list.  Fixing
this showed a 2.5% efficiency improvement in a userspace test harness.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slob.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/slob.c b/mm/slob.c
index 773a7aa..c56c5e5 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -398,6 +398,10 @@ static void slob_free(void *block, int size)
 	sp->units += units;
 
 	if (b < sp->free) {
+		if (b + units == sp->free) {
+			units += slob_units(sp->free);
+			sp->free = slob_next(sp->free);
+		}
 		set_slob(b, units, sp->free);
 		sp->free = b;
 	} else {
-- 
cgit v1.1


From 20cecbae44528d347c46e71f40650b75e0dcbc8e Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:37 -0800
Subject: slob: reduce external fragmentation by using three free lists

By putting smaller objects on their own list, we greatly reduce overall
external fragmentation and increase repeatability.  This reduces total SLOB
overhead from > 50% to ~6% on a simple boot test.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slob.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/mm/slob.c b/mm/slob.c
index c56c5e5..e2c3c0e 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -12,10 +12,17 @@
  * allocator is as little as 2 bytes, however typically most architectures
  * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
  *
- * The slob heap is a linked list of pages from alloc_pages(), and
- * within each page, there is a singly-linked list of free blocks (slob_t).
- * The heap is grown on demand and allocation from the heap is currently
- * first-fit.
+ * The slob heap is a set of linked list of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
  *
  * Above this is an implementation of kmalloc/kfree. Blocks returned
  * from kmalloc are prepended with a 4-byte header with the kmalloc size.
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp)
 }
 
 /*
- * All (partially) free slob pages go on this list.
+ * All partially free slob pages go on these lists.
  */
-static LIST_HEAD(free_slob_pages);
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
 
 /*
  * slob_page: True for all slob pages (false for bigblock pages)
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp)
 	return test_bit(PG_private, &sp->flags);
 }
 
-static inline void set_slob_page_free(struct slob_page *sp)
+static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
 {
-	list_add(&sp->list, &free_slob_pages);
+	list_add(&sp->list, list);
 	__set_bit(PG_private, &sp->flags);
 }
 
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
 	struct slob_page *sp;
 	struct list_head *prev;
+	struct list_head *slob_list;
 	slob_t *b = NULL;
 	unsigned long flags;
 
+	if (size < SLOB_BREAK1)
+		slob_list = &free_slob_small;
+	else if (size < SLOB_BREAK2)
+		slob_list = &free_slob_medium;
+	else
+		slob_list = &free_slob_large;
+
 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
-	list_for_each_entry(sp, &free_slob_pages, list) {
+	list_for_each_entry(sp, slob_list, list) {
 #ifdef CONFIG_NUMA
 		/*
 		 * If there's a node specification, search for a partial
@@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 		/* Improve fragment distribution and reduce our average
 		 * search time by starting our next search here. (see
 		 * Knuth vol 1, sec 2.5, pg 449) */
-		if (prev != free_slob_pages.prev &&
-				free_slob_pages.next != prev->next)
-			list_move_tail(&free_slob_pages, prev->next);
+		if (prev != slob_list->prev &&
+				slob_list->next != prev->next)
+			list_move_tail(slob_list, prev->next);
 		break;
 	}
 	spin_unlock_irqrestore(&slob_lock, flags);
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 		sp->free = b;
 		INIT_LIST_HEAD(&sp->list);
 		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
-		set_slob_page_free(sp);
+		set_slob_page_free(sp, slob_list);
 		b = slob_page_alloc(sp, size, align);
 		BUG_ON(!b);
 		spin_unlock_irqrestore(&slob_lock, flags);
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size)
 		set_slob(b, units,
 			(void *)((unsigned long)(b +
 					SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
-		set_slob_page_free(sp);
+		set_slob_page_free(sp, &free_slob_small);
 		goto out;
 	}
 
-- 
cgit v1.1


From 3729145821e3088a0c3c4183037fde356204bf97 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 4 Feb 2008 22:29:38 -0800
Subject: slob: correct Kconfig description

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index aaa7e7d..87f50df 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -656,11 +656,9 @@ config SLOB
 	depends on EMBEDDED
 	bool "SLOB (Simple Allocator)"
 	help
-	   SLOB replaces the SLAB allocator with a drastically simpler
-	   allocator.  SLOB is more space efficient than SLAB but does not
-	   scale well (single lock for all operations) and is also highly
-	   susceptible to fragmentation. SLUB can accomplish a higher object
-	   density. It is usually better to use SLUB instead of SLOB.
+	   SLOB replaces the stock allocator with a drastically simpler
+	   allocator. SLOB is generally more space efficient but
+	   does not perform as well on large systems.
 
 endchoice
 
-- 
cgit v1.1


From 42492594043d621a7910ff5877c3eb9202870b45 Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Mon, 4 Feb 2008 22:29:39 -0800
Subject: VFS/Security: Rework inode_getsecurity and callers to return
 resulting buffer

This patch modifies the interface to inode_getsecurity to have the function
return a buffer containing the security blob and its length via parameters
instead of relying on the calling function to give it an appropriately sized
buffer.

Security blobs obtained with this function should be freed using the
release_secctx LSM hook.  This alleviates the problem of the caller having to
guess a length and preallocate a buffer for this function allowing it to be
used elsewhere for Labeled NFS.

The patch also removed the unused err parameter.  The conversion is similar to
the one performed by Al Viro for the security_getprocattr hook.

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Chris Wright <chrisw@sous-sol.org>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xattr.c               | 30 ++++++++++++++++++++++++++++--
 include/linux/security.h | 21 +++++++++------------
 include/linux/xattr.h    |  1 +
 mm/shmem.c               |  3 +--
 security/dummy.c         |  2 +-
 security/security.c      |  4 ++--
 security/selinux/hooks.c | 43 +++++++++++++++----------------------------
 7 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 6645b73..1858552 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -105,6 +105,33 @@ out:
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
 ssize_t
+xattr_getsecurity(struct inode *inode, const char *name, void *value,
+			size_t size)
+{
+	void *buffer = NULL;
+	ssize_t len;
+
+	if (!value || !size) {
+		len = security_inode_getsecurity(inode, name, &buffer, false);
+		goto out_noalloc;
+	}
+
+	len = security_inode_getsecurity(inode, name, &buffer, true);
+	if (len < 0)
+		return len;
+	if (size < len) {
+		len = -ERANGE;
+		goto out;
+	}
+	memcpy(value, buffer, len);
+out:
+	security_release_secctx(buffer, len);
+out_noalloc:
+	return len;
+}
+EXPORT_SYMBOL_GPL(xattr_getsecurity);
+
+ssize_t
 vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
@@ -126,8 +153,7 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		int ret = security_inode_getsecurity(inode, suffix, value,
-						     size, error);
+		int ret = xattr_getsecurity(inode, suffix, value, size);
 		/*
 		 * Only overwrite the return value if a security module
 		 * is actually active.
diff --git a/include/linux/security.h b/include/linux/security.h
index d249742..9d289e7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -423,15 +423,12 @@ struct request_sock;
  * 	identified by @name for @dentry.
  * 	Return 0 if permission is granted.
  * @inode_getsecurity:
- *	Copy the extended attribute representation of the security label 
- *	associated with @name for @inode into @buffer.  @buffer may be
- *	NULL to request the size of the buffer required.  @size indicates
- *	the size of @buffer in bytes.  Note that @name is the remainder
- *	of the attribute name after the security. prefix has been removed.
- *	@err is the return value from the preceding fs getxattr call,
- *	and can be used by the security module to determine whether it
- *	should try and canonicalize the attribute value.
- *	Return number of bytes used/required on success.
+ *	Retrieve a copy of the extended attribute representation of the
+ *	security label associated with @name for @inode via @buffer.  Note that
+ *	@name is the remainder of the attribute name after the security prefix
+ *	has been removed. @alloc is used to specify of the call should return a
+ *	value via the buffer or just the value length Return size of buffer on
+ *	success.
  * @inode_setsecurity:
  *	Set the security label associated with @name for @inode from the
  *	extended attribute value @value.  @size indicates the size of the
@@ -1304,7 +1301,7 @@ struct security_operations {
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
 	int (*inode_need_killpriv) (struct dentry *dentry);
 	int (*inode_killpriv) (struct dentry *dentry);
-  	int (*inode_getsecurity)(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
+	int (*inode_getsecurity)(const struct inode *inode, const char *name, void **buffer, bool alloc);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   	int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size);
 
@@ -1565,7 +1562,7 @@ int security_inode_listxattr(struct dentry *dentry);
 int security_inode_removexattr(struct dentry *dentry, char *name);
 int security_inode_need_killpriv(struct dentry *dentry);
 int security_inode_killpriv(struct dentry *dentry);
-int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
+int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc);
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
 int security_file_permission(struct file *file, int mask);
@@ -1967,7 +1964,7 @@ static inline int security_inode_killpriv(struct dentry *dentry)
 	return cap_inode_killpriv(dentry);
 }
 
-static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index def131a..df6b95d 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -46,6 +46,7 @@ struct xattr_handler {
 		   size_t size, int flags);
 };
 
+ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t);
 ssize_t vfs_getxattr(struct dentry *, char *, void *, size_t);
 ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
 int vfs_setxattr(struct dentry *, char *, void *, size_t, int);
diff --git a/mm/shmem.c b/mm/shmem.c
index ee90244..0f246c4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1955,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return security_inode_getsecurity(inode, name, buffer, size,
-					  -EOPNOTSUPP);
+	return xattr_getsecurity(inode, name, buffer, size);
 }
 
 static int shmem_xattr_security_set(struct inode *inode, const char *name,
diff --git a/security/dummy.c b/security/dummy.c
index 48d4b0a..c505122 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -402,7 +402,7 @@ static int dummy_inode_killpriv(struct dentry *dentry)
 	return 0;
 }
 
-static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/security/security.c b/security/security.c
index ca475ca..b6c57a6 100644
--- a/security/security.c
+++ b/security/security.c
@@ -493,11 +493,11 @@ int security_inode_killpriv(struct dentry *dentry)
 	return security_ops->inode_killpriv(dentry);
 }
 
-int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
+int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc)
 {
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
-	return security_ops->inode_getsecurity(inode, name, buffer, size, err);
+	return security_ops->inode_getsecurity(inode, name, buffer, alloc);
 }
 
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index be6de0b..e5ed075 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -136,32 +136,6 @@ static DEFINE_SPINLOCK(sb_security_lock);
 
 static struct kmem_cache *sel_inode_cache;
 
-/* Return security context for a given sid or just the context 
-   length if the buffer is null or length is 0 */
-static int selinux_getsecurity(u32 sid, void *buffer, size_t size)
-{
-	char *context;
-	unsigned len;
-	int rc;
-
-	rc = security_sid_to_context(sid, &context, &len);
-	if (rc)
-		return rc;
-
-	if (!buffer || !size)
-		goto getsecurity_exit;
-
-	if (size < len) {
-		len = -ERANGE;
-		goto getsecurity_exit;
-	}
-	memcpy(buffer, context, len);
-
-getsecurity_exit:
-	kfree(context);
-	return len;
-}
-
 /**
  * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
  *
@@ -2675,14 +2649,27 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name)
  *
  * Permission check is handled by selinux_inode_getxattr hook.
  */
-static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc)
 {
+	u32 size;
+	int error;
+	char *context = NULL;
 	struct inode_security_struct *isec = inode->i_security;
 
 	if (strcmp(name, XATTR_SELINUX_SUFFIX))
 		return -EOPNOTSUPP;
 
-	return selinux_getsecurity(isec->sid, buffer, size);
+	error = security_sid_to_context(isec->sid, &context, &size);
+	if (error)
+		return error;
+	error = size;
+	if (alloc) {
+		*buffer = context;
+		goto out_nofree;
+	}
+	kfree(context);
+out_nofree:
+	return error;
 }
 
 static int selinux_inode_setsecurity(struct inode *inode, const char *name,
-- 
cgit v1.1


From 4bea58053f206be9a89ca35850f9ad295dac2042 Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Mon, 4 Feb 2008 22:29:40 -0800
Subject: VFS: Reorder vfs_getxattr to avoid unnecessary calls to the LSM

Originally vfs_getxattr would pull the security xattr variable using
the inode getxattr handle and then proceed to clobber it with a subsequent call
to the LSM.

This patch reorders the two operations such that when the xattr requested is
in the security namespace it first attempts to grab the value from the LSM
directly.

If it fails to obtain the value because there is no module present or the
module does not support the operation it will fall back to using the inode
getxattr operation.

In the event that both are inaccessible it returns EOPNOTSUPP.

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Chris Wright <chrisw@sous-sol.org>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xattr.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 1858552..f7c8f87 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -145,11 +145,6 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 	if (error)
 		return error;
 
-	if (inode->i_op->getxattr)
-		error = inode->i_op->getxattr(dentry, name, value, size);
-	else
-		error = -EOPNOTSUPP;
-
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
@@ -158,9 +153,15 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 		 * Only overwrite the return value if a security module
 		 * is actually active.
 		 */
-		if (ret != -EOPNOTSUPP)
-			error = ret;
+		if (ret == -EOPNOTSUPP)
+			goto nolsm;
+		return ret;
 	}
+nolsm:
+	if (inode->i_op->getxattr)
+		error = inode->i_op->getxattr(dentry, name, value, size);
+	else
+		error = -EOPNOTSUPP;
 
 	return error;
 }
-- 
cgit v1.1


From 8f6936f4d29aa14e54a2470b954a2e1f96322988 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:29:41 -0800
Subject: revert "capabilities: clean up file capability reading"

Revert b68680e4731abbd78863063aaa0dca2a6d8cc723 to make way for the next
patch: "Add 64-bit capability support to the kernel".

We want to keep the vfs_cap_data.data[] structure, using two 'data's for
64-bit caps (and later three for 96-bit caps), whereas
b68680e4731abbd78863063aaa0dca2a6d8cc723 had gotten rid of the 'data' struct
made its members inline.

The 64-bit caps patch keeps the stack abuse fix at get_file_caps(), which was
the more important part of that patch.

[akpm@linux-foundation.org: coding-style fixes]
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/capability.h |  6 ++++--
 security/commoncap.c       | 23 ++++++++---------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index bb017ed..7a8d7ad 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -56,8 +56,10 @@ typedef struct __user_cap_data_struct {
 
 struct vfs_cap_data {
 	__u32 magic_etc;  /* Little endian */
-	__u32 permitted;    /* Little endian */
-	__u32 inheritable;  /* Little endian */
+	struct {
+		__u32 permitted;    /* Little endian */
+		__u32 inheritable;  /* Little endian */
+	} data[1];
 };
 
 #ifdef __KERNEL__
diff --git a/security/commoncap.c b/security/commoncap.c
index ea61bc7..b06617b 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -197,8 +197,7 @@ int cap_inode_killpriv(struct dentry *dentry)
 	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
 }
 
-static inline int cap_from_disk(struct vfs_cap_data *caps,
-				struct linux_binprm *bprm,
+static inline int cap_from_disk(__le32 *caps, struct linux_binprm *bprm,
 				int size)
 {
 	__u32 magic_etc;
@@ -206,7 +205,7 @@ static inline int cap_from_disk(struct vfs_cap_data *caps,
 	if (size != XATTR_CAPS_SZ)
 		return -EINVAL;
 
-	magic_etc = le32_to_cpu(caps->magic_etc);
+	magic_etc = le32_to_cpu(caps[0]);
 
 	switch ((magic_etc & VFS_CAP_REVISION_MASK)) {
 	case VFS_CAP_REVISION:
@@ -214,8 +213,8 @@ static inline int cap_from_disk(struct vfs_cap_data *caps,
 			bprm->cap_effective = true;
 		else
 			bprm->cap_effective = false;
-		bprm->cap_permitted = to_cap_t(le32_to_cpu(caps->permitted));
-		bprm->cap_inheritable = to_cap_t(le32_to_cpu(caps->inheritable));
+		bprm->cap_permitted = to_cap_t(le32_to_cpu(caps[1]));
+		bprm->cap_inheritable = to_cap_t(le32_to_cpu(caps[2]));
 		return 0;
 	default:
 		return -EINVAL;
@@ -227,7 +226,7 @@ static int get_file_caps(struct linux_binprm *bprm)
 {
 	struct dentry *dentry;
 	int rc = 0;
-	struct vfs_cap_data incaps;
+	__le32 v1caps[XATTR_CAPS_SZ];
 	struct inode *inode;
 
 	if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) {
@@ -240,14 +239,8 @@ static int get_file_caps(struct linux_binprm *bprm)
 	if (!inode->i_op || !inode->i_op->getxattr)
 		goto out;
 
-	rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
-	if (rc > 0) {
-		if (rc == XATTR_CAPS_SZ)
-			rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS,
-						&incaps, XATTR_CAPS_SZ);
-		else
-			rc = -EINVAL;
-	}
+	rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &v1caps,
+							XATTR_CAPS_SZ);
 	if (rc == -ENODATA || rc == -EOPNOTSUPP) {
 		/* no data, that's ok */
 		rc = 0;
@@ -256,7 +249,7 @@ static int get_file_caps(struct linux_binprm *bprm)
 	if (rc < 0)
 		goto out;
 
-	rc = cap_from_disk(&incaps, bprm, rc);
+	rc = cap_from_disk(v1caps, bprm, rc);
 	if (rc)
 		printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
 			__FUNCTION__, rc, bprm->filename);
-- 
cgit v1.1


From e338d263a76af78fe8f38a72131188b58fceb591 Mon Sep 17 00:00:00 2001
From: Andrew Morgan <morgan@kernel.org>
Date: Mon, 4 Feb 2008 22:29:42 -0800
Subject: Add 64-bit capability support to the kernel

The patch supports legacy (32-bit) capability userspace, and where possible
translates 32-bit capabilities to/from userspace and the VFS to 64-bit
kernel space capabilities.  If a capability set cannot be compressed into
32-bits for consumption by user space, the system call fails, with -ERANGE.

FWIW libcap-2.00 supports this change (and earlier capability formats)

 http://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/

[akpm@linux-foundation.org: coding-syle fixes]
[akpm@linux-foundation.org: use get_task_comm()]
[ezk@cs.sunysb.edu: build fix]
[akpm@linux-foundation.org: do not initialise statics to 0 or NULL]
[akpm@linux-foundation.org: unused var]
[serue@us.ibm.com: export __cap_ symbols]
Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Erez Zadok <ezk@cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/auth.c             |  10 +-
 fs/proc/array.c            |  21 +++--
 include/linux/capability.h | 221 +++++++++++++++++++++++++++++++--------------
 kernel/capability.c        | 113 +++++++++++++++++++++--
 mm/oom_kill.c              |   5 +-
 security/commoncap.c       |  87 ++++++++++++------
 security/dummy.c           |  17 ++--
 7 files changed, 349 insertions(+), 125 deletions(-)

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 2192805..d13403e 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -11,8 +11,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/export.h>
 
-#define	CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
-
 int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
 {
 	struct exp_flavor_info *f;
@@ -69,10 +67,12 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	ret = set_current_groups(cred.cr_group_info);
 	put_group_info(cred.cr_group_info);
 	if ((cred.cr_uid)) {
-		cap_t(current->cap_effective) &= ~CAP_NFSD_MASK;
+		current->cap_effective =
+			cap_drop_nfsd_set(current->cap_effective);
 	} else {
-		cap_t(current->cap_effective) |= (CAP_NFSD_MASK &
-						  current->cap_permitted);
+		current->cap_effective =
+			cap_raise_nfsd_set(current->cap_effective,
+					   current->cap_permitted);
 	}
 	return ret;
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index b380313..6ba2746 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -281,14 +281,23 @@ static inline char *task_sig(struct task_struct *p, char *buffer)
 	return buffer;
 }
 
+static char *render_cap_t(const char *header, kernel_cap_t *a, char *buffer)
+{
+	unsigned __capi;
+
+	buffer += sprintf(buffer, "%s", header);
+	CAP_FOR_EACH_U32(__capi) {
+		buffer += sprintf(buffer, "%08x",
+				  a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]);
+	}
+	return buffer + sprintf(buffer, "\n");
+}
+
 static inline char *task_cap(struct task_struct *p, char *buffer)
 {
-    return buffer + sprintf(buffer, "CapInh:\t%016x\n"
-			    "CapPrm:\t%016x\n"
-			    "CapEff:\t%016x\n",
-			    cap_t(p->cap_inheritable),
-			    cap_t(p->cap_permitted),
-			    cap_t(p->cap_effective));
+	buffer = render_cap_t("CapInh:\t", &p->cap_inheritable, buffer);
+	buffer = render_cap_t("CapPrm:\t", &p->cap_permitted, buffer);
+	return render_cap_t("CapEff:\t", &p->cap_effective, buffer);
 }
 
 static inline char *task_context_switch_counts(struct task_struct *p,
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7a8d7ad..a934dac 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -23,13 +23,20 @@ struct task_struct;
    kernel might be somewhat backwards compatible, but don't bet on
    it. */
 
-/* XXX - Note, cap_t, is defined by POSIX to be an "opaque" pointer to
+/* Note, cap_t, is defined by POSIX (draft) to be an "opaque" pointer to
    a set of three capability sets.  The transposition of 3*the
    following structure to such a composite is better handled in a user
    library since the draft standard requires the use of malloc/free
    etc.. */
 
-#define _LINUX_CAPABILITY_VERSION  0x19980330
+#define _LINUX_CAPABILITY_VERSION_1  0x19980330
+#define _LINUX_CAPABILITY_U32S_1     1
+
+#define _LINUX_CAPABILITY_VERSION_2  0x20071026
+#define _LINUX_CAPABILITY_U32S_2     2
+
+#define _LINUX_CAPABILITY_VERSION    _LINUX_CAPABILITY_VERSION_2
+#define _LINUX_CAPABILITY_U32S       _LINUX_CAPABILITY_U32S_2
 
 typedef struct __user_cap_header_struct {
 	__u32 version;
@@ -42,43 +49,42 @@ typedef struct __user_cap_data_struct {
         __u32 inheritable;
 } __user *cap_user_data_t;
 
+
 #define XATTR_CAPS_SUFFIX "capability"
 #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
 
-#define XATTR_CAPS_SZ (3*sizeof(__le32))
 #define VFS_CAP_REVISION_MASK	0xFF000000
+#define VFS_CAP_FLAGS_MASK	~VFS_CAP_REVISION_MASK
+#define VFS_CAP_FLAGS_EFFECTIVE	0x000001
+
 #define VFS_CAP_REVISION_1	0x01000000
+#define VFS_CAP_U32_1           1
+#define XATTR_CAPS_SZ_1         (sizeof(__le32)*(1 + 2*VFS_CAP_U32_1))
 
-#define VFS_CAP_REVISION	VFS_CAP_REVISION_1
+#define VFS_CAP_REVISION_2	0x02000000
+#define VFS_CAP_U32_2           2
+#define XATTR_CAPS_SZ_2         (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
+
+#define XATTR_CAPS_SZ           XATTR_CAPS_SZ_2
+#define VFS_CAP_U32             VFS_CAP_U32_2
+#define VFS_CAP_REVISION	VFS_CAP_REVISION_2
 
-#define VFS_CAP_FLAGS_MASK	~VFS_CAP_REVISION_MASK
-#define VFS_CAP_FLAGS_EFFECTIVE	0x000001
 
 struct vfs_cap_data {
-	__u32 magic_etc;  /* Little endian */
+	__le32 magic_etc;            /* Little endian */
 	struct {
-		__u32 permitted;    /* Little endian */
-		__u32 inheritable;  /* Little endian */
-	} data[1];
+		__le32 permitted;    /* Little endian */
+		__le32 inheritable;  /* Little endian */
+	} data[VFS_CAP_U32];
 };
 
 #ifdef __KERNEL__
 
-/* #define STRICT_CAP_T_TYPECHECKS */
-
-#ifdef STRICT_CAP_T_TYPECHECKS
-
 typedef struct kernel_cap_struct {
-	__u32 cap;
+	__u32 cap[_LINUX_CAPABILITY_U32S];
 } kernel_cap_t;
 
-#else
-
-typedef __u32 kernel_cap_t;
-
-#endif
-
-#define _USER_CAP_HEADER_SIZE  (2*sizeof(__u32))
+#define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
 #endif
@@ -121,10 +127,6 @@ typedef __u32 kernel_cap_t;
 
 #define CAP_FSETID           4
 
-/* Used to decide between falling back on the old suser() or fsuser(). */
-
-#define CAP_FS_MASK          0x1f
-
 /* Overrides the restriction that the real or effective user ID of a
    process sending a signal must match the real or effective user ID
    of the process receiving the signal. */
@@ -147,8 +149,12 @@ typedef __u32 kernel_cap_t;
  ** Linux-specific capabilities
  **/
 
-/* Transfer any capability in your permitted set to any pid,
-   remove any capability in your permitted set from any pid */
+/* Without VFS support for capabilities:
+ *   Transfer any capability in your permitted set to any pid,
+ *   remove any capability in your permitted set from any pid
+ * With VFS support for capabilities (neither of above, but)
+ *   Add any capability to the current process' inheritable set
+ */
 
 #define CAP_SETPCAP          8
 
@@ -309,70 +315,153 @@ typedef __u32 kernel_cap_t;
 
 #define CAP_SETFCAP	     31
 
+/*
+ * Bit location of each capability (used by user-space library and kernel)
+ */
+
+#define CAP_TO_INDEX(x)     ((x) >> 5)        /* 1 << 5 == bits in __u32 */
+#define CAP_TO_MASK(x)      (1 << ((x) & 31)) /* mask for indexed __u32 */
+
 #ifdef __KERNEL__
 
 /*
  * Internal kernel functions only
  */
 
-#ifdef STRICT_CAP_T_TYPECHECKS
+#define CAP_FOR_EACH_U32(__capi)  \
+	for (__capi = 0; __capi < _LINUX_CAPABILITY_U32S; ++__capi)
+
+# define CAP_FS_MASK_B0     (CAP_TO_MASK(CAP_CHOWN)		\
+			    | CAP_TO_MASK(CAP_DAC_OVERRIDE)	\
+			    | CAP_TO_MASK(CAP_DAC_READ_SEARCH)	\
+			    | CAP_TO_MASK(CAP_FOWNER)		\
+			    | CAP_TO_MASK(CAP_FSETID))
+
+#if _LINUX_CAPABILITY_U32S != 2
+# error Fix up hand-coded capability macro initializers
+#else /* HAND-CODED capability initializers */
+
+# define CAP_EMPTY_SET    {{ 0, 0 }}
+# define CAP_FULL_SET     {{ ~0, ~0 }}
+# define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}
+# define CAP_FS_SET       {{ CAP_FS_MASK_B0, 0 }}
+# define CAP_NFSD_SET     {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), 0 }}
+
+#endif /* _LINUX_CAPABILITY_U32S != 2 */
+
+#define CAP_INIT_INH_SET    CAP_EMPTY_SET
+
+# define cap_clear(c)         do { (c) = __cap_empty_set; } while (0)
+# define cap_set_full(c)      do { (c) = __cap_full_set; } while (0)
+# define cap_set_init_eff(c)  do { (c) = __cap_init_eff_set; } while (0)
+
+#define cap_raise(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
+#define cap_lower(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag))
+#define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag))
+
+#define CAP_BOP_ALL(c, a, b, OP)                                    \
+do {                                                                \
+	unsigned __capi;                                            \
+	CAP_FOR_EACH_U32(__capi) {                                  \
+		c.cap[__capi] = a.cap[__capi] OP b.cap[__capi];     \
+	}                                                           \
+} while (0)
+
+#define CAP_UOP_ALL(c, a, OP)                                       \
+do {                                                                \
+	unsigned __capi;                                            \
+	CAP_FOR_EACH_U32(__capi) {                                  \
+		c.cap[__capi] = OP a.cap[__capi];                   \
+	}                                                           \
+} while (0)
+
+static inline kernel_cap_t cap_combine(const kernel_cap_t a,
+				       const kernel_cap_t b)
+{
+	kernel_cap_t dest;
+	CAP_BOP_ALL(dest, a, b, |);
+	return dest;
+}
 
-#define to_cap_t(x) { x }
-#define cap_t(x) (x).cap
+static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
+					 const kernel_cap_t b)
+{
+	kernel_cap_t dest;
+	CAP_BOP_ALL(dest, a, b, &);
+	return dest;
+}
 
-#else
+static inline kernel_cap_t cap_drop(const kernel_cap_t a,
+				    const kernel_cap_t drop)
+{
+	kernel_cap_t dest;
+	CAP_BOP_ALL(dest, a, drop, &~);
+	return dest;
+}
 
-#define to_cap_t(x) (x)
-#define cap_t(x) (x)
+static inline kernel_cap_t cap_invert(const kernel_cap_t c)
+{
+	kernel_cap_t dest;
+	CAP_UOP_ALL(dest, c, ~);
+	return dest;
+}
 
-#endif
+static inline int cap_isclear(const kernel_cap_t a)
+{
+	unsigned __capi;
+	CAP_FOR_EACH_U32(__capi) {
+		if (a.cap[__capi] != 0)
+			return 0;
+	}
+	return 1;
+}
 
-#define CAP_EMPTY_SET       to_cap_t(0)
-#define CAP_FULL_SET        to_cap_t(~0)
-#define CAP_INIT_EFF_SET    to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP))
-#define CAP_INIT_INH_SET    to_cap_t(0)
+static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
+{
+	kernel_cap_t dest;
+	dest = cap_drop(a, set);
+	return cap_isclear(dest);
+}
 
-#define CAP_TO_MASK(x) (1 << (x))
-#define cap_raise(c, flag)   (cap_t(c) |=  CAP_TO_MASK(flag))
-#define cap_lower(c, flag)   (cap_t(c) &= ~CAP_TO_MASK(flag))
-#define cap_raised(c, flag)  (cap_t(c) & CAP_TO_MASK(flag))
+/* Used to decide between falling back on the old suser() or fsuser(). */
 
-static inline kernel_cap_t cap_combine(kernel_cap_t a, kernel_cap_t b)
+static inline int cap_is_fs_cap(int cap)
 {
-     kernel_cap_t dest;
-     cap_t(dest) = cap_t(a) | cap_t(b);
-     return dest;
+	const kernel_cap_t __cap_fs_set = CAP_FS_SET;
+	return !!(CAP_TO_MASK(cap) & __cap_fs_set.cap[CAP_TO_INDEX(cap)]);
 }
 
-static inline kernel_cap_t cap_intersect(kernel_cap_t a, kernel_cap_t b)
+static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
 {
-     kernel_cap_t dest;
-     cap_t(dest) = cap_t(a) & cap_t(b);
-     return dest;
+	const kernel_cap_t __cap_fs_set = CAP_FS_SET;
+	return cap_drop(a, __cap_fs_set);
 }
 
-static inline kernel_cap_t cap_drop(kernel_cap_t a, kernel_cap_t drop)
+static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
+					    const kernel_cap_t permitted)
 {
-     kernel_cap_t dest;
-     cap_t(dest) = cap_t(a) & ~cap_t(drop);
-     return dest;
+	const kernel_cap_t __cap_fs_set = CAP_FS_SET;
+	return cap_combine(a,
+			   cap_intersect(permitted, __cap_fs_set));
 }
 
-static inline kernel_cap_t cap_invert(kernel_cap_t c)
+static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
 {
-     kernel_cap_t dest;
-     cap_t(dest) = ~cap_t(c);
-     return dest;
+	const kernel_cap_t __cap_fs_set = CAP_NFSD_SET;
+	return cap_drop(a, __cap_fs_set);
 }
 
-#define cap_isclear(c)       (!cap_t(c))
-#define cap_issubset(a,set)  (!(cap_t(a) & ~cap_t(set)))
-
-#define cap_clear(c)         do { cap_t(c) =  0; } while(0)
-#define cap_set_full(c)      do { cap_t(c) = ~0; } while(0)
-#define cap_mask(c,mask)     do { cap_t(c) &= cap_t(mask); } while(0)
+static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
+					      const kernel_cap_t permitted)
+{
+	const kernel_cap_t __cap_nfsd_set = CAP_NFSD_SET;
+	return cap_combine(a,
+			   cap_intersect(permitted, __cap_nfsd_set));
+}
 
-#define cap_is_fs_cap(c)     (CAP_TO_MASK(c) & CAP_FS_MASK)
+extern const kernel_cap_t __cap_empty_set;
+extern const kernel_cap_t __cap_full_set;
+extern const kernel_cap_t __cap_init_eff_set;
 
 int capable(int cap);
 int __capable(struct task_struct *t, int cap);
diff --git a/kernel/capability.c b/kernel/capability.c
index efbd9cd..39e8193 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,6 +22,37 @@
 static DEFINE_SPINLOCK(task_capability_lock);
 
 /*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+const kernel_cap_t __cap_full_set = CAP_FULL_SET;
+const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(__cap_empty_set);
+EXPORT_SYMBOL(__cap_full_set);
+EXPORT_SYMBOL(__cap_init_eff_set);
+
+/*
+ * More recent versions of libcap are available from:
+ *
+ *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+static void warn_legacy_capability_use(void)
+{
+	static int warned;
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
+		       " (legacy support in use)\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
  * For sys_getproccap() and sys_setproccap(), any of the three
  * capability set pointers may be NULL -- indicating that that set is
  * uninteresting and/or not to be changed.
@@ -42,12 +73,21 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	pid_t pid;
 	__u32 version;
 	struct task_struct *target;
-	struct __user_cap_data_struct data;
+	unsigned tocopy;
+	kernel_cap_t pE, pI, pP;
 
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -71,14 +111,47 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	} else
 		target = current;
 
-	ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
+	ret = security_capget(target, &pE, &pI, &pP);
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
-	if (!ret && copy_to_user(dataptr, &data, sizeof data))
-		return -EFAULT;
+	if (!ret) {
+		struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+		unsigned i;
+
+		for (i = 0; i < tocopy; i++) {
+			kdata[i].effective = pE.cap[i];
+			kdata[i].permitted = pP.cap[i];
+			kdata[i].inheritable = pI.cap[i];
+		}
+
+		/*
+		 * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+		 * we silently drop the upper capabilities here. This
+		 * has the effect of making older libcap
+		 * implementations implicitly drop upper capability
+		 * bits when they perform a: capget/modify/capset
+		 * sequence.
+		 *
+		 * This behavior is considered fail-safe
+		 * behavior. Upgrading the application to a newer
+		 * version of libcap will enable access to the newer
+		 * capabilities.
+		 *
+		 * An alternative would be to return an error here
+		 * (-ERANGE), but that causes legacy applications to
+		 * unexpectidly fail; the capget/modify/capset aborts
+		 * before modification is attempted and the application
+		 * fails.
+		 */
+
+		if (copy_to_user(dataptr, kdata, tocopy
+				 * sizeof(struct __user_cap_data_struct))) {
+			return -EFAULT;
+		}
+	}
 
 	return ret;
 }
@@ -167,6 +240,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
+	struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
 	__u32 version;
 	struct task_struct *target;
@@ -176,7 +251,15 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
-	if (version != _LINUX_CAPABILITY_VERSION) {
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		tocopy = _LINUX_CAPABILITY_U32S_2;
+		break;
+	default:
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
@@ -188,10 +271,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
 		return -EPERM;
 
-	if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
-	    copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
-	    copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+	if (copy_from_user(&kdata, data, tocopy
+			   * sizeof(struct __user_cap_data_struct))) {
 		return -EFAULT;
+	}
+
+	for (i = 0; i < tocopy; i++) {
+		effective.cap[i] = kdata[i].effective;
+		permitted.cap[i] = kdata[i].permitted;
+		inheritable.cap[i] = kdata[i].inheritable;
+	}
+	while (i < _LINUX_CAPABILITY_U32S) {
+		effective.cap[i] = 0;
+		permitted.cap[i] = 0;
+		inheritable.cap[i] = 0;
+		i++;
+	}
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 96473b4..320d74e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -125,8 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * Superuser processes are usually more important, so we make it
 	 * less likely that we kill those.
 	 */
-	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
-				p->uid == 0 || p->euid == 0)
+	if (__capable(p, CAP_SYS_ADMIN) || p->uid == 0 || p->euid == 0)
 		points /= 4;
 
 	/*
@@ -135,7 +134,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * tend to only have this flag set on applications they think
 	 * of as important.
 	 */
-	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+	if (__capable(p, CAP_SYS_RAWIO))
 		points /= 4;
 
 	/*
diff --git a/security/commoncap.c b/security/commoncap.c
index b06617b..01ab478 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1,4 +1,4 @@
-/* Common capabilities, needed by capability.o and root_plug.o 
+/* Common capabilities, needed by capability.o and root_plug.o
  *
  *	This program is free software; you can redistribute it and/or modify
  *	it under the terms of the GNU General Public License as published by
@@ -93,9 +93,9 @@ int cap_capget (struct task_struct *target, kernel_cap_t *effective,
 		kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
 	/* Derived from kernel/capability.c:sys_capget. */
-	*effective = cap_t (target->cap_effective);
-	*inheritable = cap_t (target->cap_inheritable);
-	*permitted = cap_t (target->cap_permitted);
+	*effective = target->cap_effective;
+	*inheritable = target->cap_inheritable;
+	*permitted = target->cap_permitted;
 	return 0;
 }
 
@@ -197,28 +197,51 @@ int cap_inode_killpriv(struct dentry *dentry)
 	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
 }
 
-static inline int cap_from_disk(__le32 *caps, struct linux_binprm *bprm,
-				int size)
+static inline int cap_from_disk(struct vfs_cap_data *caps,
+				struct linux_binprm *bprm, unsigned size)
 {
 	__u32 magic_etc;
+	unsigned tocopy, i;
 
-	if (size != XATTR_CAPS_SZ)
+	if (size < sizeof(magic_etc))
 		return -EINVAL;
 
-	magic_etc = le32_to_cpu(caps[0]);
+	magic_etc = le32_to_cpu(caps->magic_etc);
 
 	switch ((magic_etc & VFS_CAP_REVISION_MASK)) {
-	case VFS_CAP_REVISION:
-		if (magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
-			bprm->cap_effective = true;
-		else
-			bprm->cap_effective = false;
-		bprm->cap_permitted = to_cap_t(le32_to_cpu(caps[1]));
-		bprm->cap_inheritable = to_cap_t(le32_to_cpu(caps[2]));
-		return 0;
+	case VFS_CAP_REVISION_1:
+		if (size != XATTR_CAPS_SZ_1)
+			return -EINVAL;
+		tocopy = VFS_CAP_U32_1;
+		break;
+	case VFS_CAP_REVISION_2:
+		if (size != XATTR_CAPS_SZ_2)
+			return -EINVAL;
+		tocopy = VFS_CAP_U32_2;
+		break;
 	default:
 		return -EINVAL;
 	}
+
+	if (magic_etc & VFS_CAP_FLAGS_EFFECTIVE) {
+		bprm->cap_effective = true;
+	} else {
+		bprm->cap_effective = false;
+	}
+
+	for (i = 0; i < tocopy; ++i) {
+		bprm->cap_permitted.cap[i] =
+			le32_to_cpu(caps->data[i].permitted);
+		bprm->cap_inheritable.cap[i] =
+			le32_to_cpu(caps->data[i].inheritable);
+	}
+	while (i < VFS_CAP_U32) {
+		bprm->cap_permitted.cap[i] = 0;
+		bprm->cap_inheritable.cap[i] = 0;
+		i++;
+	}
+
+	return 0;
 }
 
 /* Locate any VFS capabilities: */
@@ -226,7 +249,7 @@ static int get_file_caps(struct linux_binprm *bprm)
 {
 	struct dentry *dentry;
 	int rc = 0;
-	__le32 v1caps[XATTR_CAPS_SZ];
+	struct vfs_cap_data vcaps;
 	struct inode *inode;
 
 	if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID) {
@@ -239,8 +262,8 @@ static int get_file_caps(struct linux_binprm *bprm)
 	if (!inode->i_op || !inode->i_op->getxattr)
 		goto out;
 
-	rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &v1caps,
-							XATTR_CAPS_SZ);
+	rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &vcaps,
+				   XATTR_CAPS_SZ);
 	if (rc == -ENODATA || rc == -EOPNOTSUPP) {
 		/* no data, that's ok */
 		rc = 0;
@@ -249,7 +272,7 @@ static int get_file_caps(struct linux_binprm *bprm)
 	if (rc < 0)
 		goto out;
 
-	rc = cap_from_disk(v1caps, bprm, rc);
+	rc = cap_from_disk(&vcaps, bprm, rc);
 	if (rc)
 		printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
 			__FUNCTION__, rc, bprm->filename);
@@ -344,8 +367,10 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	 * capability rules */
 	if (!is_global_init(current)) {
 		current->cap_permitted = new_permitted;
-		current->cap_effective = bprm->cap_effective ?
-				new_permitted : 0;
+		if (bprm->cap_effective)
+			current->cap_effective = new_permitted;
+		else
+			cap_clear(current->cap_effective);
 	}
 
 	/* AUD: Audit candidate if current->cap_effective is set */
@@ -467,13 +492,15 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
 
 			if (!issecure (SECURE_NO_SETUID_FIXUP)) {
 				if (old_fsuid == 0 && current->fsuid != 0) {
-					cap_t (current->cap_effective) &=
-					    ~CAP_FS_MASK;
+					current->cap_effective =
+						cap_drop_fs_set(
+						    current->cap_effective);
 				}
 				if (old_fsuid != 0 && current->fsuid == 0) {
-					cap_t (current->cap_effective) |=
-					    (cap_t (current->cap_permitted) &
-					     CAP_FS_MASK);
+					current->cap_effective =
+						cap_raise_fs_set(
+						    current->cap_effective,
+						    current->cap_permitted);
 				}
 			}
 			break;
@@ -577,9 +604,9 @@ int cap_task_kill(struct task_struct *p, struct siginfo *info,
 
 void cap_task_reparent_to_init (struct task_struct *p)
 {
-	p->cap_effective = CAP_INIT_EFF_SET;
-	p->cap_inheritable = CAP_INIT_INH_SET;
-	p->cap_permitted = CAP_FULL_SET;
+	cap_set_init_eff(p->cap_effective);
+	cap_clear(p->cap_inheritable);
+	cap_set_full(p->cap_permitted);
 	p->keep_capabilities = 0;
 	return;
 }
diff --git a/security/dummy.c b/security/dummy.c
index c505122..649326b 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -36,14 +36,19 @@ static int dummy_ptrace (struct task_struct *parent, struct task_struct *child)
 static int dummy_capget (struct task_struct *target, kernel_cap_t * effective,
 			 kernel_cap_t * inheritable, kernel_cap_t * permitted)
 {
-	*effective = *inheritable = *permitted = 0;
 	if (target->euid == 0) {
-		*permitted |= (~0 & ~CAP_FS_MASK);
-		*effective |= (~0 & ~CAP_TO_MASK(CAP_SETPCAP) & ~CAP_FS_MASK);
+		cap_set_full(*permitted);
+		cap_set_init_eff(*effective);
+	} else {
+		cap_clear(*permitted);
+		cap_clear(*effective);
 	}
-	if (target->fsuid == 0) {
-		*permitted |= CAP_FS_MASK;
-		*effective |= CAP_FS_MASK;
+
+	cap_clear(*inheritable);
+
+	if (target->fsuid != 0) {
+		*permitted = cap_drop_fs_set(*permitted);
+		*effective = cap_drop_fs_set(*effective);
 	}
 	return 0;
 }
-- 
cgit v1.1


From 46c383cc4530ccc438cb325e92e11eb21dd3d4fc Mon Sep 17 00:00:00 2001
From: Andrew Morgan <andrew@nagrom.org>
Date: Mon, 4 Feb 2008 22:29:43 -0800
Subject: Remove unnecessary include from include/linux/capability.h

KaiGai Kohei observed that this line in the linux header is not needed.

Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Cc: KaiGai Kohei <kaigai@kaigai.gr.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/capability.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index a934dac..a1d93da 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,7 +14,6 @@
 #define _LINUX_CAPABILITY_H
 
 #include <linux/types.h>
-#include <linux/compiler.h>
 
 struct task_struct;
 
-- 
cgit v1.1


From 3b7391de67da515c91f48aa371de77cb6cc5c07e Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 4 Feb 2008 22:29:45 -0800
Subject: capabilities: introduce per-process capability bounding set

The capability bounding set is a set beyond which capabilities cannot grow.
 Currently cap_bset is per-system.  It can be manipulated through sysctl,
but only init can add capabilities.  Root can remove capabilities.  By
default it includes all caps except CAP_SETPCAP.

This patch makes the bounding set per-process when file capabilities are
enabled.  It is inherited at fork from parent.  Noone can add elements,
CAP_SETPCAP is required to remove them.

One example use of this is to start a safer container.  For instance, until
device namespaces or per-container device whitelists are introduced, it is
best to take CAP_MKNOD away from a container.

The bounding set will not affect pP and pE immediately.  It will only
affect pP' and pE' after subsequent exec()s.  It also does not affect pI,
and exec() does not constrain pI'.  So to really start a shell with no way
of regain CAP_MKNOD, you would do

	prctl(PR_CAPBSET_DROP, CAP_MKNOD);
	cap_t cap = cap_get_proc();
	cap_value_t caparray[1];
	caparray[0] = CAP_MKNOD;
	cap_set_flag(cap, CAP_INHERITABLE, 1, caparray, CAP_DROP);
	cap_set_proc(cap);
	cap_free(cap);

The following test program will get and set the bounding
set (but not pI).  For instance

	./bset get
		(lists capabilities in bset)
	./bset drop cap_net_raw
		(starts shell with new bset)
		(use capset, setuid binary, or binary with
		file capabilities to try to increase caps)

************************************************************
cap_bound.c
************************************************************
 #include <sys/prctl.h>
 #include <linux/capability.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #ifndef PR_CAPBSET_READ
 #define PR_CAPBSET_READ 23
 #endif

 #ifndef PR_CAPBSET_DROP
 #define PR_CAPBSET_DROP 24
 #endif

int usage(char *me)
{
	printf("Usage: %s get\n", me);
	printf("       %s drop <capability>\n", me);
	return 1;
}

 #define numcaps 32
char *captable[numcaps] = {
	"cap_chown",
	"cap_dac_override",
	"cap_dac_read_search",
	"cap_fowner",
	"cap_fsetid",
	"cap_kill",
	"cap_setgid",
	"cap_setuid",
	"cap_setpcap",
	"cap_linux_immutable",
	"cap_net_bind_service",
	"cap_net_broadcast",
	"cap_net_admin",
	"cap_net_raw",
	"cap_ipc_lock",
	"cap_ipc_owner",
	"cap_sys_module",
	"cap_sys_rawio",
	"cap_sys_chroot",
	"cap_sys_ptrace",
	"cap_sys_pacct",
	"cap_sys_admin",
	"cap_sys_boot",
	"cap_sys_nice",
	"cap_sys_resource",
	"cap_sys_time",
	"cap_sys_tty_config",
	"cap_mknod",
	"cap_lease",
	"cap_audit_write",
	"cap_audit_control",
	"cap_setfcap"
};

int getbcap(void)
{
	int comma=0;
	unsigned long i;
	int ret;

	printf("i know of %d capabilities\n", numcaps);
	printf("capability bounding set:");
	for (i=0; i<numcaps; i++) {
		ret = prctl(PR_CAPBSET_READ, i);
		if (ret < 0)
			perror("prctl");
		else if (ret==1)
			printf("%s%s", (comma++) ? ", " : " ", captable[i]);
	}
	printf("\n");
	return 0;
}

int capdrop(char *str)
{
	unsigned long i;

	int found=0;
	for (i=0; i<numcaps; i++) {
		if (strcmp(captable[i], str) == 0) {
			found=1;
			break;
		}
	}
	if (!found)
		return 1;
	if (prctl(PR_CAPBSET_DROP, i)) {
		perror("prctl");
		return 1;
	}
	return 0;
}

int main(int argc, char *argv[])
{
	if (argc<2)
		return usage(argv[0]);
	if (strcmp(argv[1], "get")==0)
		return getbcap();
	if (strcmp(argv[1], "drop")!=0 || argc<3)
		return usage(argv[0]);
	if (capdrop(argv[2])) {
		printf("unknown capability\n");
		return 1;
	}
	return execl("/bin/bash", "/bin/bash", NULL);
}
************************************************************

[serue@us.ibm.com: fix typo]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>a
Signed-off-by: "Serge E. Hallyn" <serue@us.ibm.com>
Tested-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/capability.h | 11 +++++++++--
 include/linux/init_task.h  | 13 +++++++++++++
 include/linux/prctl.h      |  4 ++++
 include/linux/sched.h      |  2 +-
 include/linux/security.h   |  5 -----
 include/linux/sysctl.h     |  3 ---
 kernel/fork.c              |  1 +
 kernel/sys.c               | 13 ++++++++++++-
 kernel/sysctl.c            | 35 -----------------------------------
 kernel/sysctl_check.c      |  7 -------
 security/commoncap.c       | 44 +++++++++++++++++++++++++++-----------------
 11 files changed, 67 insertions(+), 71 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index a1d93da..ffe7bab8 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -152,7 +152,9 @@ typedef struct kernel_cap_struct {
  *   Transfer any capability in your permitted set to any pid,
  *   remove any capability in your permitted set from any pid
  * With VFS support for capabilities (neither of above, but)
- *   Add any capability to the current process' inheritable set
+ *   Add any capability from current's capability bounding set
+ *       to the current process' inheritable set
+ *   Allow taking bits out of capability bounding set
  */
 
 #define CAP_SETPCAP          8
@@ -202,7 +204,6 @@ typedef struct kernel_cap_struct {
 #define CAP_IPC_OWNER        15
 
 /* Insert and remove kernel modules - modify kernel without limit */
-/* Modify cap_bset */
 #define CAP_SYS_MODULE       16
 
 /* Allow ioperm/iopl access */
@@ -314,6 +315,10 @@ typedef struct kernel_cap_struct {
 
 #define CAP_SETFCAP	     31
 
+#define CAP_LAST_CAP         CAP_SETFCAP
+
+#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
+
 /*
  * Bit location of each capability (used by user-space library and kernel)
  */
@@ -465,6 +470,8 @@ extern const kernel_cap_t __cap_init_eff_set;
 int capable(int cap);
 int __capable(struct task_struct *t, int cap);
 
+extern long cap_prctl_drop(unsigned long cap);
+
 #endif /* __KERNEL__ */
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f42663e..1f74e1d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -121,6 +121,18 @@ extern struct group_info init_groups;
 #else
 #define INIT_IDS
 #endif
+
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+/*
+ * Because of the reduced scope of CAP_SETPCAP when filesystem
+ * capabilities are in effect, it is safe to allow CAP_SETPCAP to
+ * be available in the default configuration.
+ */
+# define CAP_INIT_BSET  CAP_FULL_SET
+#else
+# define CAP_INIT_BSET  CAP_INIT_EFF_SET
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -156,6 +168,7 @@ extern struct group_info init_groups;
 	.cap_effective	= CAP_INIT_EFF_SET,				\
 	.cap_inheritable = CAP_INIT_INH_SET,				\
 	.cap_permitted	= CAP_FULL_SET,					\
+	.cap_bset 	= CAP_INIT_BSET,				\
 	.keep_capabilities = 0,						\
 	.user		= INIT_USER,					\
 	.comm		= "swapper",					\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index e2eff90..3800639 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -63,4 +63,8 @@
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
 
+/* Get/set the capability bounding set */
+#define PR_CAPBSET_READ 23
+#define PR_CAPBSET_DROP 24
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c30d174..9c13be3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1098,7 +1098,7 @@ struct task_struct {
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
 	struct group_info *group_info;
-	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
 	unsigned keep_capabilities:1;
 	struct user_struct *user;
 #ifdef CONFIG_KEYS
diff --git a/include/linux/security.h b/include/linux/security.h
index 9d289e7..fe52cde 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -40,11 +40,6 @@
 #define ROOTCONTEXT_MNT		0x04
 #define DEFCONTEXT_MNT		0x08
 
-/*
- * Bounding set
- */
-extern kernel_cap_t cap_bset;
-
 extern unsigned securebits;
 
 struct ctl_table;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index bf4ae4e..571f01d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -102,7 +102,6 @@ enum
 	KERN_NODENAME=7,
 	KERN_DOMAINNAME=8,
 
-	KERN_CAP_BSET=14,	/* int: capability bounding set */
 	KERN_PANIC=15,		/* int: panic timeout */
 	KERN_REALROOTDEV=16,	/* real root device to mount after initrd */
 
@@ -965,8 +964,6 @@ extern int proc_dostring(struct ctl_table *, int, struct file *,
 			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec(struct ctl_table *, int, struct file *,
 			 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_bset(struct ctl_table *, int, struct file *,
-			      void __user *, size_t *, loff_t *);
 extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *,
 				void __user *, size_t *, loff_t *);
 extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *,
diff --git a/kernel/fork.c b/kernel/fork.c
index 1160f87..2b55b74 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1118,6 +1118,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_SECURITY
 	p->security = NULL;
 #endif
+	p->cap_bset = current->cap_bset;
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index d1fe71e..4162d12 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1637,7 +1637,7 @@ asmlinkage long sys_umask(int mask)
 	mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
 	return mask;
 }
-    
+
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
@@ -1742,6 +1742,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = prctl_set_seccomp(arg2);
 			break;
 
+		case PR_CAPBSET_READ:
+			if (!cap_valid(arg2))
+				return -EINVAL;
+			return !!cap_raised(current->cap_bset, arg2);
+		case PR_CAPBSET_DROP:
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+			return cap_prctl_drop(arg2);
+#else
+			return -EINVAL;
+#endif
+
 		default:
 			error = -EINVAL;
 			break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d0b47b8..5e2ad5b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -419,15 +419,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SECURITY_CAPABILITIES
-	{
-		.procname	= "cap-bound",
-		.data		= &cap_bset,
-		.maxlen		= sizeof(kernel_cap_t),
-		.mode		= 0600,
-		.proc_handler	= &proc_dointvec_bset,
-	},
-#endif /* def CONFIG_SECURITY_CAPABILITIES */
 #ifdef CONFIG_BLK_DEV_INITRD
 	{
 		.ctl_name	= KERN_REALROOTDEV,
@@ -2096,26 +2087,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
 	return 0;
 }
 
-#ifdef CONFIG_SECURITY_CAPABILITIES
-/*
- *	init may raise the set.
- */
-
-int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int op;
-
-	if (write && !capable(CAP_SYS_MODULE)) {
-		return -EPERM;
-	}
-
-	op = is_global_init(current) ? OP_SET : OP_AND;
-	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
-				do_proc_dointvec_bset_conv,&op);
-}
-#endif /* def CONFIG_SECURITY_CAPABILITIES */
-
 /*
  *	Taint values can only be increased
  */
@@ -2529,12 +2500,6 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 	return -ENOSYS;
 }
 
-int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
 int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c3206fa..006365b 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -37,10 +37,6 @@ static struct trans_ctl_table trans_kern_table[] = {
 	{ KERN_NODENAME,		"hostname" },
 	{ KERN_DOMAINNAME,		"domainname" },
 
-#ifdef CONFIG_SECURITY_CAPABILITIES
-	{ KERN_CAP_BSET,		"cap-bound" },
-#endif /* def CONFIG_SECURITY_CAPABILITIES */
-
 	{ KERN_PANIC,			"panic" },
 	{ KERN_REALROOTDEV,		"real-root-dev" },
 
@@ -1498,9 +1494,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 			    (table->strategy == sysctl_ms_jiffies) ||
 			    (table->proc_handler == proc_dostring) ||
 			    (table->proc_handler == proc_dointvec) ||
-#ifdef CONFIG_SECURITY_CAPABILITIES
-			    (table->proc_handler == proc_dointvec_bset) ||
-#endif /* def CONFIG_SECURITY_CAPABILITIES */
 			    (table->proc_handler == proc_dointvec_minmax) ||
 			    (table->proc_handler == proc_dointvec_jiffies) ||
 			    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
diff --git a/security/commoncap.c b/security/commoncap.c
index 01ab478..5aba826 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -25,20 +25,6 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
-/*
- * Because of the reduced scope of CAP_SETPCAP when filesystem
- * capabilities are in effect, it is safe to allow this capability to
- * be available in the default configuration.
- */
-# define CAP_INIT_BSET  CAP_FULL_SET
-#else /* ie. ndef CONFIG_SECURITY_FILE_CAPABILITIES */
-# define CAP_INIT_BSET  CAP_INIT_EFF_SET
-#endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
-
-kernel_cap_t cap_bset = CAP_INIT_BSET;    /* systemwide capability bound */
-EXPORT_SYMBOL(cap_bset);
-
 /* Global security state */
 
 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
@@ -140,6 +126,12 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective,
 		/* incapable of using this inheritable set */
 		return -EPERM;
 	}
+	if (!cap_issubset(*inheritable,
+			   cap_combine(target->cap_inheritable,
+				       current->cap_bset))) {
+		/* no new pI capabilities outside bounding set */
+		return -EPERM;
+	}
 
 	/* verify restrictions on target's new Permitted set */
 	if (!cap_issubset (*permitted,
@@ -337,10 +329,11 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	/* Derived from fs/exec.c:compute_creds. */
 	kernel_cap_t new_permitted, working;
 
-	new_permitted = cap_intersect (bprm->cap_permitted, cap_bset);
-	working = cap_intersect (bprm->cap_inheritable,
+	new_permitted = cap_intersect(bprm->cap_permitted,
+				 current->cap_bset);
+	working = cap_intersect(bprm->cap_inheritable,
 				 current->cap_inheritable);
-	new_permitted = cap_combine (new_permitted, working);
+	new_permitted = cap_combine(new_permitted, working);
 
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
 	    !cap_issubset (new_permitted, current->cap_permitted)) {
@@ -581,6 +574,23 @@ int cap_task_kill(struct task_struct *p, struct siginfo *info,
 
 	return -EPERM;
 }
+
+/*
+ * called from kernel/sys.c for prctl(PR_CABSET_DROP)
+ * done without task_capability_lock() because it introduces
+ * no new races - i.e. only another task doing capget() on
+ * this task could get inconsistent info.  There can be no
+ * racing writer bc a task can only change its own caps.
+ */
+long cap_prctl_drop(unsigned long cap)
+{
+	if (!capable(CAP_SETPCAP))
+		return -EPERM;
+	if (!cap_valid(cap))
+		return -EINVAL;
+	cap_lower(current->cap_bset, cap);
+	return 0;
+}
 #else
 int cap_task_setscheduler (struct task_struct *p, int policy,
 			   struct sched_param *lp)
-- 
cgit v1.1


From 97829955ad291acec1d8b94e9911b3ceb1118bb1 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 4 Feb 2008 22:29:47 -0800
Subject: oom_kill: remove uid==0 checks

Root processes are considered more important when out of memory and killing
proceses.  The check for CAP_SYS_ADMIN was augmented with a check for
uid==0 or euid==0.

There are several possible ways to look at this:

	1. uid comparisons are unnecessary, trust CAP_SYS_ADMIN
	   alone.  However CAP_SYS_RESOURCE is the one that really
	   means "give me extra resources" so allow for that as
	   well.
	2. Any privileged code should be protected, but uid is not
	   an indication of privilege.  So we should check whether
	   any capabilities are raised.
	3. uid==0 makes processes on the host as well as in containers
	   more important, so we should keep the existing checks.
	4. uid==0 makes processes only on the host more important,
	   even without any capabilities.  So we should be keeping
	   the (uid==0||euid==0) check but only when
	   userns==&init_user_ns.

I'm following number 1 here.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Cc: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 320d74e..c1850bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -125,7 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * Superuser processes are usually more important, so we make it
 	 * less likely that we kill those.
 	 */
-	if (__capable(p, CAP_SYS_ADMIN) || p->uid == 0 || p->euid == 0)
+	if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
 		points /= 4;
 
 	/*
-- 
cgit v1.1


From eda61d32e8ad1d9102872f9a0abf3344bf9c5e67 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 4 Feb 2008 22:29:47 -0800
Subject: NetLabel: introduce a new kernel configuration API for NetLabel

Add a new set of configuration functions to the NetLabel/LSM API so that
LSMs can perform their own configuration of the NetLabel subsystem without
relying on assistance from userspace.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: James Morris <jmorris@namei.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/netlabel.h             |  47 ++++++++--
 net/ipv4/cipso_ipv4.c              |   4 +-
 net/netlabel/netlabel_cipso_v4.c   |   2 +-
 net/netlabel/netlabel_cipso_v4.h   |   3 +
 net/netlabel/netlabel_domainhash.h |   1 +
 net/netlabel/netlabel_kapi.c       | 177 +++++++++++++++++++++++++++++++++++++
 6 files changed, 225 insertions(+), 9 deletions(-)

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index b3213c7..0ca67d7 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -36,6 +36,8 @@
 #include <net/netlink.h>
 #include <asm/atomic.h>
 
+struct cipso_v4_doi;
+
 /*
  * NetLabel - A management interface for maintaining network packet label
  *            mapping tables for explicit packet labling protocols.
@@ -103,12 +105,6 @@ struct netlbl_audit {
 	uid_t loginuid;
 };
 
-/* Domain mapping definition struct */
-struct netlbl_dom_map;
-
-/* Domain mapping operations */
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
-
 /*
  * LSM security attributes
  */
@@ -344,6 +340,19 @@ static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr)
 
 #ifdef CONFIG_NETLABEL
 /*
+ * LSM configuration operations
+ */
+int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info);
+int netlbl_cfg_unlbl_add_map(const char *domain,
+			     struct netlbl_audit *audit_info);
+int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
+			   struct netlbl_audit *audit_info);
+int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
+			       const char *domain,
+			       struct netlbl_audit *audit_info);
+int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info);
+
+/*
  * LSM security attribute operations
  */
 int netlbl_secattr_catmap_walk(struct netlbl_lsm_secattr_catmap *catmap,
@@ -378,6 +387,32 @@ void netlbl_cache_invalidate(void);
 int netlbl_cache_add(const struct sk_buff *skb,
 		     const struct netlbl_lsm_secattr *secattr);
 #else
+static inline int netlbl_cfg_map_del(const char *domain,
+				     struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
+static inline int netlbl_cfg_unlbl_add_map(const char *domain,
+					   struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
+static inline int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
+					 struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
+static inline int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
+					     const char *domain,
+					     struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
+static inline int netlbl_cfg_cipsov4_del(u32 doi,
+					 struct netlbl_audit *audit_info)
+{
+	return -ENOSYS;
+}
 static inline int netlbl_secattr_catmap_walk(
 	                              struct netlbl_lsm_secattr_catmap *catmap,
 				      u32 offset)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a224106..8cd357f 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -547,8 +547,8 @@ int cipso_v4_doi_remove(u32 doi,
 		rcu_read_lock();
 		list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
 			if (dom_iter->valid)
-				netlbl_domhsh_remove(dom_iter->domain,
-						     audit_info);
+				netlbl_cfg_map_del(dom_iter->domain,
+						   audit_info);
 		rcu_read_unlock();
 		cipso_v4_cache_invalidate();
 		call_rcu(&doi_def->rcu, callback);
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index becf91a..c7ad64d 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -90,7 +90,7 @@ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1
  * safely.
  *
  */
-static void netlbl_cipsov4_doi_free(struct rcu_head *entry)
+void netlbl_cipsov4_doi_free(struct rcu_head *entry)
 {
 	struct cipso_v4_doi *ptr;
 
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index f03cf9b..220cb9d 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -163,4 +163,7 @@ enum {
 /* NetLabel protocol functions */
 int netlbl_cipsov4_genl_init(void);
 
+/* Free the memory associated with a CIPSOv4 DOI definition */
+void netlbl_cipsov4_doi_free(struct rcu_head *entry);
+
 #endif
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 3689956..8220990 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -61,6 +61,7 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
 		      struct netlbl_audit *audit_info);
 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
 			      struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
 int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
 int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index c69e3e1..39793a1 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -30,6 +30,7 @@
 
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/audit.h>
 #include <net/ip.h>
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
@@ -38,10 +39,186 @@
 
 #include "netlabel_domainhash.h"
 #include "netlabel_unlabeled.h"
+#include "netlabel_cipso_v4.h"
 #include "netlabel_user.h"
 #include "netlabel_mgmt.h"
 
 /*
+ * Configuration Functions
+ */
+
+/**
+ * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
+ * @domain: the domain mapping to remove
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a NetLabel/LSM domain mapping.  A @domain value of NULL causes the
+ * default domain mapping to be removed.  Returns zero on success, negative
+ * values on failure.
+ *
+ */
+int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info)
+{
+	return netlbl_domhsh_remove(domain, audit_info);
+}
+
+/**
+ * netlbl_cfg_unlbl_add_map - Add an unlabeled NetLabel/LSM domain mapping
+ * @domain: the domain mapping to add
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new unlabeled NetLabel/LSM domain mapping.  A @domain value of NULL
+ * causes a new default domain mapping to be added.  Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_add_map(const char *domain,
+			     struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_dom_map *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		goto cfg_unlbl_add_map_failure;
+	if (domain != NULL) {
+		entry->domain = kstrdup(domain, GFP_ATOMIC);
+		if (entry->domain == NULL)
+			goto cfg_unlbl_add_map_failure;
+	}
+	entry->type = NETLBL_NLTYPE_UNLABELED;
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val != 0)
+		goto cfg_unlbl_add_map_failure;
+
+	return 0;
+
+cfg_unlbl_add_map_failure:
+	if (entry != NULL)
+		kfree(entry->domain);
+	kfree(entry);
+	return ret_val;
+}
+
+/**
+ * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
+ * @doi_def: the DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CIPSOv4 DOI definition to the NetLabel subsystem.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
+			   struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	const char *type_str;
+	struct audit_buffer *audit_buf;
+
+	ret_val = cipso_v4_doi_add(doi_def);
+
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
+					      audit_info);
+	if (audit_buf != NULL) {
+		switch (doi_def->type) {
+		case CIPSO_V4_MAP_STD:
+			type_str = "std";
+			break;
+		case CIPSO_V4_MAP_PASS:
+			type_str = "pass";
+			break;
+		default:
+			type_str = "(unknown)";
+		}
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u cipso_type=%s res=%u",
+				 doi_def->doi,
+				 type_str,
+				 ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * netlbl_cfg_cipsov4_add_map - Add a new CIPSOv4 DOI definition and mapping
+ * @doi_def: the DOI definition
+ * @domain: the domain mapping to add
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CIPSOv4 DOI definition and NetLabel/LSM domain mapping for this
+ * new DOI definition to the NetLabel subsystem.  A @domain value of NULL adds
+ * a new default domain mapping.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+int netlbl_cfg_cipsov4_add_map(struct cipso_v4_doi *doi_def,
+			       const char *domain,
+			       struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_dom_map *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		goto cfg_cipsov4_add_map_failure;
+	if (domain != NULL) {
+		entry->domain = kstrdup(domain, GFP_ATOMIC);
+		if (entry->domain == NULL)
+			goto cfg_cipsov4_add_map_failure;
+	}
+	entry->type = NETLBL_NLTYPE_CIPSOV4;
+	entry->type_def.cipsov4 = doi_def;
+
+	/* Grab a RCU read lock here so nothing happens to the doi_def variable
+	 * between adding it to the CIPSOv4 protocol engine and adding a
+	 * domain mapping for it. */
+
+	rcu_read_lock();
+	ret_val = netlbl_cfg_cipsov4_add(doi_def, audit_info);
+	if (ret_val != 0)
+		goto cfg_cipsov4_add_map_failure_unlock;
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val != 0)
+		goto cfg_cipsov4_add_map_failure_remove_doi;
+	rcu_read_unlock();
+
+	return 0;
+
+cfg_cipsov4_add_map_failure_remove_doi:
+	cipso_v4_doi_remove(doi_def->doi, audit_info, netlbl_cipsov4_doi_free);
+cfg_cipsov4_add_map_failure_unlock:
+	rcu_read_unlock();
+cfg_cipsov4_add_map_failure:
+	if (entry != NULL)
+		kfree(entry->domain);
+	kfree(entry);
+	return ret_val;
+}
+
+/**
+ * netlbl_cfg_cipsov4_del - Removean existing CIPSOv4 DOI definition
+ * @doi: the CIPSO DOI value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing CIPSOv4 DOI definition from the NetLabel subsystem.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
+{
+	return cipso_v4_doi_remove(doi, audit_info, netlbl_cipsov4_doi_free);
+}
+
+/*
  * Security Attribute Functions
  */
 
-- 
cgit v1.1


From e114e473771c848c3cfec05f0123e70f1cdbdc99 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Mon, 4 Feb 2008 22:29:50 -0800
Subject: Smack: Simplified Mandatory Access Control Kernel

Smack is the Simplified Mandatory Access Control Kernel.

Smack implements mandatory access control (MAC) using labels
attached to tasks and data containers, including files, SVIPC,
and other tasks. Smack is a kernel based scheme that requires
an absolute minimum of application support and a very small
amount of configuration data.

Smack uses extended attributes and
provides a set of general mount options, borrowing technics used
elsewhere. Smack uses netlabel for CIPSO labeling. Smack provides
a pseudo-filesystem smackfs that is used for manipulation of
system Smack attributes.

The patch, patches for ls and sshd, a README, a startup script,
and x86 binaries for ls and sshd are also available on

    http://www.schaufler-ca.com

Development has been done using Fedora Core 7 in a virtual machine
environment and on an old Sony laptop.

Smack provides mandatory access controls based on the label attached
to a task and the label attached to the object it is attempting to
access. Smack labels are deliberately short (1-23 characters) text
strings. Single character labels using special characters are reserved
for system use. The only operation applied to Smack labels is equality
comparison. No wildcards or expressions, regular or otherwise, are
used. Smack labels are composed of printable characters and may not
include "/".

A file always gets the Smack label of the task that created it.

Smack defines and uses these labels:

    "*" - pronounced "star"
    "_" - pronounced "floor"
    "^" - pronounced "hat"
    "?" - pronounced "huh"

The access rules enforced by Smack are, in order:

1. Any access requested by a task labeled "*" is denied.
2. A read or execute access requested by a task labeled "^"
   is permitted.
3. A read or execute access requested on an object labeled "_"
   is permitted.
4. Any access requested on an object labeled "*" is permitted.
5. Any access requested by a task on an object with the same
   label is permitted.
6. Any access requested that is explicitly defined in the loaded
   rule set is permitted.
7. Any other access is denied.

Rules may be explicitly defined by writing subject,object,access
triples to /smack/load.

Smack rule sets can be easily defined that describe Bell&LaPadula
sensitivity, Biba integrity, and a variety of interesting
configurations. Smack rule sets can be modified on the fly to
accommodate changes in the operating environment or even the time
of day.

Some practical use cases:

Hierarchical levels. The less common of the two usual uses
for MLS systems is to define hierarchical levels, often
unclassified, confidential, secret, and so on. To set up smack
to support this, these rules could be defined:

   C        Unclass rx
   S        C       rx
   S        Unclass rx
   TS       S       rx
   TS       C       rx
   TS       Unclass rx

A TS process can read S, C, and Unclass data, but cannot write it.
An S process can read C and Unclass. Note that specifying that
TS can read S and S can read C does not imply TS can read C, it
has to be explicitly stated.

Non-hierarchical categories. This is the more common of the
usual uses for an MLS system. Since the default rule is that a
subject cannot access an object with a different label no
access rules are required to implement compartmentalization.

A case that the Bell & LaPadula policy does not allow is demonstrated
with this Smack access rule:

A case that Bell&LaPadula does not allow that Smack does:

    ESPN    ABC   r
    ABC     ESPN  r

On my portable video device I have two applications, one that
shows ABC programming and the other ESPN programming. ESPN wants
to show me sport stories that show up as news, and ABC will
only provide minimal information about a sports story if ESPN
is covering it. Each side can look at the other's info, neither
can change the other. Neither can see what FOX is up to, which
is just as well all things considered.

Another case that I especially like:

    SatData Guard   w
    Guard   Publish w

A program running with the Guard label opens a UDP socket and
accepts messages sent by a program running with a SatData label.
The Guard program inspects the message to ensure it is wholesome
and if it is sends it to a program running with the Publish label.
This program then puts the information passed in an appropriate
place. Note that the Guard program cannot write to a Publish
file system object because file system semanitic require read as
well as write.

The four cases (categories, levels, mutual read, guardbox) here
are all quite real, and problems I've been asked to solve over
the years. The first two are easy to do with traditonal MLS systems
while the last two you can't without invoking privilege, at least
for a while.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Cc: Joshua Brindle <method@manicmethod.com>
Cc: Paul Moore <paul.moore@hp.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: James Morris <jmorris@namei.org>
Cc: "Ahmed S. Darwish" <darwish.07@gmail.com>
Cc: Andrew G. Morgan <morgan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/Smack.txt       |  493 ++++++++
 include/linux/capability.h    |   26 +-
 security/Kconfig              |    1 +
 security/Makefile             |    2 +
 security/smack/Kconfig        |   10 +
 security/smack/Makefile       |    7 +
 security/smack/smack.h        |  220 ++++
 security/smack/smack_access.c |  356 ++++++
 security/smack/smack_lsm.c    | 2518 +++++++++++++++++++++++++++++++++++++++++
 security/smack/smackfs.c      |  981 ++++++++++++++++
 10 files changed, 4611 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/Smack.txt
 create mode 100644 security/smack/Kconfig
 create mode 100644 security/smack/Makefile
 create mode 100644 security/smack/smack.h
 create mode 100644 security/smack/smack_access.c
 create mode 100644 security/smack/smack_lsm.c
 create mode 100644 security/smack/smackfs.c

diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
new file mode 100644
index 0000000..989c2fc
--- /dev/null
+++ b/Documentation/Smack.txt
@@ -0,0 +1,493 @@
+
+
+    "Good for you, you've decided to clean the elevator!"
+    - The Elevator, from Dark Star
+
+Smack is the the Simplified Mandatory Access Control Kernel.
+Smack is a kernel based implementation of mandatory access
+control that includes simplicity in its primary design goals.
+
+Smack is not the only Mandatory Access Control scheme
+available for Linux. Those new to Mandatory Access Control
+are encouraged to compare Smack with the other mechanisms
+available to determine which is best suited to the problem
+at hand.
+
+Smack consists of three major components:
+    - The kernel
+    - A start-up script and a few modified applications
+    - Configuration data
+
+The kernel component of Smack is implemented as a Linux
+Security Modules (LSM) module. It requires netlabel and
+works best with file systems that support extended attributes,
+although xattr support is not strictly required.
+It is safe to run a Smack kernel under a "vanilla" distribution.
+Smack kernels use the CIPSO IP option. Some network
+configurations are intolerant of IP options and can impede
+access to systems that use them as Smack does.
+
+The startup script etc-init.d-smack should be installed
+in /etc/init.d/smack and should be invoked early in the
+start-up process. On Fedora rc5.d/S02smack is recommended.
+This script ensures that certain devices have the correct
+Smack attributes and loads the Smack configuration if
+any is defined. This script invokes two programs that
+ensure configuration data is properly formatted. These
+programs are /usr/sbin/smackload and /usr/sin/smackcipso.
+The system will run just fine without these programs,
+but it will be difficult to set access rules properly.
+
+A version of "ls" that provides a "-M" option to display
+Smack labels on long listing is available.
+
+A hacked version of sshd that allows network logins by users
+with specific Smack labels is available. This version does
+not work for scp. You must set the /etc/ssh/sshd_config
+line:
+   UsePrivilegeSeparation no
+
+The format of /etc/smack/usr is:
+
+   username smack
+
+In keeping with the intent of Smack, configuration data is
+minimal and not strictly required. The most important
+configuration step is mounting the smackfs pseudo filesystem.
+
+Add this line to /etc/fstab:
+
+    smackfs /smack smackfs smackfsdef=* 0 0
+
+and create the /smack directory for mounting.
+
+Smack uses extended attributes (xattrs) to store file labels.
+The command to set a Smack label on a file is:
+
+    # attr -S -s SMACK64 -V "value" path
+
+NOTE: Smack labels are limited to 23 characters. The attr command
+      does not enforce this restriction and can be used to set
+      invalid Smack labels on files.
+
+If you don't do anything special all users will get the floor ("_")
+label when they log in. If you do want to log in via the hacked ssh
+at other labels use the attr command to set the smack value on the
+home directory and it's contents.
+
+You can add access rules in /etc/smack/accesses. They take the form:
+
+    subjectlabel objectlabel access
+
+access is a combination of the letters rwxa which specify the
+kind of access permitted a subject with subjectlabel on an
+object with objectlabel. If there is no rule no access is allowed.
+
+A process can see the smack label it is running with by
+reading /proc/self/attr/current. A privileged process can
+set the process smack by writing there.
+
+Look for additional programs on http://schaufler-ca.com
+
+From the Smack Whitepaper:
+
+The Simplified Mandatory Access Control Kernel
+
+Casey Schaufler
+casey@schaufler-ca.com
+
+Mandatory Access Control
+
+Computer systems employ a variety of schemes to constrain how information is
+shared among the people and services using the machine. Some of these schemes
+allow the program or user to decide what other programs or users are allowed
+access to pieces of data. These schemes are called discretionary access
+control mechanisms because the access control is specified at the discretion
+of the user. Other schemes do not leave the decision regarding what a user or
+program can access up to users or programs. These schemes are called mandatory
+access control mechanisms because you don't have a choice regarding the users
+or programs that have access to pieces of data.
+
+Bell & LaPadula
+
+From the middle of the 1980's until the turn of the century Mandatory Access
+Control (MAC) was very closely associated with the Bell & LaPadula security
+model, a mathematical description of the United States Department of Defense
+policy for marking paper documents. MAC in this form enjoyed a following
+within the Capital Beltway and Scandinavian supercomputer centers but was
+often sited as failing to address general needs.
+
+Domain Type Enforcement
+
+Around the turn of the century Domain Type Enforcement (DTE) became popular.
+This scheme organizes users, programs, and data into domains that are
+protected from each other. This scheme has been widely deployed as a component
+of popular Linux distributions. The administrative overhead required to
+maintain this scheme and the detailed understanding of the whole system
+necessary to provide a secure domain mapping leads to the scheme being
+disabled or used in limited ways in the majority of cases.
+
+Smack
+
+Smack is a Mandatory Access Control mechanism designed to provide useful MAC
+while avoiding the pitfalls of its predecessors. The limitations of Bell &
+LaPadula are addressed by providing a scheme whereby access can be controlled
+according to the requirements of the system and its purpose rather than those
+imposed by an arcane government policy. The complexity of Domain Type
+Enforcement and avoided by defining access controls in terms of the access
+modes already in use.
+
+Smack Terminology
+
+The jargon used to talk about Smack will be familiar to those who have dealt
+with other MAC systems and shouldn't be too difficult for the uninitiated to
+pick up. There are four terms that are used in a specific way and that are
+especially important:
+
+	Subject: A subject is an active entity on the computer system.
+	On Smack a subject is a task, which is in turn the basic unit
+	of execution.
+
+	Object: An object is a passive entity on the computer system.
+	On Smack files of all types, IPC, and tasks can be objects.
+
+	Access: Any attempt by a subject to put information into or get
+	information from an object is an access.
+
+	Label: Data that identifies the Mandatory Access Control
+	characteristics of a subject or an object.
+
+These definitions are consistent with the traditional use in the security
+community. There are also some terms from Linux that are likely to crop up:
+
+	Capability: A task that possesses a capability has permission to
+	violate an aspect of the system security policy, as identified by
+	the specific capability. A task that possesses one or more
+	capabilities is a privileged task, whereas a task with no
+	capabilities is an unprivileged task.
+
+	Privilege: A task that is allowed to violate the system security
+	policy is said to have privilege. As of this writing a task can
+	have privilege either by possessing capabilities or by having an
+	effective user of root.
+
+Smack Basics
+
+Smack is an extension to a Linux system. It enforces additional restrictions
+on what subjects can access which objects, based on the labels attached to
+each of the subject and the object.
+
+Labels
+
+Smack labels are ASCII character strings, one to twenty-three characters in
+length. Single character labels using special characters, that being anything
+other than a letter or digit, are reserved for use by the Smack development
+team. Smack labels are unstructured, case sensitive, and the only operation
+ever performed on them is comparison for equality. Smack labels cannot
+contain unprintable characters or the "/" (slash) character.
+
+There are some predefined labels:
+
+	_ Pronounced "floor", a single underscore character.
+	^ Pronounced "hat", a single circumflex character.
+	* Pronounced "star", a single asterisk character.
+	? Pronounced "huh", a single question mark character.
+
+Every task on a Smack system is assigned a label. System tasks, such as
+init(8) and systems daemons, are run with the floor ("_") label. User tasks
+are assigned labels according to the specification found in the
+/etc/smack/user configuration file.
+
+Access Rules
+
+Smack uses the traditional access modes of Linux. These modes are read,
+execute, write, and occasionally append. There are a few cases where the
+access mode may not be obvious. These include:
+
+	Signals: A signal is a write operation from the subject task to
+	the object task.
+	Internet Domain IPC: Transmission of a packet is considered a
+	write operation from the source task to the destination task.
+
+Smack restricts access based on the label attached to a subject and the label
+attached to the object it is trying to access. The rules enforced are, in
+order:
+
+	1. Any access requested by a task labeled "*" is denied.
+	2. A read or execute access requested by a task labeled "^"
+	   is permitted.
+	3. A read or execute access requested on an object labeled "_"
+	   is permitted.
+	4. Any access requested on an object labeled "*" is permitted.
+	5. Any access requested by a task on an object with the same
+	   label is permitted.
+	6. Any access requested that is explicitly defined in the loaded
+	   rule set is permitted.
+	7. Any other access is denied.
+
+Smack Access Rules
+
+With the isolation provided by Smack access separation is simple. There are
+many interesting cases where limited access by subjects to objects with
+different labels is desired. One example is the familiar spy model of
+sensitivity, where a scientist working on a highly classified project would be
+able to read documents of lower classifications and anything she writes will
+be "born" highly classified. To accommodate such schemes Smack includes a
+mechanism for specifying rules allowing access between labels.
+
+Access Rule Format
+
+The format of an access rule is:
+
+	subject-label object-label access
+
+Where subject-label is the Smack label of the task, object-label is the Smack
+label of the thing being accessed, and access is a string specifying the sort
+of access allowed. The Smack labels are limited to 23 characters. The access
+specification is searched for letters that describe access modes:
+
+	a: indicates that append access should be granted.
+	r: indicates that read access should be granted.
+	w: indicates that write access should be granted.
+	x: indicates that execute access should be granted.
+
+Uppercase values for the specification letters are allowed as well.
+Access mode specifications can be in any order. Examples of acceptable rules
+are:
+
+	TopSecret Secret  rx
+	Secret    Unclass R
+	Manager   Game    x
+	User      HR      w
+	New       Old     rRrRr
+	Closed    Off     -
+
+Examples of unacceptable rules are:
+
+	Top Secret Secret     rx
+	Ace        Ace        r
+	Odd        spells     waxbeans
+
+Spaces are not allowed in labels. Since a subject always has access to files
+with the same label specifying a rule for that case is pointless. Only
+valid letters (rwxaRWXA) and the dash ('-') character are allowed in
+access specifications. The dash is a placeholder, so "a-r" is the same
+as "ar". A lone dash is used to specify that no access should be allowed.
+
+Applying Access Rules
+
+The developers of Linux rarely define new sorts of things, usually importing
+schemes and concepts from other systems. Most often, the other systems are
+variants of Unix. Unix has many endearing properties, but consistency of
+access control models is not one of them. Smack strives to treat accesses as
+uniformly as is sensible while keeping with the spirit of the underlying
+mechanism.
+
+File system objects including files, directories, named pipes, symbolic links,
+and devices require access permissions that closely match those used by mode
+bit access. To open a file for reading read access is required on the file. To
+search a directory requires execute access. Creating a file with write access
+requires both read and write access on the containing directory. Deleting a
+file requires read and write access to the file and to the containing
+directory. It is possible that a user may be able to see that a file exists
+but not any of its attributes by the circumstance of having read access to the
+containing directory but not to the differently labeled file. This is an
+artifact of the file name being data in the directory, not a part of the file.
+
+IPC objects, message queues, semaphore sets, and memory segments exist in flat
+namespaces and access requests are only required to match the object in
+question.
+
+Process objects reflect tasks on the system and the Smack label used to access
+them is the same Smack label that the task would use for its own access
+attempts. Sending a signal via the kill() system call is a write operation
+from the signaler to the recipient. Debugging a process requires both reading
+and writing. Creating a new task is an internal operation that results in two
+tasks with identical Smack labels and requires no access checks.
+
+Sockets are data structures attached to processes and sending a packet from
+one process to another requires that the sender have write access to the
+receiver. The receiver is not required to have read access to the sender.
+
+Setting Access Rules
+
+The configuration file /etc/smack/accesses contains the rules to be set at
+system startup. The contents are written to the special file /smack/load.
+Rules can be written to /smack/load at any time and take effect immediately.
+For any pair of subject and object labels there can be only one rule, with the
+most recently specified overriding any earlier specification.
+
+The program smackload is provided to ensure data is formatted
+properly when written to /smack/load. This program reads lines
+of the form
+
+    subjectlabel objectlabel mode.
+
+Task Attribute
+
+The Smack label of a process can be read from /proc/<pid>/attr/current. A
+process can read its own Smack label from /proc/self/attr/current. A
+privileged process can change its own Smack label by writing to
+/proc/self/attr/current but not the label of another process.
+
+File Attribute
+
+The Smack label of a filesystem object is stored as an extended attribute
+named SMACK64 on the file. This attribute is in the security namespace. It can
+only be changed by a process with privilege.
+
+Privilege
+
+A process with CAP_MAC_OVERRIDE is privileged.
+
+Smack Networking
+
+As mentioned before, Smack enforces access control on network protocol
+transmissions. Every packet sent by a Smack process is tagged with its Smack
+label. This is done by adding a CIPSO tag to the header of the IP packet. Each
+packet received is expected to have a CIPSO tag that identifies the label and
+if it lacks such a tag the network ambient label is assumed. Before the packet
+is delivered a check is made to determine that a subject with the label on the
+packet has write access to the receiving process and if that is not the case
+the packet is dropped.
+
+CIPSO Configuration
+
+It is normally unnecessary to specify the CIPSO configuration. The default
+values used by the system handle all internal cases. Smack will compose CIPSO
+label values to match the Smack labels being used without administrative
+intervention. Unlabeled packets that come into the system will be given the
+ambient label.
+
+Smack requires configuration in the case where packets from a system that is
+not smack that speaks CIPSO may be encountered. Usually this will be a Trusted
+Solaris system, but there are other, less widely deployed systems out there.
+CIPSO provides 3 important values, a Domain Of Interpretation (DOI), a level,
+and a category set with each packet. The DOI is intended to identify a group
+of systems that use compatible labeling schemes, and the DOI specified on the
+smack system must match that of the remote system or packets will be
+discarded. The DOI is 3 by default. The value can be read from /smack/doi and
+can be changed by writing to /smack/doi.
+
+The label and category set are mapped to a Smack label as defined in
+/etc/smack/cipso.
+
+A Smack/CIPSO mapping has the form:
+
+	smack level [category [category]*]
+
+Smack does not expect the level or category sets to be related in any
+particular way and does not assume or assign accesses based on them. Some
+examples of mappings:
+
+	TopSecret 7
+	TS:A,B    7 1 2
+	SecBDE    5 2 4 6
+	RAFTERS   7 12 26
+
+The ":" and "," characters are permitted in a Smack label but have no special
+meaning.
+
+The mapping of Smack labels to CIPSO values is defined by writing to
+/smack/cipso. Again, the format of data written to this special file
+is highly restrictive, so the program smackcipso is provided to
+ensure the writes are done properly. This program takes mappings
+on the standard input and sends them to /smack/cipso properly.
+
+In addition to explicit mappings Smack supports direct CIPSO mappings. One
+CIPSO level is used to indicate that the category set passed in the packet is
+in fact an encoding of the Smack label. The level used is 250 by default. The
+value can be read from /smack/direct and changed by writing to /smack/direct.
+
+Socket Attributes
+
+There are two attributes that are associated with sockets. These attributes
+can only be set by privileged tasks, but any task can read them for their own
+sockets.
+
+	SMACK64IPIN: The Smack label of the task object. A privileged
+	program that will enforce policy may set this to the star label.
+
+	SMACK64IPOUT: The Smack label transmitted with outgoing packets.
+	A privileged program may set this to match the label of another
+	task with which it hopes to communicate.
+
+Writing Applications for Smack
+
+There are three sorts of applications that will run on a Smack system. How an
+application interacts with Smack will determine what it will have to do to
+work properly under Smack.
+
+Smack Ignorant Applications
+
+By far the majority of applications have no reason whatever to care about the
+unique properties of Smack. Since invoking a program has no impact on the
+Smack label associated with the process the only concern likely to arise is
+whether the process has execute access to the program.
+
+Smack Relevant Applications
+
+Some programs can be improved by teaching them about Smack, but do not make
+any security decisions themselves. The utility ls(1) is one example of such a
+program.
+
+Smack Enforcing Applications
+
+These are special programs that not only know about Smack, but participate in
+the enforcement of system policy. In most cases these are the programs that
+set up user sessions. There are also network services that provide information
+to processes running with various labels.
+
+File System Interfaces
+
+Smack maintains labels on file system objects using extended attributes. The
+Smack label of a file, directory, or other file system object can be obtained
+using getxattr(2).
+
+	len = getxattr("/", "security.SMACK64", value, sizeof (value));
+
+will put the Smack label of the root directory into value. A privileged
+process can set the Smack label of a file system object with setxattr(2).
+
+	len = strlen("Rubble");
+	rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
+
+will set the Smack label of /foo to "Rubble" if the program has appropriate
+privilege.
+
+Socket Interfaces
+
+The socket attributes can be read using fgetxattr(2).
+
+A privileged process can set the Smack label of outgoing packets with
+fsetxattr(2).
+
+	len = strlen("Rubble");
+	rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
+
+will set the Smack label "Rubble" on packets going out from the socket if the
+program has appropriate privilege.
+
+	rc = fsetxattr(fd, "security.SMACK64IPIN, "*", strlen("*"), 0);
+
+will set the Smack label "*" as the object label against which incoming
+packets will be checked if the program has appropriate privilege.
+
+Administration
+
+Smack supports some mount options:
+
+	smackfsdef=label: specifies the label to give files that lack
+	the Smack label extended attribute.
+
+	smackfsroot=label: specifies the label to assign the root of the
+	file system if it lacks the Smack extended attribute.
+
+	smackfshat=label: specifies a label that must have read access to
+	all labels set on the filesystem. Not yet enforced.
+
+	smackfsfloor=label: specifies a label to which all labels set on the
+	filesystem must have read access. Not yet enforced.
+
+These mount options apply to all file system types.
+
diff --git a/include/linux/capability.h b/include/linux/capability.h
index ffe7bab8..7d50ff6 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -315,7 +315,24 @@ typedef struct kernel_cap_struct {
 
 #define CAP_SETFCAP	     31
 
-#define CAP_LAST_CAP         CAP_SETFCAP
+/* Override MAC access.
+   The base kernel enforces no MAC policy.
+   An LSM may enforce a MAC policy, and if it does and it chooses
+   to implement capability based overrides of that policy, this is
+   the capability it should use to do so. */
+
+#define CAP_MAC_OVERRIDE     32
+
+/* Allow MAC configuration or state changes.
+   The base kernel requires no MAC configuration.
+   An LSM may enforce a MAC policy, and if it does and it chooses
+   to implement capability based checks on modifications to that
+   policy or the data required to maintain it, this is the
+   capability it should use to do so. */
+
+#define CAP_MAC_ADMIN        33
+
+#define CAP_LAST_CAP         CAP_MAC_ADMIN
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
@@ -341,6 +358,8 @@ typedef struct kernel_cap_struct {
 			    | CAP_TO_MASK(CAP_FOWNER)		\
 			    | CAP_TO_MASK(CAP_FSETID))
 
+# define CAP_FS_MASK_B1     (CAP_TO_MASK(CAP_MAC_OVERRIDE))
+
 #if _LINUX_CAPABILITY_U32S != 2
 # error Fix up hand-coded capability macro initializers
 #else /* HAND-CODED capability initializers */
@@ -348,8 +367,9 @@ typedef struct kernel_cap_struct {
 # define CAP_EMPTY_SET    {{ 0, 0 }}
 # define CAP_FULL_SET     {{ ~0, ~0 }}
 # define CAP_INIT_EFF_SET {{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}
-# define CAP_FS_SET       {{ CAP_FS_MASK_B0, 0 }}
-# define CAP_NFSD_SET     {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), 0 }}
+# define CAP_FS_SET       {{ CAP_FS_MASK_B0, CAP_FS_MASK_B1 } }
+# define CAP_NFSD_SET     {{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \
+			     CAP_FS_MASK_B1 } }
 
 #endif /* _LINUX_CAPABILITY_U32S != 2 */
 
diff --git a/security/Kconfig b/security/Kconfig
index 389e151..25ffe1b 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -105,6 +105,7 @@ config SECURITY_ROOTPLUG
 	  If you are unsure how to answer this question, answer N.
 
 source security/selinux/Kconfig
+source security/smack/Kconfig
 
 endmenu
 
diff --git a/security/Makefile b/security/Makefile
index ef87df2..9e8b025 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_KEYS)			+= keys/
 subdir-$(CONFIG_SECURITY_SELINUX)	+= selinux
+subdir-$(CONFIG_SECURITY_SMACK)		+= smack
 
 # if we don't select a security model, use the default capabilities
 ifneq ($(CONFIG_SECURITY),y)
@@ -14,5 +15,6 @@ endif
 obj-$(CONFIG_SECURITY)			+= security.o dummy.o inode.o
 # Must precede capability.o in order to stack properly.
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/built-in.o
+obj-$(CONFIG_SECURITY_SMACK)		+= commoncap.o smack/built-in.o
 obj-$(CONFIG_SECURITY_CAPABILITIES)	+= commoncap.o capability.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)		+= commoncap.o root_plug.o
diff --git a/security/smack/Kconfig b/security/smack/Kconfig
new file mode 100644
index 0000000..603b087
--- /dev/null
+++ b/security/smack/Kconfig
@@ -0,0 +1,10 @@
+config SECURITY_SMACK
+	bool "Simplified Mandatory Access Control Kernel Support"
+	depends on NETLABEL && SECURITY_NETWORK
+	default n
+	help
+	  This selects the Simplified Mandatory Access Control Kernel.
+	  Smack is useful for sensitivity, integrity, and a variety
+	  of other mandatory security schemes.
+	  If you are unsure how to answer this question, answer N.
+
diff --git a/security/smack/Makefile b/security/smack/Makefile
new file mode 100644
index 0000000..67a63aa
--- /dev/null
+++ b/security/smack/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the SMACK LSM
+#
+
+obj-$(CONFIG_SECURITY_SMACK) := smack.o
+
+smack-y := smack_lsm.o smack_access.o smackfs.o
diff --git a/security/smack/smack.h b/security/smack/smack.h
new file mode 100644
index 0000000..a21a0e9
--- /dev/null
+++ b/security/smack/smack.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *      This program is free software; you can redistribute it and/or modify
+ *      it under the terms of the GNU General Public License as published by
+ *      the Free Software Foundation, version 2.
+ *
+ * Author:
+ *      Casey Schaufler <casey@schaufler-ca.com>
+ *
+ */
+
+#ifndef _SECURITY_SMACK_H
+#define _SECURITY_SMACK_H
+
+#include <linux/capability.h>
+#include <linux/spinlock.h>
+#include <net/netlabel.h>
+
+/*
+ * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is
+ * bigger than can be used, and 24 is the next lower multiple
+ * of 8, and there are too many issues if there isn't space set
+ * aside for the terminating null byte.
+ */
+#define SMK_MAXLEN	23
+#define SMK_LABELLEN	(SMK_MAXLEN+1)
+
+/*
+ * How many kinds of access are there?
+ * Here's your answer.
+ */
+#define SMK_ACCESSDASH	'-'
+#define SMK_ACCESSLOW	"rwxa"
+#define SMK_ACCESSKINDS	(sizeof(SMK_ACCESSLOW) - 1)
+
+struct superblock_smack {
+	char		*smk_root;
+	char		*smk_floor;
+	char		*smk_hat;
+	char		*smk_default;
+	int		smk_initialized;
+	spinlock_t	smk_sblock;	/* for initialization */
+};
+
+struct socket_smack {
+	char		*smk_out;			/* outbound label */
+	char		*smk_in;			/* inbound label */
+	char		smk_packet[SMK_LABELLEN];	/* TCP peer label */
+};
+
+/*
+ * Inode smack data
+ */
+struct inode_smack {
+	char		*smk_inode;	/* label of the fso */
+	struct mutex	smk_lock;	/* initialization lock */
+	int		smk_flags;	/* smack inode flags */
+};
+
+#define	SMK_INODE_INSTANT	0x01	/* inode is instantiated */
+
+/*
+ * A label access rule.
+ */
+struct smack_rule {
+	char	*smk_subject;
+	char	*smk_object;
+	int	smk_access;
+};
+
+/*
+ * An entry in the table of permitted label accesses.
+ */
+struct smk_list_entry {
+	struct smk_list_entry	*smk_next;
+	struct smack_rule	smk_rule;
+};
+
+/*
+ * An entry in the table mapping smack values to
+ * CIPSO level/category-set values.
+ */
+struct smack_cipso {
+	int	smk_level;
+	char	smk_catset[SMK_LABELLEN];
+};
+
+/*
+ * This is the repository for labels seen so that it is
+ * not necessary to keep allocating tiny chuncks of memory
+ * and so that they can be shared.
+ *
+ * Labels are never modified in place. Anytime a label
+ * is imported (e.g. xattrset on a file) the list is checked
+ * for it and it is added if it doesn't exist. The address
+ * is passed out in either case. Entries are added, but
+ * never deleted.
+ *
+ * Since labels are hanging around anyway it doesn't
+ * hurt to maintain a secid for those awkward situations
+ * where kernel components that ought to use LSM independent
+ * interfaces don't. The secid should go away when all of
+ * these components have been repaired.
+ *
+ * If there is a cipso value associated with the label it
+ * gets stored here, too. This will most likely be rare as
+ * the cipso direct mapping in used internally.
+ */
+struct smack_known {
+	struct smack_known	*smk_next;
+	char			smk_known[SMK_LABELLEN];
+	u32			smk_secid;
+	struct smack_cipso	*smk_cipso;
+	spinlock_t		smk_cipsolock; /* for changing cipso map */
+};
+
+/*
+ * Mount options
+ */
+#define SMK_FSDEFAULT	"smackfsdef="
+#define SMK_FSFLOOR	"smackfsfloor="
+#define SMK_FSHAT	"smackfshat="
+#define SMK_FSROOT	"smackfsroot="
+
+/*
+ * xattr names
+ */
+#define XATTR_SMACK_SUFFIX	"SMACK64"
+#define XATTR_SMACK_IPIN	"SMACK64IPIN"
+#define XATTR_SMACK_IPOUT	"SMACK64IPOUT"
+#define XATTR_NAME_SMACK	XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX
+#define XATTR_NAME_SMACKIPIN	XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN
+#define XATTR_NAME_SMACKIPOUT	XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT
+
+/*
+ * smackfs macic number
+ */
+#define SMACK_MAGIC	0x43415d53 /* "SMAC" */
+
+/*
+ * A limit on the number of entries in the lists
+ * makes some of the list administration easier.
+ */
+#define SMACK_LIST_MAX	10000
+
+/*
+ * CIPSO defaults.
+ */
+#define SMACK_CIPSO_DOI_DEFAULT		3	/* Historical */
+#define SMACK_CIPSO_DIRECT_DEFAULT	250	/* Arbitrary */
+#define SMACK_CIPSO_MAXCATVAL		63	/* Bigger gets harder */
+#define SMACK_CIPSO_MAXLEVEL            255     /* CIPSO 2.2 standard */
+#define SMACK_CIPSO_MAXCATNUM           239     /* CIPSO 2.2 standard */
+
+/*
+ * Just to make the common cases easier to deal with
+ */
+#define MAY_ANY		(MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
+#define MAY_ANYREAD	(MAY_READ | MAY_EXEC)
+#define MAY_ANYWRITE	(MAY_WRITE | MAY_APPEND)
+#define MAY_READWRITE	(MAY_READ | MAY_WRITE)
+#define MAY_NOT		0
+
+/*
+ * These functions are in smack_lsm.c
+ */
+struct inode_smack *new_inode_smack(char *);
+
+/*
+ * These functions are in smack_access.c
+ */
+int smk_access(char *, char *, int);
+int smk_curacc(char *, u32);
+int smack_to_cipso(const char *, struct smack_cipso *);
+void smack_from_cipso(u32, char *, char *);
+char *smack_from_secid(const u32);
+char *smk_import(const char *, int);
+struct smack_known *smk_import_entry(const char *, int);
+u32 smack_to_secid(const char *);
+
+/*
+ * Shared data.
+ */
+extern int smack_cipso_direct;
+extern int smack_net_nltype;
+extern char *smack_net_ambient;
+
+extern struct smack_known *smack_known;
+extern struct smack_known smack_known_floor;
+extern struct smack_known smack_known_hat;
+extern struct smack_known smack_known_huh;
+extern struct smack_known smack_known_invalid;
+extern struct smack_known smack_known_star;
+extern struct smack_known smack_known_unset;
+
+extern struct smk_list_entry *smack_list;
+
+/*
+ * Stricly for CIPSO level manipulation.
+ * Set the category bit number in a smack label sized buffer.
+ */
+static inline void smack_catset_bit(int cat, char *catsetp)
+{
+	if (cat > SMK_LABELLEN * 8)
+		return;
+
+	catsetp[(cat - 1) / 8] |= 0x80 >> ((cat - 1) % 8);
+}
+
+/*
+ * Present a pointer to the smack label in an inode blob.
+ */
+static inline char *smk_of_inode(const struct inode *isp)
+{
+	struct inode_smack *sip = isp->i_security;
+	return sip->smk_inode;
+}
+
+#endif  /* _SECURITY_SMACK_H */
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
new file mode 100644
index 0000000..f6b5f6e
--- /dev/null
+++ b/security/smack/smack_access.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *      This program is free software; you can redistribute it and/or modify
+ *      it under the terms of the GNU General Public License as published by
+ *      the Free Software Foundation, version 2.
+ *
+ * Author:
+ *      Casey Schaufler <casey@schaufler-ca.com>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include "smack.h"
+
+struct smack_known smack_known_unset = {
+	.smk_next	= NULL,
+	.smk_known	= "UNSET",
+	.smk_secid	= 1,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known smack_known_huh = {
+	.smk_next	= &smack_known_unset,
+	.smk_known	= "?",
+	.smk_secid	= 2,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known smack_known_hat = {
+	.smk_next	= &smack_known_huh,
+	.smk_known	= "^",
+	.smk_secid	= 3,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known smack_known_star = {
+	.smk_next	= &smack_known_hat,
+	.smk_known	= "*",
+	.smk_secid	= 4,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known smack_known_floor = {
+	.smk_next	= &smack_known_star,
+	.smk_known	= "_",
+	.smk_secid	= 5,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known smack_known_invalid = {
+	.smk_next	= &smack_known_floor,
+	.smk_known	= "",
+	.smk_secid	= 6,
+	.smk_cipso	= NULL,
+};
+
+struct smack_known *smack_known = &smack_known_invalid;
+
+/*
+ * The initial value needs to be bigger than any of the
+ * known values above.
+ */
+static u32 smack_next_secid = 10;
+
+/**
+ * smk_access - determine if a subject has a specific access to an object
+ * @subject_label: a pointer to the subject's Smack label
+ * @object_label: a pointer to the object's Smack label
+ * @request: the access requested, in "MAY" format
+ *
+ * This function looks up the subject/object pair in the
+ * access rule list and returns 0 if the access is permitted,
+ * non zero otherwise.
+ *
+ * Even though Smack labels are usually shared on smack_list
+ * labels that come in off the network can't be imported
+ * and added to the list for locking reasons.
+ *
+ * Therefore, it is necessary to check the contents of the labels,
+ * not just the pointer values. Of course, in most cases the labels
+ * will be on the list, so checking the pointers may be a worthwhile
+ * optimization.
+ */
+int smk_access(char *subject_label, char *object_label, int request)
+{
+	u32 may = MAY_NOT;
+	struct smk_list_entry *sp;
+	struct smack_rule *srp;
+
+	/*
+	 * Hardcoded comparisons.
+	 *
+	 * A star subject can't access any object.
+	 */
+	if (subject_label == smack_known_star.smk_known ||
+	    strcmp(subject_label, smack_known_star.smk_known) == 0)
+		return -EACCES;
+	/*
+	 * A star object can be accessed by any subject.
+	 */
+	if (object_label == smack_known_star.smk_known ||
+	    strcmp(object_label, smack_known_star.smk_known) == 0)
+		return 0;
+	/*
+	 * An object can be accessed in any way by a subject
+	 * with the same label.
+	 */
+	if (subject_label == object_label ||
+	    strcmp(subject_label, object_label) == 0)
+		return 0;
+	/*
+	 * A hat subject can read any object.
+	 * A floor object can be read by any subject.
+	 */
+	if ((request & MAY_ANYREAD) == request) {
+		if (object_label == smack_known_floor.smk_known ||
+		    strcmp(object_label, smack_known_floor.smk_known) == 0)
+			return 0;
+		if (subject_label == smack_known_hat.smk_known ||
+		    strcmp(subject_label, smack_known_hat.smk_known) == 0)
+			return 0;
+	}
+	/*
+	 * Beyond here an explicit relationship is required.
+	 * If the requested access is contained in the available
+	 * access (e.g. read is included in readwrite) it's
+	 * good.
+	 */
+	for (sp = smack_list; sp != NULL; sp = sp->smk_next) {
+		srp = &sp->smk_rule;
+
+		if (srp->smk_subject == subject_label ||
+		    strcmp(srp->smk_subject, subject_label) == 0) {
+			if (srp->smk_object == object_label ||
+			    strcmp(srp->smk_object, object_label) == 0) {
+				may = srp->smk_access;
+				break;
+			}
+		}
+	}
+	/*
+	 * This is a bit map operation.
+	 */
+	if ((request & may) == request)
+		return 0;
+
+	return -EACCES;
+}
+
+/**
+ * smk_curacc - determine if current has a specific access to an object
+ * @object_label: a pointer to the object's Smack label
+ * @request: the access requested, in "MAY" format
+ *
+ * This function checks the current subject label/object label pair
+ * in the access rule list and returns 0 if the access is permitted,
+ * non zero otherwise. It allows that current my have the capability
+ * to override the rules.
+ */
+int smk_curacc(char *obj_label, u32 mode)
+{
+	int rc;
+
+	rc = smk_access(current->security, obj_label, mode);
+	if (rc == 0)
+		return 0;
+
+	if (capable(CAP_MAC_OVERRIDE))
+		return 0;
+
+	return rc;
+}
+
+static DEFINE_MUTEX(smack_known_lock);
+
+/**
+ * smk_import_entry - import a label, return the list entry
+ * @string: a text string that might be a Smack label
+ * @len: the maximum size, or zero if it is NULL terminated.
+ *
+ * Returns a pointer to the entry in the label list that
+ * matches the passed string, adding it if necessary.
+ */
+struct smack_known *smk_import_entry(const char *string, int len)
+{
+	struct smack_known *skp;
+	char smack[SMK_LABELLEN];
+	int found;
+	int i;
+
+	if (len <= 0 || len > SMK_MAXLEN)
+		len = SMK_MAXLEN;
+
+	for (i = 0, found = 0; i < SMK_LABELLEN; i++) {
+		if (found)
+			smack[i] = '\0';
+		else if (i >= len || string[i] > '~' || string[i] <= ' ' ||
+			 string[i] == '/') {
+			smack[i] = '\0';
+			found = 1;
+		} else
+			smack[i] = string[i];
+	}
+
+	if (smack[0] == '\0')
+		return NULL;
+
+	mutex_lock(&smack_known_lock);
+
+	for (skp = smack_known; skp != NULL; skp = skp->smk_next)
+		if (strncmp(skp->smk_known, smack, SMK_MAXLEN) == 0)
+			break;
+
+	if (skp == NULL) {
+		skp = kzalloc(sizeof(struct smack_known), GFP_KERNEL);
+		if (skp != NULL) {
+			skp->smk_next = smack_known;
+			strncpy(skp->smk_known, smack, SMK_MAXLEN);
+			skp->smk_secid = smack_next_secid++;
+			skp->smk_cipso = NULL;
+			spin_lock_init(&skp->smk_cipsolock);
+			/*
+			 * Make sure that the entry is actually
+			 * filled before putting it on the list.
+			 */
+			smp_mb();
+			smack_known = skp;
+		}
+	}
+
+	mutex_unlock(&smack_known_lock);
+
+	return skp;
+}
+
+/**
+ * smk_import - import a smack label
+ * @string: a text string that might be a Smack label
+ * @len: the maximum size, or zero if it is NULL terminated.
+ *
+ * Returns a pointer to the label in the label list that
+ * matches the passed string, adding it if necessary.
+ */
+char *smk_import(const char *string, int len)
+{
+	struct smack_known *skp;
+
+	skp = smk_import_entry(string, len);
+	if (skp == NULL)
+		return NULL;
+	return skp->smk_known;
+}
+
+/**
+ * smack_from_secid - find the Smack label associated with a secid
+ * @secid: an integer that might be associated with a Smack label
+ *
+ * Returns a pointer to the appropraite Smack label if there is one,
+ * otherwise a pointer to the invalid Smack label.
+ */
+char *smack_from_secid(const u32 secid)
+{
+	struct smack_known *skp;
+
+	for (skp = smack_known; skp != NULL; skp = skp->smk_next)
+		if (skp->smk_secid == secid)
+			return skp->smk_known;
+
+	/*
+	 * If we got this far someone asked for the translation
+	 * of a secid that is not on the list.
+	 */
+	return smack_known_invalid.smk_known;
+}
+
+/**
+ * smack_to_secid - find the secid associated with a Smack label
+ * @smack: the Smack label
+ *
+ * Returns the appropriate secid if there is one,
+ * otherwise 0
+ */
+u32 smack_to_secid(const char *smack)
+{
+	struct smack_known *skp;
+
+	for (skp = smack_known; skp != NULL; skp = skp->smk_next)
+		if (strncmp(skp->smk_known, smack, SMK_MAXLEN) == 0)
+			return skp->smk_secid;
+	return 0;
+}
+
+/**
+ * smack_from_cipso - find the Smack label associated with a CIPSO option
+ * @level: Bell & LaPadula level from the network
+ * @cp: Bell & LaPadula categories from the network
+ * @result: where to put the Smack value
+ *
+ * This is a simple lookup in the label table.
+ *
+ * This is an odd duck as far as smack handling goes in that
+ * it sends back a copy of the smack label rather than a pointer
+ * to the master list. This is done because it is possible for
+ * a foreign host to send a smack label that is new to this
+ * machine and hence not on the list. That would not be an
+ * issue except that adding an entry to the master list can't
+ * be done at that point.
+ */
+void smack_from_cipso(u32 level, char *cp, char *result)
+{
+	struct smack_known *kp;
+	char *final = NULL;
+
+	for (kp = smack_known; final == NULL && kp != NULL; kp = kp->smk_next) {
+		if (kp->smk_cipso == NULL)
+			continue;
+
+		spin_lock_bh(&kp->smk_cipsolock);
+
+		if (kp->smk_cipso->smk_level == level &&
+		    memcmp(kp->smk_cipso->smk_catset, cp, SMK_LABELLEN) == 0)
+			final = kp->smk_known;
+
+		spin_unlock_bh(&kp->smk_cipsolock);
+	}
+	if (final == NULL)
+		final = smack_known_huh.smk_known;
+	strncpy(result, final, SMK_MAXLEN);
+	return;
+}
+
+/**
+ * smack_to_cipso - find the CIPSO option to go with a Smack label
+ * @smack: a pointer to the smack label in question
+ * @cp: where to put the result
+ *
+ * Returns zero if a value is available, non-zero otherwise.
+ */
+int smack_to_cipso(const char *smack, struct smack_cipso *cp)
+{
+	struct smack_known *kp;
+
+	for (kp = smack_known; kp != NULL; kp = kp->smk_next)
+		if (kp->smk_known == smack ||
+		    strcmp(kp->smk_known, smack) == 0)
+			break;
+
+	if (kp == NULL || kp->smk_cipso == NULL)
+		return -ENOENT;
+
+	memcpy(cp, kp->smk_cipso, sizeof(struct smack_cipso));
+	return 0;
+}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
new file mode 100644
index 0000000..1c11e42
--- /dev/null
+++ b/security/smack/smack_lsm.c
@@ -0,0 +1,2518 @@
+/*
+ *  Simplified MAC Kernel (smack) security module
+ *
+ *  This file contains the smack hook function implementations.
+ *
+ *  Author:
+ *	Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *  Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2,
+ *      as published by the Free Software Foundation.
+ */
+
+#include <linux/xattr.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <linux/ext2_fs.h>
+#include <linux/kd.h>
+#include <asm/ioctls.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/mutex.h>
+#include <linux/pipe_fs_i.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+
+#include "smack.h"
+
+/*
+ * I hope these are the hokeyist lines of code in the module. Casey.
+ */
+#define DEVPTS_SUPER_MAGIC	0x1cd1
+#define SOCKFS_MAGIC		0x534F434B
+#define TMPFS_MAGIC		0x01021994
+
+/**
+ * smk_fetch - Fetch the smack label from a file.
+ * @ip: a pointer to the inode
+ * @dp: a pointer to the dentry
+ *
+ * Returns a pointer to the master list entry for the Smack label
+ * or NULL if there was no label to fetch.
+ */
+static char *smk_fetch(struct inode *ip, struct dentry *dp)
+{
+	int rc;
+	char in[SMK_LABELLEN];
+
+	if (ip->i_op->getxattr == NULL)
+		return NULL;
+
+	rc = ip->i_op->getxattr(dp, XATTR_NAME_SMACK, in, SMK_LABELLEN);
+	if (rc < 0)
+		return NULL;
+
+	return smk_import(in, rc);
+}
+
+/**
+ * new_inode_smack - allocate an inode security blob
+ * @smack: a pointer to the Smack label to use in the blob
+ *
+ * Returns the new blob or NULL if there's no memory available
+ */
+struct inode_smack *new_inode_smack(char *smack)
+{
+	struct inode_smack *isp;
+
+	isp = kzalloc(sizeof(struct inode_smack), GFP_KERNEL);
+	if (isp == NULL)
+		return NULL;
+
+	isp->smk_inode = smack;
+	isp->smk_flags = 0;
+	mutex_init(&isp->smk_lock);
+
+	return isp;
+}
+
+/*
+ * LSM hooks.
+ * We he, that is fun!
+ */
+
+/**
+ * smack_ptrace - Smack approval on ptrace
+ * @ptp: parent task pointer
+ * @ctp: child task pointer
+ *
+ * Returns 0 if access is OK, an error code otherwise
+ *
+ * Do the capability checks, and require read and write.
+ */
+static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp)
+{
+	int rc;
+
+	rc = cap_ptrace(ptp, ctp);
+	if (rc != 0)
+		return rc;
+
+	rc = smk_access(ptp->security, ctp->security, MAY_READWRITE);
+	if (rc != 0 && __capable(ptp, CAP_MAC_OVERRIDE))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * smack_syslog - Smack approval on syslog
+ * @type: message type
+ *
+ * Require that the task has the floor label
+ *
+ * Returns 0 on success, error code otherwise.
+ */
+static int smack_syslog(int type)
+{
+	int rc;
+	char *sp = current->security;
+
+	rc = cap_syslog(type);
+	if (rc != 0)
+		return rc;
+
+	if (capable(CAP_MAC_OVERRIDE))
+		return 0;
+
+	 if (sp != smack_known_floor.smk_known)
+		rc = -EACCES;
+
+	return rc;
+}
+
+
+/*
+ * Superblock Hooks.
+ */
+
+/**
+ * smack_sb_alloc_security - allocate a superblock blob
+ * @sb: the superblock getting the blob
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ */
+static int smack_sb_alloc_security(struct super_block *sb)
+{
+	struct superblock_smack *sbsp;
+
+	sbsp = kzalloc(sizeof(struct superblock_smack), GFP_KERNEL);
+
+	if (sbsp == NULL)
+		return -ENOMEM;
+
+	sbsp->smk_root = smack_known_floor.smk_known;
+	sbsp->smk_default = smack_known_floor.smk_known;
+	sbsp->smk_floor = smack_known_floor.smk_known;
+	sbsp->smk_hat = smack_known_hat.smk_known;
+	sbsp->smk_initialized = 0;
+	spin_lock_init(&sbsp->smk_sblock);
+
+	sb->s_security = sbsp;
+
+	return 0;
+}
+
+/**
+ * smack_sb_free_security - free a superblock blob
+ * @sb: the superblock getting the blob
+ *
+ */
+static void smack_sb_free_security(struct super_block *sb)
+{
+	kfree(sb->s_security);
+	sb->s_security = NULL;
+}
+
+/**
+ * smack_sb_copy_data - copy mount options data for processing
+ * @type: file system type
+ * @orig: where to start
+ * @smackopts
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ *
+ * Copy the Smack specific mount options out of the mount
+ * options list.
+ */
+static int smack_sb_copy_data(struct file_system_type *type, void *orig,
+			      void *smackopts)
+{
+	char *cp, *commap, *otheropts, *dp;
+
+	/* Binary mount data: just copy */
+	if (type->fs_flags & FS_BINARY_MOUNTDATA) {
+		copy_page(smackopts, orig);
+		return 0;
+	}
+
+	otheropts = (char *)get_zeroed_page(GFP_KERNEL);
+	if (otheropts == NULL)
+		return -ENOMEM;
+
+	for (cp = orig, commap = orig; commap != NULL; cp = commap + 1) {
+		if (strstr(cp, SMK_FSDEFAULT) == cp)
+			dp = smackopts;
+		else if (strstr(cp, SMK_FSFLOOR) == cp)
+			dp = smackopts;
+		else if (strstr(cp, SMK_FSHAT) == cp)
+			dp = smackopts;
+		else if (strstr(cp, SMK_FSROOT) == cp)
+			dp = smackopts;
+		else
+			dp = otheropts;
+
+		commap = strchr(cp, ',');
+		if (commap != NULL)
+			*commap = '\0';
+
+		if (*dp != '\0')
+			strcat(dp, ",");
+		strcat(dp, cp);
+	}
+
+	strcpy(orig, otheropts);
+	free_page((unsigned long)otheropts);
+
+	return 0;
+}
+
+/**
+ * smack_sb_kern_mount - Smack specific mount processing
+ * @sb: the file system superblock
+ * @data: the smack mount options
+ *
+ * Returns 0 on success, an error code on failure
+ */
+static int smack_sb_kern_mount(struct super_block *sb, void *data)
+{
+	struct dentry *root = sb->s_root;
+	struct inode *inode = root->d_inode;
+	struct superblock_smack *sp = sb->s_security;
+	struct inode_smack *isp;
+	char *op;
+	char *commap;
+	char *nsp;
+
+	spin_lock(&sp->smk_sblock);
+	if (sp->smk_initialized != 0) {
+		spin_unlock(&sp->smk_sblock);
+		return 0;
+	}
+	sp->smk_initialized = 1;
+	spin_unlock(&sp->smk_sblock);
+
+	for (op = data; op != NULL; op = commap) {
+		commap = strchr(op, ',');
+		if (commap != NULL)
+			*commap++ = '\0';
+
+		if (strncmp(op, SMK_FSHAT, strlen(SMK_FSHAT)) == 0) {
+			op += strlen(SMK_FSHAT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_hat = nsp;
+		} else if (strncmp(op, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0) {
+			op += strlen(SMK_FSFLOOR);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_floor = nsp;
+		} else if (strncmp(op, SMK_FSDEFAULT,
+				   strlen(SMK_FSDEFAULT)) == 0) {
+			op += strlen(SMK_FSDEFAULT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_default = nsp;
+		} else if (strncmp(op, SMK_FSROOT, strlen(SMK_FSROOT)) == 0) {
+			op += strlen(SMK_FSROOT);
+			nsp = smk_import(op, 0);
+			if (nsp != NULL)
+				sp->smk_root = nsp;
+		}
+	}
+
+	/*
+	 * Initialize the root inode.
+	 */
+	isp = inode->i_security;
+	if (isp == NULL)
+		inode->i_security = new_inode_smack(sp->smk_root);
+	else
+		isp->smk_inode = sp->smk_root;
+
+	return 0;
+}
+
+/**
+ * smack_sb_statfs - Smack check on statfs
+ * @dentry: identifies the file system in question
+ *
+ * Returns 0 if current can read the floor of the filesystem,
+ * and error code otherwise
+ */
+static int smack_sb_statfs(struct dentry *dentry)
+{
+	struct superblock_smack *sbp = dentry->d_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_READ);
+}
+
+/**
+ * smack_sb_mount - Smack check for mounting
+ * @dev_name: unused
+ * @nd: mount point
+ * @type: unused
+ * @flags: unused
+ * @data: unused
+ *
+ * Returns 0 if current can write the floor of the filesystem
+ * being mounted on, an error code otherwise.
+ */
+static int smack_sb_mount(char *dev_name, struct nameidata *nd,
+			  char *type, unsigned long flags, void *data)
+{
+	struct superblock_smack *sbp = nd->mnt->mnt_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+}
+
+/**
+ * smack_sb_umount - Smack check for unmounting
+ * @mnt: file system to unmount
+ * @flags: unused
+ *
+ * Returns 0 if current can write the floor of the filesystem
+ * being unmounted, an error code otherwise.
+ */
+static int smack_sb_umount(struct vfsmount *mnt, int flags)
+{
+	struct superblock_smack *sbp;
+
+	sbp = mnt->mnt_sb->s_security;
+
+	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+}
+
+/*
+ * Inode hooks
+ */
+
+/**
+ * smack_inode_alloc_security - allocate an inode blob
+ * @inode - the inode in need of a blob
+ *
+ * Returns 0 if it gets a blob, -ENOMEM otherwise
+ */
+static int smack_inode_alloc_security(struct inode *inode)
+{
+	inode->i_security = new_inode_smack(current->security);
+	if (inode->i_security == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * smack_inode_free_security - free an inode blob
+ * @inode - the inode with a blob
+ *
+ * Clears the blob pointer in inode
+ */
+static void smack_inode_free_security(struct inode *inode)
+{
+	kfree(inode->i_security);
+	inode->i_security = NULL;
+}
+
+/**
+ * smack_inode_init_security - copy out the smack from an inode
+ * @inode: the inode
+ * @dir: unused
+ * @name: where to put the attribute name
+ * @value: where to put the attribute value
+ * @len: where to put the length of the attribute
+ *
+ * Returns 0 if it all works out, -ENOMEM if there's no memory
+ */
+static int smack_inode_init_security(struct inode *inode, struct inode *dir,
+				     char **name, void **value, size_t *len)
+{
+	char *isp = smk_of_inode(inode);
+
+	if (name) {
+		*name = kstrdup(XATTR_SMACK_SUFFIX, GFP_KERNEL);
+		if (*name == NULL)
+			return -ENOMEM;
+	}
+
+	if (value) {
+		*value = kstrdup(isp, GFP_KERNEL);
+		if (*value == NULL)
+			return -ENOMEM;
+	}
+
+	if (len)
+		*len = strlen(isp) + 1;
+
+	return 0;
+}
+
+/**
+ * smack_inode_link - Smack check on link
+ * @old_dentry: the existing object
+ * @dir: unused
+ * @new_dentry: the new object
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
+			    struct dentry *new_dentry)
+{
+	int rc;
+	char *isp;
+
+	isp = smk_of_inode(old_dentry->d_inode);
+	rc = smk_curacc(isp, MAY_WRITE);
+
+	if (rc == 0 && new_dentry->d_inode != NULL) {
+		isp = smk_of_inode(new_dentry->d_inode);
+		rc = smk_curacc(isp, MAY_WRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_inode_unlink - Smack check on inode deletion
+ * @dir: containing directory object
+ * @dentry: file to unlink
+ *
+ * Returns 0 if current can write the containing directory
+ * and the object, error code otherwise
+ */
+static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *ip = dentry->d_inode;
+	int rc;
+
+	/*
+	 * You need write access to the thing you're unlinking
+	 */
+	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE);
+	if (rc == 0)
+		/*
+		 * You also need write access to the containing directory
+		 */
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+
+	return rc;
+}
+
+/**
+ * smack_inode_rmdir - Smack check on directory deletion
+ * @dir: containing directory object
+ * @dentry: directory to unlink
+ *
+ * Returns 0 if current can write the containing directory
+ * and the directory, error code otherwise
+ */
+static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int rc;
+
+	/*
+	 * You need write access to the thing you're removing
+	 */
+	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+	if (rc == 0)
+		/*
+		 * You also need write access to the containing directory
+		 */
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+
+	return rc;
+}
+
+/**
+ * smack_inode_rename - Smack check on rename
+ * @old_inode: the old directory
+ * @old_dentry: unused
+ * @new_inode: the new directory
+ * @new_dentry: unused
+ *
+ * Read and write access is required on both the old and
+ * new directories.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_rename(struct inode *old_inode,
+			      struct dentry *old_dentry,
+			      struct inode *new_inode,
+			      struct dentry *new_dentry)
+{
+	int rc;
+	char *isp;
+
+	isp = smk_of_inode(old_dentry->d_inode);
+	rc = smk_curacc(isp, MAY_READWRITE);
+
+	if (rc == 0 && new_dentry->d_inode != NULL) {
+		isp = smk_of_inode(new_dentry->d_inode);
+		rc = smk_curacc(isp, MAY_READWRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_inode_permission - Smack version of permission()
+ * @inode: the inode in question
+ * @mask: the access requested
+ * @nd: unused
+ *
+ * This is the important Smack hook.
+ *
+ * Returns 0 if access is permitted, -EACCES otherwise
+ */
+static int smack_inode_permission(struct inode *inode, int mask,
+				  struct nameidata *nd)
+{
+	/*
+	 * No permission to check. Existence test. Yup, it's there.
+	 */
+	if (mask == 0)
+		return 0;
+
+	return smk_curacc(smk_of_inode(inode), mask);
+}
+
+/**
+ * smack_inode_setattr - Smack check for setting attributes
+ * @dentry: the object
+ * @iattr: for the force flag
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	/*
+	 * Need to allow for clearing the setuid bit.
+	 */
+	if (iattr->ia_valid & ATTR_FORCE)
+		return 0;
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_getattr - Smack check for getting attributes
+ * @mnt: unused
+ * @dentry: the object
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
+{
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+}
+
+/**
+ * smack_inode_setxattr - Smack check for setting xattrs
+ * @dentry: the object
+ * @name: name of the attribute
+ * @value: unused
+ * @size: unused
+ * @flags: unused
+ *
+ * This protects the Smack attribute explicitly.
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_setxattr(struct dentry *dentry, char *name,
+				void *value, size_t size, int flags)
+{
+	if (!capable(CAP_MAC_ADMIN)) {
+		if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPIN) == 0 ||
+		    strcmp(name, XATTR_NAME_SMACKIPOUT) == 0)
+			return -EPERM;
+	}
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_post_setxattr - Apply the Smack update approved above
+ * @dentry: object
+ * @name: attribute name
+ * @value: attribute value
+ * @size: attribute size
+ * @flags: unused
+ *
+ * Set the pointer in the inode blob to the entry found
+ * in the master label list.
+ */
+static void smack_inode_post_setxattr(struct dentry *dentry, char *name,
+				      void *value, size_t size, int flags)
+{
+	struct inode_smack *isp;
+	char *nsp;
+
+	/*
+	 * Not SMACK
+	 */
+	if (strcmp(name, XATTR_NAME_SMACK))
+		return;
+
+	if (size >= SMK_LABELLEN)
+		return;
+
+	isp = dentry->d_inode->i_security;
+
+	/*
+	 * No locking is done here. This is a pointer
+	 * assignment.
+	 */
+	nsp = smk_import(value, size);
+	if (nsp != NULL)
+		isp->smk_inode = nsp;
+	else
+		isp->smk_inode = smack_known_invalid.smk_known;
+
+	return;
+}
+
+/*
+ * smack_inode_getxattr - Smack check on getxattr
+ * @dentry: the object
+ * @name: unused
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_getxattr(struct dentry *dentry, char *name)
+{
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+}
+
+/*
+ * smack_inode_removexattr - Smack check on removexattr
+ * @dentry: the object
+ * @name: name of the attribute
+ *
+ * Removing the Smack attribute requires CAP_MAC_ADMIN
+ *
+ * Returns 0 if access is permitted, an error code otherwise
+ */
+static int smack_inode_removexattr(struct dentry *dentry, char *name)
+{
+	if (strcmp(name, XATTR_NAME_SMACK) == 0 && !capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+}
+
+/**
+ * smack_inode_getsecurity - get smack xattrs
+ * @inode: the object
+ * @name: attribute name
+ * @buffer: where to put the result
+ * @size: size of the buffer
+ * @err: unused
+ *
+ * Returns the size of the attribute or an error code
+ */
+static int smack_inode_getsecurity(const struct inode *inode,
+				   const char *name, void **buffer,
+				   bool alloc)
+{
+	struct socket_smack *ssp;
+	struct socket *sock;
+	struct super_block *sbp;
+	struct inode *ip = (struct inode *)inode;
+	char *isp;
+	int ilen;
+	int rc = 0;
+
+	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
+		isp = smk_of_inode(inode);
+		ilen = strlen(isp) + 1;
+		*buffer = isp;
+		return ilen;
+	}
+
+	/*
+	 * The rest of the Smack xattrs are only on sockets.
+	 */
+	sbp = ip->i_sb;
+	if (sbp->s_magic != SOCKFS_MAGIC)
+		return -EOPNOTSUPP;
+
+	sock = SOCKET_I(ip);
+	if (sock == NULL)
+		return -EOPNOTSUPP;
+
+	ssp = sock->sk->sk_security;
+
+	if (strcmp(name, XATTR_SMACK_IPIN) == 0)
+		isp = ssp->smk_in;
+	else if (strcmp(name, XATTR_SMACK_IPOUT) == 0)
+		isp = ssp->smk_out;
+	else
+		return -EOPNOTSUPP;
+
+	ilen = strlen(isp) + 1;
+	if (rc == 0) {
+		*buffer = isp;
+		rc = ilen;
+	}
+
+	return rc;
+}
+
+
+/**
+ * smack_inode_listsecurity - list the Smack attributes
+ * @inode: the object
+ * @buffer: where they go
+ * @buffer_size: size of buffer
+ *
+ * Returns 0 on success, -EINVAL otherwise
+ */
+static int smack_inode_listsecurity(struct inode *inode, char *buffer,
+				    size_t buffer_size)
+{
+	int len = strlen(XATTR_NAME_SMACK);
+
+	if (buffer != NULL && len <= buffer_size) {
+		memcpy(buffer, XATTR_NAME_SMACK, len);
+		return len;
+	}
+	return -EINVAL;
+}
+
+/*
+ * File Hooks
+ */
+
+/**
+ * smack_file_permission - Smack check on file operations
+ * @file: unused
+ * @mask: unused
+ *
+ * Returns 0
+ *
+ * Should access checks be done on each read or write?
+ * UNICOS and SELinux say yes.
+ * Trusted Solaris, Trusted Irix, and just about everyone else says no.
+ *
+ * I'll say no for now. Smack does not do the frequent
+ * label changing that SELinux does.
+ */
+static int smack_file_permission(struct file *file, int mask)
+{
+	return 0;
+}
+
+/**
+ * smack_file_alloc_security - assign a file security blob
+ * @file: the object
+ *
+ * The security blob for a file is a pointer to the master
+ * label list, so no allocation is done.
+ *
+ * Returns 0
+ */
+static int smack_file_alloc_security(struct file *file)
+{
+	file->f_security = current->security;
+	return 0;
+}
+
+/**
+ * smack_file_free_security - clear a file security blob
+ * @file: the object
+ *
+ * The security blob for a file is a pointer to the master
+ * label list, so no memory is freed.
+ */
+static void smack_file_free_security(struct file *file)
+{
+	file->f_security = NULL;
+}
+
+/**
+ * smack_file_ioctl - Smack check on ioctls
+ * @file: the object
+ * @cmd: what to do
+ * @arg: unused
+ *
+ * Relies heavily on the correct use of the ioctl command conventions.
+ *
+ * Returns 0 if allowed, error code otherwise
+ */
+static int smack_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc = 0;
+
+	if (_IOC_DIR(cmd) & _IOC_WRITE)
+		rc = smk_curacc(file->f_security, MAY_WRITE);
+
+	if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ))
+		rc = smk_curacc(file->f_security, MAY_READ);
+
+	return rc;
+}
+
+/**
+ * smack_file_lock - Smack check on file locking
+ * @file: the object
+ * @cmd unused
+ *
+ * Returns 0 if current has write access, error code otherwise
+ */
+static int smack_file_lock(struct file *file, unsigned int cmd)
+{
+	return smk_curacc(file->f_security, MAY_WRITE);
+}
+
+/**
+ * smack_file_fcntl - Smack check on fcntl
+ * @file: the object
+ * @cmd: what action to check
+ * @arg: unused
+ *
+ * Returns 0 if current has access, error code otherwise
+ */
+static int smack_file_fcntl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc;
+
+	switch (cmd) {
+	case F_DUPFD:
+	case F_GETFD:
+	case F_GETFL:
+	case F_GETLK:
+	case F_GETOWN:
+	case F_GETSIG:
+		rc = smk_curacc(file->f_security, MAY_READ);
+		break;
+	case F_SETFD:
+	case F_SETFL:
+	case F_SETLK:
+	case F_SETLKW:
+	case F_SETOWN:
+	case F_SETSIG:
+		rc = smk_curacc(file->f_security, MAY_WRITE);
+		break;
+	default:
+		rc = smk_curacc(file->f_security, MAY_READWRITE);
+	}
+
+	return rc;
+}
+
+/**
+ * smack_file_set_fowner - set the file security blob value
+ * @file: object in question
+ *
+ * Returns 0
+ * Further research may be required on this one.
+ */
+static int smack_file_set_fowner(struct file *file)
+{
+	file->f_security = current->security;
+	return 0;
+}
+
+/**
+ * smack_file_send_sigiotask - Smack on sigio
+ * @tsk: The target task
+ * @fown: the object the signal come from
+ * @signum: unused
+ *
+ * Allow a privileged task to get signals even if it shouldn't
+ *
+ * Returns 0 if a subject with the object's smack could
+ * write to the task, an error code otherwise.
+ */
+static int smack_file_send_sigiotask(struct task_struct *tsk,
+				     struct fown_struct *fown, int signum)
+{
+	struct file *file;
+	int rc;
+
+	/*
+	 * struct fown_struct is never outside the context of a struct file
+	 */
+	file = container_of(fown, struct file, f_owner);
+	rc = smk_access(file->f_security, tsk->security, MAY_WRITE);
+	if (rc != 0 && __capable(tsk, CAP_MAC_OVERRIDE))
+		return 0;
+	return rc;
+}
+
+/**
+ * smack_file_receive - Smack file receive check
+ * @file: the object
+ *
+ * Returns 0 if current has access, error code otherwise
+ */
+static int smack_file_receive(struct file *file)
+{
+	int may = 0;
+
+	/*
+	 * This code relies on bitmasks.
+	 */
+	if (file->f_mode & FMODE_READ)
+		may = MAY_READ;
+	if (file->f_mode & FMODE_WRITE)
+		may |= MAY_WRITE;
+
+	return smk_curacc(file->f_security, may);
+}
+
+/*
+ * Task hooks
+ */
+
+/**
+ * smack_task_alloc_security - "allocate" a task blob
+ * @tsk: the task in need of a blob
+ *
+ * Smack isn't using copies of blobs. Everyone
+ * points to an immutable list. No alloc required.
+ * No data copy required.
+ *
+ * Always returns 0
+ */
+static int smack_task_alloc_security(struct task_struct *tsk)
+{
+	tsk->security = current->security;
+
+	return 0;
+}
+
+/**
+ * smack_task_free_security - "free" a task blob
+ * @task: the task with the blob
+ *
+ * Smack isn't using copies of blobs. Everyone
+ * points to an immutable list. The blobs never go away.
+ * There is no leak here.
+ */
+static void smack_task_free_security(struct task_struct *task)
+{
+	task->security = NULL;
+}
+
+/**
+ * smack_task_setpgid - Smack check on setting pgid
+ * @p: the task object
+ * @pgid: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getpgid - Smack access check for getpgid
+ * @p: the object task
+ *
+ * Returns 0 if current can read the object task, error code otherwise
+ */
+static int smack_task_getpgid(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_getsid - Smack access check for getsid
+ * @p: the object task
+ *
+ * Returns 0 if current can read the object task, error code otherwise
+ */
+static int smack_task_getsid(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_getsecid - get the secid of the task
+ * @p: the object task
+ * @secid: where to put the result
+ *
+ * Sets the secid to contain a u32 version of the smack label.
+ */
+static void smack_task_getsecid(struct task_struct *p, u32 *secid)
+{
+	*secid = smack_to_secid(p->security);
+}
+
+/**
+ * smack_task_setnice - Smack check on setting nice
+ * @p: the task object
+ * @nice: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setnice(struct task_struct *p, int nice)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_setioprio - Smack check on setting ioprio
+ * @p: the task object
+ * @ioprio: unused
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_setioprio(struct task_struct *p, int ioprio)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getioprio - Smack check on reading ioprio
+ * @p: the task object
+ *
+ * Return 0 if read access is permitted
+ */
+static int smack_task_getioprio(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_setscheduler - Smack check on setting scheduler
+ * @p: the task object
+ * @policy: unused
+ * @lp: unused
+ *
+ * Return 0 if read access is permitted
+ */
+static int smack_task_setscheduler(struct task_struct *p, int policy,
+				   struct sched_param *lp)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_getscheduler - Smack check on reading scheduler
+ * @p: the task object
+ *
+ * Return 0 if read access is permitted
+ */
+static int smack_task_getscheduler(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_READ);
+}
+
+/**
+ * smack_task_movememory - Smack check on moving memory
+ * @p: the task object
+ *
+ * Return 0 if write access is permitted
+ */
+static int smack_task_movememory(struct task_struct *p)
+{
+	return smk_curacc(p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_kill - Smack check on signal delivery
+ * @p: the task object
+ * @info: unused
+ * @sig: unused
+ * @secid: identifies the smack to use in lieu of current's
+ *
+ * Return 0 if write access is permitted
+ *
+ * The secid behavior is an artifact of an SELinux hack
+ * in the USB code. Someday it may go away.
+ */
+static int smack_task_kill(struct task_struct *p, struct siginfo *info,
+			   int sig, u32 secid)
+{
+	/*
+	 * Special cases where signals really ought to go through
+	 * in spite of policy. Stephen Smalley suggests it may
+	 * make sense to change the caller so that it doesn't
+	 * bother with the LSM hook in these cases.
+	 */
+	if (info != SEND_SIG_NOINFO &&
+	    (is_si_special(info) || SI_FROMKERNEL(info)))
+		return 0;
+	/*
+	 * Sending a signal requires that the sender
+	 * can write the receiver.
+	 */
+	if (secid == 0)
+		return smk_curacc(p->security, MAY_WRITE);
+	/*
+	 * If the secid isn't 0 we're dealing with some USB IO
+	 * specific behavior. This is not clean. For one thing
+	 * we can't take privilege into account.
+	 */
+	return smk_access(smack_from_secid(secid), p->security, MAY_WRITE);
+}
+
+/**
+ * smack_task_wait - Smack access check for waiting
+ * @p: task to wait for
+ *
+ * Returns 0 if current can wait for p, error code otherwise
+ */
+static int smack_task_wait(struct task_struct *p)
+{
+	int rc;
+
+	rc = smk_access(current->security, p->security, MAY_WRITE);
+	if (rc == 0)
+		return 0;
+
+	/*
+	 * Allow the operation to succeed if either task
+	 * has privilege to perform operations that might
+	 * account for the smack labels having gotten to
+	 * be different in the first place.
+	 *
+	 * This breaks the strict subjet/object access
+	 * control ideal, taking the object's privilege
+	 * state into account in the decision as well as
+	 * the smack value.
+	 */
+	if (capable(CAP_MAC_OVERRIDE) || __capable(p, CAP_MAC_OVERRIDE))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * smack_task_to_inode - copy task smack into the inode blob
+ * @p: task to copy from
+ * inode: inode to copy to
+ *
+ * Sets the smack pointer in the inode security blob
+ */
+static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
+{
+	struct inode_smack *isp = inode->i_security;
+	isp->smk_inode = p->security;
+}
+
+/*
+ * Socket hooks.
+ */
+
+/**
+ * smack_sk_alloc_security - Allocate a socket blob
+ * @sk: the socket
+ * @family: unused
+ * @priority: memory allocation priority
+ *
+ * Assign Smack pointers to current
+ *
+ * Returns 0 on success, -ENOMEM is there's no memory
+ */
+static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
+{
+	char *csp = current->security;
+	struct socket_smack *ssp;
+
+	ssp = kzalloc(sizeof(struct socket_smack), gfp_flags);
+	if (ssp == NULL)
+		return -ENOMEM;
+
+	ssp->smk_in = csp;
+	ssp->smk_out = csp;
+	ssp->smk_packet[0] = '\0';
+
+	sk->sk_security = ssp;
+
+	return 0;
+}
+
+/**
+ * smack_sk_free_security - Free a socket blob
+ * @sk: the socket
+ *
+ * Clears the blob pointer
+ */
+static void smack_sk_free_security(struct sock *sk)
+{
+	kfree(sk->sk_security);
+}
+
+/**
+ * smack_set_catset - convert a capset to netlabel mls categories
+ * @catset: the Smack categories
+ * @sap: where to put the netlabel categories
+ *
+ * Allocates and fills attr.mls.cat
+ */
+static void smack_set_catset(char *catset, struct netlbl_lsm_secattr *sap)
+{
+	unsigned char *cp;
+	unsigned char m;
+	int cat;
+	int rc;
+	int byte;
+
+	if (catset == 0)
+		return;
+
+	sap->flags |= NETLBL_SECATTR_MLS_CAT;
+	sap->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	sap->attr.mls.cat->startbit = 0;
+
+	for (cat = 1, cp = catset, byte = 0; byte < SMK_LABELLEN; cp++, byte++)
+		for (m = 0x80; m != 0; m >>= 1, cat++) {
+			if ((m & *cp) == 0)
+				continue;
+			rc = netlbl_secattr_catmap_setbit(sap->attr.mls.cat,
+							  cat, GFP_ATOMIC);
+		}
+}
+
+/**
+ * smack_to_secattr - fill a secattr from a smack value
+ * @smack: the smack value
+ * @nlsp: where the result goes
+ *
+ * Casey says that CIPSO is good enough for now.
+ * It can be used to effect.
+ * It can also be abused to effect when necessary.
+ * Appologies to the TSIG group in general and GW in particular.
+ */
+static void smack_to_secattr(char *smack, struct netlbl_lsm_secattr *nlsp)
+{
+	struct smack_cipso cipso;
+	int rc;
+
+	switch (smack_net_nltype) {
+	case NETLBL_NLTYPE_CIPSOV4:
+		nlsp->domain = NULL;
+		nlsp->flags = NETLBL_SECATTR_DOMAIN;
+		nlsp->flags |= NETLBL_SECATTR_MLS_LVL;
+
+		rc = smack_to_cipso(smack, &cipso);
+		if (rc == 0) {
+			nlsp->attr.mls.lvl = cipso.smk_level;
+			smack_set_catset(cipso.smk_catset, nlsp);
+		} else {
+			nlsp->attr.mls.lvl = smack_cipso_direct;
+			smack_set_catset(smack, nlsp);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * smack_netlabel - Set the secattr on a socket
+ * @sk: the socket
+ *
+ * Convert the outbound smack value (smk_out) to a
+ * secattr and attach it to the socket.
+ *
+ * Returns 0 on success or an error code
+ */
+static int smack_netlabel(struct sock *sk)
+{
+	struct socket_smack *ssp = sk->sk_security;
+	struct netlbl_lsm_secattr secattr;
+	int rc = 0;
+
+	netlbl_secattr_init(&secattr);
+	smack_to_secattr(ssp->smk_out, &secattr);
+	if (secattr.flags != NETLBL_SECATTR_NONE)
+		rc = netlbl_sock_setattr(sk, &secattr);
+
+	netlbl_secattr_destroy(&secattr);
+	return rc;
+}
+
+/**
+ * smack_inode_setsecurity - set smack xattrs
+ * @inode: the object
+ * @name: attribute name
+ * @value: attribute value
+ * @size: size of the attribute
+ * @flags: unused
+ *
+ * Sets the named attribute in the appropriate blob
+ *
+ * Returns 0 on success, or an error code
+ */
+static int smack_inode_setsecurity(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	char *sp;
+	struct inode_smack *nsp = inode->i_security;
+	struct socket_smack *ssp;
+	struct socket *sock;
+
+	if (value == NULL || size > SMK_LABELLEN)
+		return -EACCES;
+
+	sp = smk_import(value, size);
+	if (sp == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
+		nsp->smk_inode = sp;
+		return 0;
+	}
+	/*
+	 * The rest of the Smack xattrs are only on sockets.
+	 */
+	if (inode->i_sb->s_magic != SOCKFS_MAGIC)
+		return -EOPNOTSUPP;
+
+	sock = SOCKET_I(inode);
+	if (sock == NULL)
+		return -EOPNOTSUPP;
+
+	ssp = sock->sk->sk_security;
+
+	if (strcmp(name, XATTR_SMACK_IPIN) == 0)
+		ssp->smk_in = sp;
+	else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) {
+		ssp->smk_out = sp;
+		return smack_netlabel(sock->sk);
+	} else
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/**
+ * smack_socket_post_create - finish socket setup
+ * @sock: the socket
+ * @family: protocol family
+ * @type: unused
+ * @protocol: unused
+ * @kern: unused
+ *
+ * Sets the netlabel information on the socket
+ *
+ * Returns 0 on success, and error code otherwise
+ */
+static int smack_socket_post_create(struct socket *sock, int family,
+				    int type, int protocol, int kern)
+{
+	if (family != PF_INET)
+		return 0;
+	/*
+	 * Set the outbound netlbl.
+	 */
+	return smack_netlabel(sock->sk);
+}
+
+/**
+ * smack_flags_to_may - convert S_ to MAY_ values
+ * @flags: the S_ value
+ *
+ * Returns the equivalent MAY_ value
+ */
+static int smack_flags_to_may(int flags)
+{
+	int may = 0;
+
+	if (flags & S_IRUGO)
+		may |= MAY_READ;
+	if (flags & S_IWUGO)
+		may |= MAY_WRITE;
+	if (flags & S_IXUGO)
+		may |= MAY_EXEC;
+
+	return may;
+}
+
+/**
+ * smack_msg_msg_alloc_security - Set the security blob for msg_msg
+ * @msg: the object
+ *
+ * Returns 0
+ */
+static int smack_msg_msg_alloc_security(struct msg_msg *msg)
+{
+	msg->security = current->security;
+	return 0;
+}
+
+/**
+ * smack_msg_msg_free_security - Clear the security blob for msg_msg
+ * @msg: the object
+ *
+ * Clears the blob pointer
+ */
+static void smack_msg_msg_free_security(struct msg_msg *msg)
+{
+	msg->security = NULL;
+}
+
+/**
+ * smack_of_shm - the smack pointer for the shm
+ * @shp: the object
+ *
+ * Returns a pointer to the smack value
+ */
+static char *smack_of_shm(struct shmid_kernel *shp)
+{
+	return (char *)shp->shm_perm.security;
+}
+
+/**
+ * smack_shm_alloc_security - Set the security blob for shm
+ * @shp: the object
+ *
+ * Returns 0
+ */
+static int smack_shm_alloc_security(struct shmid_kernel *shp)
+{
+	struct kern_ipc_perm *isp = &shp->shm_perm;
+
+	isp->security = current->security;
+	return 0;
+}
+
+/**
+ * smack_shm_free_security - Clear the security blob for shm
+ * @shp: the object
+ *
+ * Clears the blob pointer
+ */
+static void smack_shm_free_security(struct shmid_kernel *shp)
+{
+	struct kern_ipc_perm *isp = &shp->shm_perm;
+
+	isp->security = NULL;
+}
+
+/**
+ * smack_shm_associate - Smack access check for shm
+ * @shp: the object
+ * @shmflg: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
+{
+	char *ssp = smack_of_shm(shp);
+	int may;
+
+	may = smack_flags_to_may(shmflg);
+	return smk_curacc(ssp, may);
+}
+
+/**
+ * smack_shm_shmctl - Smack access check for shm
+ * @shp: the object
+ * @cmd: what it wants to do
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
+{
+	char *ssp = smack_of_shm(shp);
+	int may;
+
+	switch (cmd) {
+	case IPC_STAT:
+	case SHM_STAT:
+		may = MAY_READ;
+		break;
+	case IPC_SET:
+	case SHM_LOCK:
+	case SHM_UNLOCK:
+	case IPC_RMID:
+		may = MAY_READWRITE;
+		break;
+	case IPC_INFO:
+	case SHM_INFO:
+		/*
+		 * System level information.
+		 */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	return smk_curacc(ssp, may);
+}
+
+/**
+ * smack_shm_shmat - Smack access for shmat
+ * @shp: the object
+ * @shmaddr: unused
+ * @shmflg: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr,
+			   int shmflg)
+{
+	char *ssp = smack_of_shm(shp);
+	int may;
+
+	may = smack_flags_to_may(shmflg);
+	return smk_curacc(ssp, may);
+}
+
+/**
+ * smack_of_sem - the smack pointer for the sem
+ * @sma: the object
+ *
+ * Returns a pointer to the smack value
+ */
+static char *smack_of_sem(struct sem_array *sma)
+{
+	return (char *)sma->sem_perm.security;
+}
+
+/**
+ * smack_sem_alloc_security - Set the security blob for sem
+ * @sma: the object
+ *
+ * Returns 0
+ */
+static int smack_sem_alloc_security(struct sem_array *sma)
+{
+	struct kern_ipc_perm *isp = &sma->sem_perm;
+
+	isp->security = current->security;
+	return 0;
+}
+
+/**
+ * smack_sem_free_security - Clear the security blob for sem
+ * @sma: the object
+ *
+ * Clears the blob pointer
+ */
+static void smack_sem_free_security(struct sem_array *sma)
+{
+	struct kern_ipc_perm *isp = &sma->sem_perm;
+
+	isp->security = NULL;
+}
+
+/**
+ * smack_sem_associate - Smack access check for sem
+ * @sma: the object
+ * @semflg: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_sem_associate(struct sem_array *sma, int semflg)
+{
+	char *ssp = smack_of_sem(sma);
+	int may;
+
+	may = smack_flags_to_may(semflg);
+	return smk_curacc(ssp, may);
+}
+
+/**
+ * smack_sem_shmctl - Smack access check for sem
+ * @sma: the object
+ * @cmd: what it wants to do
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_sem_semctl(struct sem_array *sma, int cmd)
+{
+	char *ssp = smack_of_sem(sma);
+	int may;
+
+	switch (cmd) {
+	case GETPID:
+	case GETNCNT:
+	case GETZCNT:
+	case GETVAL:
+	case GETALL:
+	case IPC_STAT:
+	case SEM_STAT:
+		may = MAY_READ;
+		break;
+	case SETVAL:
+	case SETALL:
+	case IPC_RMID:
+	case IPC_SET:
+		may = MAY_READWRITE;
+		break;
+	case IPC_INFO:
+	case SEM_INFO:
+		/*
+		 * System level information
+		 */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	return smk_curacc(ssp, may);
+}
+
+/**
+ * smack_sem_semop - Smack checks of semaphore operations
+ * @sma: the object
+ * @sops: unused
+ * @nsops: unused
+ * @alter: unused
+ *
+ * Treated as read and write in all cases.
+ *
+ * Returns 0 if access is allowed, error code otherwise
+ */
+static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops,
+			   unsigned nsops, int alter)
+{
+	char *ssp = smack_of_sem(sma);
+
+	return smk_curacc(ssp, MAY_READWRITE);
+}
+
+/**
+ * smack_msg_alloc_security - Set the security blob for msg
+ * @msq: the object
+ *
+ * Returns 0
+ */
+static int smack_msg_queue_alloc_security(struct msg_queue *msq)
+{
+	struct kern_ipc_perm *kisp = &msq->q_perm;
+
+	kisp->security = current->security;
+	return 0;
+}
+
+/**
+ * smack_msg_free_security - Clear the security blob for msg
+ * @msq: the object
+ *
+ * Clears the blob pointer
+ */
+static void smack_msg_queue_free_security(struct msg_queue *msq)
+{
+	struct kern_ipc_perm *kisp = &msq->q_perm;
+
+	kisp->security = NULL;
+}
+
+/**
+ * smack_of_msq - the smack pointer for the msq
+ * @msq: the object
+ *
+ * Returns a pointer to the smack value
+ */
+static char *smack_of_msq(struct msg_queue *msq)
+{
+	return (char *)msq->q_perm.security;
+}
+
+/**
+ * smack_msg_queue_associate - Smack access check for msg_queue
+ * @msq: the object
+ * @msqflg: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
+{
+	char *msp = smack_of_msq(msq);
+	int may;
+
+	may = smack_flags_to_may(msqflg);
+	return smk_curacc(msp, may);
+}
+
+/**
+ * smack_msg_queue_msgctl - Smack access check for msg_queue
+ * @msq: the object
+ * @cmd: what it wants to do
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
+{
+	char *msp = smack_of_msq(msq);
+	int may;
+
+	switch (cmd) {
+	case IPC_STAT:
+	case MSG_STAT:
+		may = MAY_READ;
+		break;
+	case IPC_SET:
+	case IPC_RMID:
+		may = MAY_READWRITE;
+		break;
+	case IPC_INFO:
+	case MSG_INFO:
+		/*
+		 * System level information
+		 */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	return smk_curacc(msp, may);
+}
+
+/**
+ * smack_msg_queue_msgsnd - Smack access check for msg_queue
+ * @msq: the object
+ * @msg: unused
+ * @msqflg: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
+				  int msqflg)
+{
+	char *msp = smack_of_msq(msq);
+	int rc;
+
+	rc = smack_flags_to_may(msqflg);
+	return smk_curacc(msp, rc);
+}
+
+/**
+ * smack_msg_queue_msgsnd - Smack access check for msg_queue
+ * @msq: the object
+ * @msg: unused
+ * @target: unused
+ * @type: unused
+ * @mode: unused
+ *
+ * Returns 0 if current has read and write access, error code otherwise
+ */
+static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
+			struct task_struct *target, long type, int mode)
+{
+	char *msp = smack_of_msq(msq);
+
+	return smk_curacc(msp, MAY_READWRITE);
+}
+
+/**
+ * smack_ipc_permission - Smack access for ipc_permission()
+ * @ipp: the object permissions
+ * @flag: access requested
+ *
+ * Returns 0 if current has read and write access, error code otherwise
+ */
+static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
+{
+	char *isp = ipp->security;
+	int may;
+
+	may = smack_flags_to_may(flag);
+	return smk_curacc(isp, may);
+}
+
+/**
+ * smack_d_instantiate - Make sure the blob is correct on an inode
+ * @opt_dentry: unused
+ * @inode: the object
+ *
+ * Set the inode's security blob if it hasn't been done already.
+ */
+static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
+{
+	struct super_block *sbp;
+	struct superblock_smack *sbsp;
+	struct inode_smack *isp;
+	char *csp = current->security;
+	char *fetched;
+	char *final;
+	struct dentry *dp;
+
+	if (inode == NULL)
+		return;
+
+	isp = inode->i_security;
+
+	mutex_lock(&isp->smk_lock);
+	/*
+	 * If the inode is already instantiated
+	 * take the quick way out
+	 */
+	if (isp->smk_flags & SMK_INODE_INSTANT)
+		goto unlockandout;
+
+	sbp = inode->i_sb;
+	sbsp = sbp->s_security;
+	/*
+	 * We're going to use the superblock default label
+	 * if there's no label on the file.
+	 */
+	final = sbsp->smk_default;
+
+	/*
+	 * This is pretty hackish.
+	 * Casey says that we shouldn't have to do
+	 * file system specific code, but it does help
+	 * with keeping it simple.
+	 */
+	switch (sbp->s_magic) {
+	case SMACK_MAGIC:
+		/*
+		 * Casey says that it's a little embarassing
+		 * that the smack file system doesn't do
+		 * extended attributes.
+		 */
+		final = smack_known_star.smk_known;
+		break;
+	case PIPEFS_MAGIC:
+		/*
+		 * Casey says pipes are easy (?)
+		 */
+		final = smack_known_star.smk_known;
+		break;
+	case DEVPTS_SUPER_MAGIC:
+		/*
+		 * devpts seems content with the label of the task.
+		 * Programs that change smack have to treat the
+		 * pty with respect.
+		 */
+		final = csp;
+		break;
+	case SOCKFS_MAGIC:
+		/*
+		 * Casey says sockets get the smack of the task.
+		 */
+		final = csp;
+		break;
+	case PROC_SUPER_MAGIC:
+		/*
+		 * Casey says procfs appears not to care.
+		 * The superblock default suffices.
+		 */
+		break;
+	case TMPFS_MAGIC:
+		/*
+		 * Device labels should come from the filesystem,
+		 * but watch out, because they're volitile,
+		 * getting recreated on every reboot.
+		 */
+		final = smack_known_star.smk_known;
+		/*
+		 * No break.
+		 *
+		 * If a smack value has been set we want to use it,
+		 * but since tmpfs isn't giving us the opportunity
+		 * to set mount options simulate setting the
+		 * superblock default.
+		 */
+	default:
+		/*
+		 * This isn't an understood special case.
+		 * Get the value from the xattr.
+		 *
+		 * No xattr support means, alas, no SMACK label.
+		 * Use the aforeapplied default.
+		 * It would be curious if the label of the task
+		 * does not match that assigned.
+		 */
+		if (inode->i_op->getxattr == NULL)
+			break;
+		/*
+		 * Get the dentry for xattr.
+		 */
+		if (opt_dentry == NULL) {
+			dp = d_find_alias(inode);
+			if (dp == NULL)
+				break;
+		} else {
+			dp = dget(opt_dentry);
+			if (dp == NULL)
+				break;
+		}
+
+		fetched = smk_fetch(inode, dp);
+		if (fetched != NULL)
+			final = fetched;
+
+		dput(dp);
+		break;
+	}
+
+	if (final == NULL)
+		isp->smk_inode = csp;
+	else
+		isp->smk_inode = final;
+
+	isp->smk_flags |= SMK_INODE_INSTANT;
+
+unlockandout:
+	mutex_unlock(&isp->smk_lock);
+	return;
+}
+
+/**
+ * smack_getprocattr - Smack process attribute access
+ * @p: the object task
+ * @name: the name of the attribute in /proc/.../attr
+ * @value: where to put the result
+ *
+ * Places a copy of the task Smack into value
+ *
+ * Returns the length of the smack label or an error code
+ */
+static int smack_getprocattr(struct task_struct *p, char *name, char **value)
+{
+	char *cp;
+	int slen;
+
+	if (strcmp(name, "current") != 0)
+		return -EINVAL;
+
+	cp = kstrdup(p->security, GFP_KERNEL);
+	if (cp == NULL)
+		return -ENOMEM;
+
+	slen = strlen(cp);
+	*value = cp;
+	return slen;
+}
+
+/**
+ * smack_setprocattr - Smack process attribute setting
+ * @p: the object task
+ * @name: the name of the attribute in /proc/.../attr
+ * @value: the value to set
+ * @size: the size of the value
+ *
+ * Sets the Smack value of the task. Only setting self
+ * is permitted and only with privilege
+ *
+ * Returns the length of the smack label or an error code
+ */
+static int smack_setprocattr(struct task_struct *p, char *name,
+			     void *value, size_t size)
+{
+	char *newsmack;
+
+	if (!__capable(p, CAP_MAC_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Changing another process' Smack value is too dangerous
+	 * and supports no sane use case.
+	 */
+	if (p != current)
+		return -EPERM;
+
+	if (value == NULL || size == 0 || size >= SMK_LABELLEN)
+		return -EINVAL;
+
+	if (strcmp(name, "current") != 0)
+		return -EINVAL;
+
+	newsmack = smk_import(value, size);
+	if (newsmack == NULL)
+		return -EINVAL;
+
+	p->security = newsmack;
+	return size;
+}
+
+/**
+ * smack_unix_stream_connect - Smack access on UDS
+ * @sock: one socket
+ * @other: the other socket
+ * @newsk: unused
+ *
+ * Return 0 if a subject with the smack of sock could access
+ * an object with the smack of other, otherwise an error code
+ */
+static int smack_unix_stream_connect(struct socket *sock,
+				     struct socket *other, struct sock *newsk)
+{
+	struct inode *sp = SOCK_INODE(sock);
+	struct inode *op = SOCK_INODE(other);
+
+	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE);
+}
+
+/**
+ * smack_unix_may_send - Smack access on UDS
+ * @sock: one socket
+ * @other: the other socket
+ *
+ * Return 0 if a subject with the smack of sock could access
+ * an object with the smack of other, otherwise an error code
+ */
+static int smack_unix_may_send(struct socket *sock, struct socket *other)
+{
+	struct inode *sp = SOCK_INODE(sock);
+	struct inode *op = SOCK_INODE(other);
+
+	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE);
+}
+
+/**
+ * smack_from_secattr - Convert a netlabel attr.mls.lvl/attr.mls.cat
+ * 	pair to smack
+ * @sap: netlabel secattr
+ * @sip: where to put the result
+ *
+ * Copies a smack label into sip
+ */
+static void smack_from_secattr(struct netlbl_lsm_secattr *sap, char *sip)
+{
+	char smack[SMK_LABELLEN];
+	int pcat;
+
+	if ((sap->flags & NETLBL_SECATTR_MLS_LVL) == 0) {
+		/*
+		 * If there are flags but no level netlabel isn't
+		 * behaving the way we expect it to.
+		 *
+		 * Without guidance regarding the smack value
+		 * for the packet fall back on the network
+		 * ambient value.
+		 */
+		strncpy(sip, smack_net_ambient, SMK_MAXLEN);
+		return;
+	}
+	/*
+	 * Get the categories, if any
+	 */
+	memset(smack, '\0', SMK_LABELLEN);
+	if ((sap->flags & NETLBL_SECATTR_MLS_CAT) != 0)
+		for (pcat = -1;;) {
+			pcat = netlbl_secattr_catmap_walk(sap->attr.mls.cat,
+							  pcat + 1);
+			if (pcat < 0)
+				break;
+			smack_catset_bit(pcat, smack);
+		}
+	/*
+	 * If it is CIPSO using smack direct mapping
+	 * we are already done. WeeHee.
+	 */
+	if (sap->attr.mls.lvl == smack_cipso_direct) {
+		memcpy(sip, smack, SMK_MAXLEN);
+		return;
+	}
+	/*
+	 * Look it up in the supplied table if it is not a direct mapping.
+	 */
+	smack_from_cipso(sap->attr.mls.lvl, smack, sip);
+	return;
+}
+
+/**
+ * smack_socket_sock_rcv_skb - Smack packet delivery access check
+ * @sk: socket
+ * @skb: packet
+ *
+ * Returns 0 if the packet should be delivered, an error code otherwise
+ */
+static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct netlbl_lsm_secattr secattr;
+	struct socket_smack *ssp = sk->sk_security;
+	char smack[SMK_LABELLEN];
+	int rc;
+
+	if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+		return 0;
+
+	/*
+	 * Translate what netlabel gave us.
+	 */
+	memset(smack, '\0', SMK_LABELLEN);
+	netlbl_secattr_init(&secattr);
+	rc = netlbl_skbuff_getattr(skb, sk->sk_family, &secattr);
+	if (rc == 0)
+		smack_from_secattr(&secattr, smack);
+	else
+		strncpy(smack, smack_net_ambient, SMK_MAXLEN);
+	netlbl_secattr_destroy(&secattr);
+	/*
+	 * Receiving a packet requires that the other end
+	 * be able to write here. Read access is not required.
+	 * This is the simplist possible security model
+	 * for networking.
+	 */
+	return smk_access(smack, ssp->smk_in, MAY_WRITE);
+}
+
+/**
+ * smack_socket_getpeersec_stream - pull in packet label
+ * @sock: the socket
+ * @optval: user's destination
+ * @optlen: size thereof
+ * @len: max thereoe
+ *
+ * returns zero on success, an error code otherwise
+ */
+static int smack_socket_getpeersec_stream(struct socket *sock,
+					  char __user *optval,
+					  int __user *optlen, unsigned len)
+{
+	struct socket_smack *ssp;
+	int slen;
+	int rc = 0;
+
+	ssp = sock->sk->sk_security;
+	slen = strlen(ssp->smk_packet) + 1;
+
+	if (slen > len)
+		rc = -ERANGE;
+	else if (copy_to_user(optval, ssp->smk_packet, slen) != 0)
+		rc = -EFAULT;
+
+	if (put_user(slen, optlen) != 0)
+		rc = -EFAULT;
+
+	return rc;
+}
+
+
+/**
+ * smack_socket_getpeersec_dgram - pull in packet label
+ * @sock: the socket
+ * @skb: packet data
+ * @secid: pointer to where to put the secid of the packet
+ *
+ * Sets the netlabel socket state on sk from parent
+ */
+static int smack_socket_getpeersec_dgram(struct socket *sock,
+					 struct sk_buff *skb, u32 *secid)
+
+{
+	struct netlbl_lsm_secattr secattr;
+	struct sock *sk;
+	char smack[SMK_LABELLEN];
+	int family = PF_INET;
+	u32 s;
+	int rc;
+
+	/*
+	 * Only works for families with packets.
+	 */
+	if (sock != NULL) {
+		sk = sock->sk;
+		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+			return 0;
+		family = sk->sk_family;
+	}
+	/*
+	 * Translate what netlabel gave us.
+	 */
+	memset(smack, '\0', SMK_LABELLEN);
+	netlbl_secattr_init(&secattr);
+	rc = netlbl_skbuff_getattr(skb, family, &secattr);
+	if (rc == 0)
+		smack_from_secattr(&secattr, smack);
+	netlbl_secattr_destroy(&secattr);
+
+	/*
+	 * Give up if we couldn't get anything
+	 */
+	if (rc != 0)
+		return rc;
+
+	s = smack_to_secid(smack);
+	if (s == 0)
+		return -EINVAL;
+
+	*secid = s;
+	return 0;
+}
+
+/**
+ * smack_sock_graft - graft access state between two sockets
+ * @sk: fresh sock
+ * @parent: donor socket
+ *
+ * Sets the netlabel socket state on sk from parent
+ */
+static void smack_sock_graft(struct sock *sk, struct socket *parent)
+{
+	struct socket_smack *ssp;
+	int rc;
+
+	if (sk == NULL)
+		return;
+
+	if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+		return;
+
+	ssp = sk->sk_security;
+	ssp->smk_in = current->security;
+	ssp->smk_out = current->security;
+	ssp->smk_packet[0] = '\0';
+
+	rc = smack_netlabel(sk);
+}
+
+/**
+ * smack_inet_conn_request - Smack access check on connect
+ * @sk: socket involved
+ * @skb: packet
+ * @req: unused
+ *
+ * Returns 0 if a task with the packet label could write to
+ * the socket, otherwise an error code
+ */
+static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
+				   struct request_sock *req)
+{
+	struct netlbl_lsm_secattr skb_secattr;
+	struct socket_smack *ssp = sk->sk_security;
+	char smack[SMK_LABELLEN];
+	int rc;
+
+	if (skb == NULL)
+		return -EACCES;
+
+	memset(smack, '\0', SMK_LABELLEN);
+	netlbl_secattr_init(&skb_secattr);
+	rc = netlbl_skbuff_getattr(skb, sk->sk_family, &skb_secattr);
+	if (rc == 0)
+		smack_from_secattr(&skb_secattr, smack);
+	else
+		strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN);
+	netlbl_secattr_destroy(&skb_secattr);
+	/*
+	 * Receiving a packet requires that the other end
+	 * be able to write here. Read access is not required.
+	 *
+	 * If the request is successful save the peer's label
+	 * so that SO_PEERCRED can report it.
+	 */
+	rc = smk_access(smack, ssp->smk_in, MAY_WRITE);
+	if (rc == 0)
+		strncpy(ssp->smk_packet, smack, SMK_MAXLEN);
+
+	return rc;
+}
+
+/*
+ * Key management security hooks
+ *
+ * Casey has not tested key support very heavily.
+ * The permission check is most likely too restrictive.
+ * If you care about keys please have a look.
+ */
+#ifdef CONFIG_KEYS
+
+/**
+ * smack_key_alloc - Set the key security blob
+ * @key: object
+ * @tsk: the task associated with the key
+ * @flags: unused
+ *
+ * No allocation required
+ *
+ * Returns 0
+ */
+static int smack_key_alloc(struct key *key, struct task_struct *tsk,
+			   unsigned long flags)
+{
+	key->security = tsk->security;
+	return 0;
+}
+
+/**
+ * smack_key_free - Clear the key security blob
+ * @key: the object
+ *
+ * Clear the blob pointer
+ */
+static void smack_key_free(struct key *key)
+{
+	key->security = NULL;
+}
+
+/*
+ * smack_key_permission - Smack access on a key
+ * @key_ref: gets to the object
+ * @context: task involved
+ * @perm: unused
+ *
+ * Return 0 if the task has read and write to the object,
+ * an error code otherwise
+ */
+static int smack_key_permission(key_ref_t key_ref,
+				struct task_struct *context, key_perm_t perm)
+{
+	struct key *keyp;
+
+	keyp = key_ref_to_ptr(key_ref);
+	if (keyp == NULL)
+		return -EINVAL;
+	/*
+	 * If the key hasn't been initialized give it access so that
+	 * it may do so.
+	 */
+	if (keyp->security == NULL)
+		return 0;
+	/*
+	 * This should not occur
+	 */
+	if (context->security == NULL)
+		return -EACCES;
+
+	return smk_access(context->security, keyp->security, MAY_READWRITE);
+}
+#endif /* CONFIG_KEYS */
+
+/*
+ * smack_secid_to_secctx - return the smack label for a secid
+ * @secid: incoming integer
+ * @secdata: destination
+ * @seclen: how long it is
+ *
+ * Exists for networking code.
+ */
+static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+{
+	char *sp = smack_from_secid(secid);
+
+	*secdata = sp;
+	*seclen = strlen(sp);
+	return 0;
+}
+
+/*
+ * smack_release_secctx - don't do anything.
+ * @key_ref: unused
+ * @context: unused
+ * @perm: unused
+ *
+ * Exists to make sure nothing gets done, and properly
+ */
+static void smack_release_secctx(char *secdata, u32 seclen)
+{
+}
+
+static struct security_operations smack_ops = {
+	.ptrace = 			smack_ptrace,
+	.capget = 			cap_capget,
+	.capset_check = 		cap_capset_check,
+	.capset_set = 			cap_capset_set,
+	.capable = 			cap_capable,
+	.syslog = 			smack_syslog,
+	.settime = 			cap_settime,
+	.vm_enough_memory = 		cap_vm_enough_memory,
+
+	.bprm_apply_creds = 		cap_bprm_apply_creds,
+	.bprm_set_security = 		cap_bprm_set_security,
+	.bprm_secureexec = 		cap_bprm_secureexec,
+
+	.sb_alloc_security = 		smack_sb_alloc_security,
+	.sb_free_security = 		smack_sb_free_security,
+	.sb_copy_data = 		smack_sb_copy_data,
+	.sb_kern_mount = 		smack_sb_kern_mount,
+	.sb_statfs = 			smack_sb_statfs,
+	.sb_mount = 			smack_sb_mount,
+	.sb_umount = 			smack_sb_umount,
+
+	.inode_alloc_security = 	smack_inode_alloc_security,
+	.inode_free_security = 		smack_inode_free_security,
+	.inode_init_security = 		smack_inode_init_security,
+	.inode_link = 			smack_inode_link,
+	.inode_unlink = 		smack_inode_unlink,
+	.inode_rmdir = 			smack_inode_rmdir,
+	.inode_rename = 		smack_inode_rename,
+	.inode_permission = 		smack_inode_permission,
+	.inode_setattr = 		smack_inode_setattr,
+	.inode_getattr = 		smack_inode_getattr,
+	.inode_setxattr = 		smack_inode_setxattr,
+	.inode_post_setxattr = 		smack_inode_post_setxattr,
+	.inode_getxattr = 		smack_inode_getxattr,
+	.inode_removexattr = 		smack_inode_removexattr,
+	.inode_getsecurity = 		smack_inode_getsecurity,
+	.inode_setsecurity = 		smack_inode_setsecurity,
+	.inode_listsecurity = 		smack_inode_listsecurity,
+
+	.file_permission = 		smack_file_permission,
+	.file_alloc_security = 		smack_file_alloc_security,
+	.file_free_security = 		smack_file_free_security,
+	.file_ioctl = 			smack_file_ioctl,
+	.file_lock = 			smack_file_lock,
+	.file_fcntl = 			smack_file_fcntl,
+	.file_set_fowner = 		smack_file_set_fowner,
+	.file_send_sigiotask = 		smack_file_send_sigiotask,
+	.file_receive = 		smack_file_receive,
+
+	.task_alloc_security = 		smack_task_alloc_security,
+	.task_free_security = 		smack_task_free_security,
+	.task_post_setuid =		cap_task_post_setuid,
+	.task_setpgid = 		smack_task_setpgid,
+	.task_getpgid = 		smack_task_getpgid,
+	.task_getsid = 			smack_task_getsid,
+	.task_getsecid = 		smack_task_getsecid,
+	.task_setnice = 		smack_task_setnice,
+	.task_setioprio = 		smack_task_setioprio,
+	.task_getioprio = 		smack_task_getioprio,
+	.task_setscheduler = 		smack_task_setscheduler,
+	.task_getscheduler = 		smack_task_getscheduler,
+	.task_movememory = 		smack_task_movememory,
+	.task_kill = 			smack_task_kill,
+	.task_wait = 			smack_task_wait,
+	.task_reparent_to_init =	cap_task_reparent_to_init,
+	.task_to_inode = 		smack_task_to_inode,
+
+	.ipc_permission = 		smack_ipc_permission,
+
+	.msg_msg_alloc_security = 	smack_msg_msg_alloc_security,
+	.msg_msg_free_security = 	smack_msg_msg_free_security,
+
+	.msg_queue_alloc_security = 	smack_msg_queue_alloc_security,
+	.msg_queue_free_security = 	smack_msg_queue_free_security,
+	.msg_queue_associate = 		smack_msg_queue_associate,
+	.msg_queue_msgctl = 		smack_msg_queue_msgctl,
+	.msg_queue_msgsnd = 		smack_msg_queue_msgsnd,
+	.msg_queue_msgrcv = 		smack_msg_queue_msgrcv,
+
+	.shm_alloc_security = 		smack_shm_alloc_security,
+	.shm_free_security = 		smack_shm_free_security,
+	.shm_associate = 		smack_shm_associate,
+	.shm_shmctl = 			smack_shm_shmctl,
+	.shm_shmat = 			smack_shm_shmat,
+
+	.sem_alloc_security = 		smack_sem_alloc_security,
+	.sem_free_security = 		smack_sem_free_security,
+	.sem_associate = 		smack_sem_associate,
+	.sem_semctl = 			smack_sem_semctl,
+	.sem_semop = 			smack_sem_semop,
+
+	.netlink_send =			cap_netlink_send,
+	.netlink_recv = 		cap_netlink_recv,
+
+	.d_instantiate = 		smack_d_instantiate,
+
+	.getprocattr = 			smack_getprocattr,
+	.setprocattr = 			smack_setprocattr,
+
+	.unix_stream_connect = 		smack_unix_stream_connect,
+	.unix_may_send = 		smack_unix_may_send,
+
+	.socket_post_create = 		smack_socket_post_create,
+	.socket_sock_rcv_skb = 		smack_socket_sock_rcv_skb,
+	.socket_getpeersec_stream =	smack_socket_getpeersec_stream,
+	.socket_getpeersec_dgram =	smack_socket_getpeersec_dgram,
+	.sk_alloc_security = 		smack_sk_alloc_security,
+	.sk_free_security = 		smack_sk_free_security,
+	.sock_graft = 			smack_sock_graft,
+	.inet_conn_request = 		smack_inet_conn_request,
+ /* key management security hooks */
+#ifdef CONFIG_KEYS
+	.key_alloc = 			smack_key_alloc,
+	.key_free = 			smack_key_free,
+	.key_permission = 		smack_key_permission,
+#endif /* CONFIG_KEYS */
+	.secid_to_secctx = 		smack_secid_to_secctx,
+	.release_secctx = 		smack_release_secctx,
+};
+
+/**
+ * smack_init - initialize the smack system
+ *
+ * Returns 0
+ */
+static __init int smack_init(void)
+{
+	printk(KERN_INFO "Smack:  Initializing.\n");
+
+	/*
+	 * Set the security state for the initial task.
+	 */
+	current->security = &smack_known_floor.smk_known;
+
+	/*
+	 * Initialize locks
+	 */
+	spin_lock_init(&smack_known_unset.smk_cipsolock);
+	spin_lock_init(&smack_known_huh.smk_cipsolock);
+	spin_lock_init(&smack_known_hat.smk_cipsolock);
+	spin_lock_init(&smack_known_star.smk_cipsolock);
+	spin_lock_init(&smack_known_floor.smk_cipsolock);
+	spin_lock_init(&smack_known_invalid.smk_cipsolock);
+
+	/*
+	 * Register with LSM
+	 */
+	if (register_security(&smack_ops))
+		panic("smack: Unable to register with kernel.\n");
+
+	return 0;
+}
+
+/*
+ * Smack requires early initialization in order to label
+ * all processes and objects when they are created.
+ */
+security_initcall(smack_init);
+
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
new file mode 100644
index 0000000..15aa37f
--- /dev/null
+++ b/security/smack/smackfs.c
@@ -0,0 +1,981 @@
+/*
+ * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *  	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation, version 2.
+ *
+ * Authors:
+ * 	Casey Schaufler <casey@schaufler-ca.com>
+ * 	Ahmed S. Darwish <darwish.07@gmail.com>
+ *
+ * Special thanks to the authors of selinuxfs.
+ *
+ *	Karl MacMillan <kmacmillan@tresys.com>
+ *	James Morris <jmorris@redhat.com>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/seq_file.h>
+#include <linux/ctype.h>
+#include "smack.h"
+
+/*
+ * smackfs pseudo filesystem.
+ */
+
+enum smk_inos {
+	SMK_ROOT_INO	= 2,
+	SMK_LOAD	= 3,	/* load policy */
+	SMK_CIPSO	= 4,	/* load label -> CIPSO mapping */
+	SMK_DOI		= 5,	/* CIPSO DOI */
+	SMK_DIRECT	= 6,	/* CIPSO level indicating direct label */
+	SMK_AMBIENT	= 7,	/* internet ambient label */
+	SMK_NLTYPE	= 8,	/* label scheme to use by default */
+};
+
+/*
+ * List locks
+ */
+static DEFINE_MUTEX(smack_list_lock);
+static DEFINE_MUTEX(smack_cipso_lock);
+
+/*
+ * This is the "ambient" label for network traffic.
+ * If it isn't somehow marked, use this.
+ * It can be reset via smackfs/ambient
+ */
+char *smack_net_ambient = smack_known_floor.smk_known;
+
+/*
+ * This is the default packet marking scheme for network traffic.
+ * It can be reset via smackfs/nltype
+ */
+int smack_net_nltype = NETLBL_NLTYPE_CIPSOV4;
+
+/*
+ * This is the level in a CIPSO header that indicates a
+ * smack label is contained directly in the category set.
+ * It can be reset via smackfs/direct
+ */
+int smack_cipso_direct = SMACK_CIPSO_DIRECT_DEFAULT;
+
+static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT;
+struct smk_list_entry *smack_list;
+
+#define	SEQ_READ_FINISHED	1
+
+/*
+ * Disable concurrent writing open() operations
+ */
+static struct semaphore smack_write_sem;
+
+/*
+ * Values for parsing cipso rules
+ * SMK_DIGITLEN: Length of a digit field in a rule.
+ * SMK_CIPSOMEN: Minimum possible cipso rule length.
+ */
+#define SMK_DIGITLEN 4
+#define SMK_CIPSOMIN (SMK_MAXLEN + 2 * SMK_DIGITLEN)
+
+/*
+ * Seq_file read operations for /smack/load
+ */
+
+static void *load_seq_start(struct seq_file *s, loff_t *pos)
+{
+	if (*pos == SEQ_READ_FINISHED)
+		return NULL;
+
+	return smack_list;
+}
+
+static void *load_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct smk_list_entry *skp = ((struct smk_list_entry *) v)->smk_next;
+
+	if (skp == NULL)
+		*pos = SEQ_READ_FINISHED;
+
+	return skp;
+}
+
+static int load_seq_show(struct seq_file *s, void *v)
+{
+	struct smk_list_entry *slp = (struct smk_list_entry *) v;
+	struct smack_rule *srp = &slp->smk_rule;
+
+	seq_printf(s, "%s %s", (char *)srp->smk_subject,
+		   (char *)srp->smk_object);
+
+	seq_putc(s, ' ');
+
+	if (srp->smk_access & MAY_READ)
+		seq_putc(s, 'r');
+	if (srp->smk_access & MAY_WRITE)
+		seq_putc(s, 'w');
+	if (srp->smk_access & MAY_EXEC)
+		seq_putc(s, 'x');
+	if (srp->smk_access & MAY_APPEND)
+		seq_putc(s, 'a');
+	if (srp->smk_access == 0)
+		seq_putc(s, '-');
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static void load_seq_stop(struct seq_file *s, void *v)
+{
+	/* No-op */
+}
+
+static struct seq_operations load_seq_ops = {
+	.start = load_seq_start,
+	.next  = load_seq_next,
+	.show  = load_seq_show,
+	.stop  = load_seq_stop,
+};
+
+/**
+ * smk_open_load - open() for /smack/load
+ * @inode: inode structure representing file
+ * @file: "load" file pointer
+ *
+ * For reading, use load_seq_* seq_file reading operations.
+ */
+static int smk_open_load(struct inode *inode, struct file *file)
+{
+	if ((file->f_flags & O_ACCMODE) == O_RDONLY)
+		return seq_open(file, &load_seq_ops);
+
+	if (down_interruptible(&smack_write_sem))
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+/**
+ * smk_release_load - release() for /smack/load
+ * @inode: inode structure representing file
+ * @file: "load" file pointer
+ *
+ * For a reading session, use the seq_file release
+ * implementation.
+ * Otherwise, we are at the end of a writing session so
+ * clean everything up.
+ */
+static int smk_release_load(struct inode *inode, struct file *file)
+{
+	if ((file->f_flags & O_ACCMODE) == O_RDONLY)
+		return seq_release(inode, file);
+
+	up(&smack_write_sem);
+	return 0;
+}
+
+/**
+ * smk_set_access - add a rule to the rule list
+ * @srp: the new rule to add
+ *
+ * Looks through the current subject/object/access list for
+ * the subject/object pair and replaces the access that was
+ * there. If the pair isn't found add it with the specified
+ * access.
+ */
+static void smk_set_access(struct smack_rule *srp)
+{
+	struct smk_list_entry *sp;
+	struct smk_list_entry *newp;
+
+	mutex_lock(&smack_list_lock);
+
+	for (sp = smack_list; sp != NULL; sp = sp->smk_next)
+		if (sp->smk_rule.smk_subject == srp->smk_subject &&
+		    sp->smk_rule.smk_object == srp->smk_object) {
+			sp->smk_rule.smk_access = srp->smk_access;
+			break;
+		}
+
+	if (sp == NULL) {
+		newp = kzalloc(sizeof(struct smk_list_entry), GFP_KERNEL);
+		newp->smk_rule = *srp;
+		newp->smk_next = smack_list;
+		smack_list = newp;
+	}
+
+	mutex_unlock(&smack_list_lock);
+
+	return;
+}
+
+/**
+ * smk_write_load - write() for /smack/load
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start - must be 0
+ *
+ * Get one smack access rule from above.
+ * The format is exactly:
+ *     char subject[SMK_LABELLEN]
+ *     char object[SMK_LABELLEN]
+ *     char access[SMK_ACCESSKINDS]
+ *
+ *     Anything following is commentary and ignored.
+ *
+ * writes must be SMK_LABELLEN+SMK_LABELLEN+4 bytes.
+ */
+#define MINIMUM_LOAD (SMK_LABELLEN + SMK_LABELLEN + SMK_ACCESSKINDS)
+
+static ssize_t smk_write_load(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct smack_rule rule;
+	char *data;
+	int rc = -EINVAL;
+
+	/*
+	 * Must have privilege.
+	 * No partial writes.
+	 * Enough data must be present.
+	 */
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+	if (*ppos != 0)
+		return -EINVAL;
+	if (count < MINIMUM_LOAD)
+		return -EINVAL;
+
+	data = kzalloc(count, GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(data, buf, count) != 0) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	rule.smk_subject = smk_import(data, 0);
+	if (rule.smk_subject == NULL)
+		goto out;
+
+	rule.smk_object = smk_import(data + SMK_LABELLEN, 0);
+	if (rule.smk_object == NULL)
+		goto out;
+
+	rule.smk_access = 0;
+
+	switch (data[SMK_LABELLEN + SMK_LABELLEN]) {
+	case '-':
+		break;
+	case 'r':
+	case 'R':
+		rule.smk_access |= MAY_READ;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (data[SMK_LABELLEN + SMK_LABELLEN + 1]) {
+	case '-':
+		break;
+	case 'w':
+	case 'W':
+		rule.smk_access |= MAY_WRITE;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (data[SMK_LABELLEN + SMK_LABELLEN + 2]) {
+	case '-':
+		break;
+	case 'x':
+	case 'X':
+		rule.smk_access |= MAY_EXEC;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (data[SMK_LABELLEN + SMK_LABELLEN + 3]) {
+	case '-':
+		break;
+	case 'a':
+	case 'A':
+		rule.smk_access |= MAY_READ;
+		break;
+	default:
+		goto out;
+	}
+
+	smk_set_access(&rule);
+	rc = count;
+
+out:
+	kfree(data);
+	return rc;
+}
+
+static const struct file_operations smk_load_ops = {
+	.open           = smk_open_load,
+	.read		= seq_read,
+	.llseek         = seq_lseek,
+	.write		= smk_write_load,
+	.release        = smk_release_load,
+};
+
+/**
+ * smk_cipso_doi - initialize the CIPSO domain
+ */
+void smk_cipso_doi(void)
+{
+	int rc;
+	struct cipso_v4_doi *doip;
+	struct netlbl_audit audit_info;
+
+	rc = netlbl_cfg_map_del(NULL, &audit_info);
+	if (rc != 0)
+		printk(KERN_WARNING "%s:%d remove rc = %d\n",
+		       __func__, __LINE__, rc);
+
+	doip = kmalloc(sizeof(struct cipso_v4_doi), GFP_KERNEL);
+	if (doip == NULL)
+		panic("smack:  Failed to initialize cipso DOI.\n");
+	doip->map.std = NULL;
+	doip->doi = smk_cipso_doi_value;
+	doip->type = CIPSO_V4_MAP_PASS;
+	doip->tags[0] = CIPSO_V4_TAG_RBITMAP;
+	for (rc = 1; rc < CIPSO_V4_TAG_MAXCNT; rc++)
+		doip->tags[rc] = CIPSO_V4_TAG_INVALID;
+
+	rc = netlbl_cfg_cipsov4_add_map(doip, NULL, &audit_info);
+	if (rc != 0)
+		printk(KERN_WARNING "%s:%d add rc = %d\n",
+		       __func__, __LINE__, rc);
+}
+
+/*
+ * Seq_file read operations for /smack/cipso
+ */
+
+static void *cipso_seq_start(struct seq_file *s, loff_t *pos)
+{
+	if (*pos == SEQ_READ_FINISHED)
+		return NULL;
+
+	return smack_known;
+}
+
+static void *cipso_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct smack_known *skp = ((struct smack_known *) v)->smk_next;
+
+	/*
+	 * Omit labels with no associated cipso value
+	 */
+	while (skp != NULL && !skp->smk_cipso)
+		skp = skp->smk_next;
+
+	if (skp == NULL)
+		*pos = SEQ_READ_FINISHED;
+
+	return skp;
+}
+
+/*
+ * Print cipso labels in format:
+ * label level[/cat[,cat]]
+ */
+static int cipso_seq_show(struct seq_file *s, void *v)
+{
+	struct smack_known *skp = (struct smack_known *) v;
+	struct smack_cipso *scp = skp->smk_cipso;
+	char *cbp;
+	char sep = '/';
+	int cat = 1;
+	int i;
+	unsigned char m;
+
+	if (scp == NULL)
+		return 0;
+
+	seq_printf(s, "%s %3d", (char *)&skp->smk_known, scp->smk_level);
+
+	cbp = scp->smk_catset;
+	for (i = 0; i < SMK_LABELLEN; i++)
+		for (m = 0x80; m != 0; m >>= 1) {
+			if (m & cbp[i]) {
+				seq_printf(s, "%c%d", sep, cat);
+				sep = ',';
+			}
+			cat++;
+		}
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static void cipso_seq_stop(struct seq_file *s, void *v)
+{
+	/* No-op */
+}
+
+static struct seq_operations cipso_seq_ops = {
+	.start = cipso_seq_start,
+	.stop  = cipso_seq_stop,
+	.next  = cipso_seq_next,
+	.show  = cipso_seq_show,
+};
+
+/**
+ * smk_open_cipso - open() for /smack/cipso
+ * @inode: inode structure representing file
+ * @file: "cipso" file pointer
+ *
+ * Connect our cipso_seq_* operations with /smack/cipso
+ * file_operations
+ */
+static int smk_open_cipso(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cipso_seq_ops);
+}
+
+/**
+ * smk_write_cipso - write() for /smack/cipso
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Accepts only one cipso rule per write call.
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_cipso(struct file *file, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct smack_known *skp;
+	struct smack_cipso *scp = NULL;
+	char mapcatset[SMK_LABELLEN];
+	int maplevel;
+	int cat;
+	int catlen;
+	ssize_t rc = -EINVAL;
+	char *data = NULL;
+	char *rule;
+	int ret;
+	int i;
+
+	/*
+	 * Must have privilege.
+	 * No partial writes.
+	 * Enough data must be present.
+	 */
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+	if (*ppos != 0)
+		return -EINVAL;
+	if (count <= SMK_CIPSOMIN)
+		return -EINVAL;
+
+	data = kzalloc(count + 1, GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(data, buf, count) != 0) {
+		rc = -EFAULT;
+		goto unlockedout;
+	}
+
+	data[count] = '\0';
+	rule = data;
+	/*
+	 * Only allow one writer at a time. Writes should be
+	 * quite rare and small in any case.
+	 */
+	mutex_lock(&smack_cipso_lock);
+
+	skp = smk_import_entry(rule, 0);
+	if (skp == NULL)
+		goto out;
+
+	rule += SMK_LABELLEN;;
+	ret = sscanf(rule, "%d", &maplevel);
+	if (ret != 1 || maplevel > SMACK_CIPSO_MAXLEVEL)
+		goto out;
+
+	rule += SMK_DIGITLEN;
+	ret = sscanf(rule, "%d", &catlen);
+	if (ret != 1 || catlen > SMACK_CIPSO_MAXCATNUM)
+		goto out;
+
+	if (count <= (SMK_CIPSOMIN + catlen * SMK_DIGITLEN))
+		goto out;
+
+	memset(mapcatset, 0, sizeof(mapcatset));
+
+	for (i = 0; i < catlen; i++) {
+		rule += SMK_DIGITLEN;
+		ret = sscanf(rule, "%d", &cat);
+		if (ret != 1 || cat > SMACK_CIPSO_MAXCATVAL)
+			goto out;
+
+		smack_catset_bit(cat, mapcatset);
+	}
+
+	if (skp->smk_cipso == NULL) {
+		scp = kzalloc(sizeof(struct smack_cipso), GFP_KERNEL);
+		if (scp == NULL) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	spin_lock_bh(&skp->smk_cipsolock);
+
+	if (scp == NULL)
+		scp = skp->smk_cipso;
+	else
+		skp->smk_cipso = scp;
+
+	scp->smk_level = maplevel;
+	memcpy(scp->smk_catset, mapcatset, sizeof(mapcatset));
+
+	spin_unlock_bh(&skp->smk_cipsolock);
+
+	rc = count;
+out:
+	mutex_unlock(&smack_cipso_lock);
+unlockedout:
+	kfree(data);
+	return rc;
+}
+
+static const struct file_operations smk_cipso_ops = {
+	.open           = smk_open_cipso,
+	.read		= seq_read,
+	.llseek         = seq_lseek,
+	.write		= smk_write_cipso,
+	.release        = seq_release,
+};
+
+/**
+ * smk_read_doi - read() for /smack/doi
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_doi(struct file *filp, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char temp[80];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d", smk_cipso_doi_value);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+
+	return rc;
+}
+
+/**
+ * smk_write_doi - write() for /smack/doi
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_doi(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	char temp[80];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+
+	smk_cipso_doi_value = i;
+
+	smk_cipso_doi();
+
+	return count;
+}
+
+static const struct file_operations smk_doi_ops = {
+	.read		= smk_read_doi,
+	.write		= smk_write_doi,
+};
+
+/**
+ * smk_read_direct - read() for /smack/direct
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_direct(struct file *filp, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	char temp[80];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d", smack_cipso_direct);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+
+	return rc;
+}
+
+/**
+ * smk_write_direct - write() for /smack/direct
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_direct(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[80];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+
+	smack_cipso_direct = i;
+
+	return count;
+}
+
+static const struct file_operations smk_direct_ops = {
+	.read		= smk_read_direct,
+	.write		= smk_write_direct,
+};
+
+/**
+ * smk_read_ambient - read() for /smack/ambient
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @cn: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_ambient(struct file *filp, char __user *buf,
+				size_t cn, loff_t *ppos)
+{
+	ssize_t rc;
+	char out[SMK_LABELLEN];
+	int asize;
+
+	if (*ppos != 0)
+		return 0;
+	/*
+	 * Being careful to avoid a problem in the case where
+	 * smack_net_ambient gets changed in midstream.
+	 * Since smack_net_ambient is always set with a value
+	 * from the label list, including initially, and those
+	 * never get freed, the worst case is that the pointer
+	 * gets changed just after this strncpy, in which case
+	 * the value passed up is incorrect. Locking around
+	 * smack_net_ambient wouldn't be any better than this
+	 * copy scheme as by the time the caller got to look
+	 * at the ambient value it would have cleared the lock
+	 * and been changed.
+	 */
+	strncpy(out, smack_net_ambient, SMK_LABELLEN);
+	asize = strlen(out) + 1;
+
+	if (cn < asize)
+		return -EINVAL;
+
+	rc = simple_read_from_buffer(buf, cn, ppos, out, asize);
+
+	return rc;
+}
+
+/**
+ * smk_write_ambient - write() for /smack/ambient
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_ambient(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	char in[SMK_LABELLEN];
+	char *smack;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= SMK_LABELLEN)
+		return -EINVAL;
+
+	if (copy_from_user(in, buf, count) != 0)
+		return -EFAULT;
+
+	smack = smk_import(in, count);
+	if (smack == NULL)
+		return -EINVAL;
+
+	smack_net_ambient = smack;
+
+	return count;
+}
+
+static const struct file_operations smk_ambient_ops = {
+	.read		= smk_read_ambient,
+	.write		= smk_write_ambient,
+};
+
+struct option_names {
+	int	o_number;
+	char	*o_name;
+	char	*o_alias;
+};
+
+static struct option_names netlbl_choices[] = {
+	{ NETLBL_NLTYPE_RIPSO,
+		NETLBL_NLTYPE_RIPSO_NAME,	"ripso" },
+	{ NETLBL_NLTYPE_CIPSOV4,
+		NETLBL_NLTYPE_CIPSOV4_NAME,	"cipsov4" },
+	{ NETLBL_NLTYPE_CIPSOV4,
+		NETLBL_NLTYPE_CIPSOV4_NAME,	"cipso" },
+	{ NETLBL_NLTYPE_CIPSOV6,
+		NETLBL_NLTYPE_CIPSOV6_NAME,	"cipsov6" },
+	{ NETLBL_NLTYPE_UNLABELED,
+		NETLBL_NLTYPE_UNLABELED_NAME,	"unlabeled" },
+};
+
+/**
+ * smk_read_nltype - read() for /smack/nltype
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_nltype(struct file *filp, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	char bound[40];
+	ssize_t rc;
+	int i;
+
+	if (count < SMK_LABELLEN)
+		return -EINVAL;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(bound, "unknown");
+
+	for (i = 0; i < ARRAY_SIZE(netlbl_choices); i++)
+		if (smack_net_nltype == netlbl_choices[i].o_number) {
+			sprintf(bound, "%s", netlbl_choices[i].o_name);
+			break;
+		}
+
+	rc = simple_read_from_buffer(buf, count, ppos, bound, strlen(bound));
+
+	return rc;
+}
+
+/**
+ * smk_write_nltype - write() for /smack/nltype
+ * @filp: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_nltype(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char bound[40];
+	char *cp;
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= 40)
+		return -EINVAL;
+
+	if (copy_from_user(bound, buf, count) != 0)
+		return -EFAULT;
+
+	bound[count] = '\0';
+	cp = strchr(bound, ' ');
+	if (cp != NULL)
+		*cp = '\0';
+	cp = strchr(bound, '\n');
+	if (cp != NULL)
+		*cp = '\0';
+
+	for (i = 0; i < ARRAY_SIZE(netlbl_choices); i++)
+		if (strcmp(bound, netlbl_choices[i].o_name) == 0 ||
+		    strcmp(bound, netlbl_choices[i].o_alias) == 0) {
+			smack_net_nltype = netlbl_choices[i].o_number;
+			return count;
+		}
+	/*
+	 * Not a valid choice.
+	 */
+	return -EINVAL;
+}
+
+static const struct file_operations smk_nltype_ops = {
+	.read		= smk_read_nltype,
+	.write		= smk_write_nltype,
+};
+
+/**
+ * smk_fill_super - fill the /smackfs superblock
+ * @sb: the empty superblock
+ * @data: unused
+ * @silent: unused
+ *
+ * Fill in the well known entries for /smack
+ *
+ * Returns 0 on success, an error code on failure
+ */
+static int smk_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int rc;
+	struct inode *root_inode;
+
+	static struct tree_descr smack_files[] = {
+		[SMK_LOAD]	=
+			{"load", &smk_load_ops, S_IRUGO|S_IWUSR},
+		[SMK_CIPSO]	=
+			{"cipso", &smk_cipso_ops, S_IRUGO|S_IWUSR},
+		[SMK_DOI]	=
+			{"doi", &smk_doi_ops, S_IRUGO|S_IWUSR},
+		[SMK_DIRECT]	=
+			{"direct", &smk_direct_ops, S_IRUGO|S_IWUSR},
+		[SMK_AMBIENT]	=
+			{"ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR},
+		[SMK_NLTYPE]	=
+			{"nltype", &smk_nltype_ops, S_IRUGO|S_IWUSR},
+		/* last one */ {""}
+	};
+
+	rc = simple_fill_super(sb, SMACK_MAGIC, smack_files);
+	if (rc != 0) {
+		printk(KERN_ERR "%s failed %d while creating inodes\n",
+			__func__, rc);
+		return rc;
+	}
+
+	root_inode = sb->s_root->d_inode;
+	root_inode->i_security = new_inode_smack(smack_known_floor.smk_known);
+
+	return 0;
+}
+
+/**
+ * smk_get_sb - get the smackfs superblock
+ * @fs_type: passed along without comment
+ * @flags: passed along without comment
+ * @dev_name: passed along without comment
+ * @data: passed along without comment
+ * @mnt: passed along without comment
+ *
+ * Just passes everything along.
+ *
+ * Returns what the lower level code does.
+ */
+static int smk_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
+{
+	return get_sb_single(fs_type, flags, data, smk_fill_super, mnt);
+}
+
+static struct file_system_type smk_fs_type = {
+	.name		= "smackfs",
+	.get_sb		= smk_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static struct vfsmount *smackfs_mount;
+
+/**
+ * init_smk_fs - get the smackfs superblock
+ *
+ * register the smackfs
+ *
+ * Returns 0 unless the registration fails.
+ */
+static int __init init_smk_fs(void)
+{
+	int err;
+
+	err = register_filesystem(&smk_fs_type);
+	if (!err) {
+		smackfs_mount = kern_mount(&smk_fs_type);
+		if (IS_ERR(smackfs_mount)) {
+			printk(KERN_ERR "smackfs:  could not mount!\n");
+			err = PTR_ERR(smackfs_mount);
+			smackfs_mount = NULL;
+		}
+	}
+
+	sema_init(&smack_write_sem, 1);
+	smk_cipso_doi();
+
+	return err;
+}
+
+__initcall(init_smk_fs);
-- 
cgit v1.1


From 7038220aa511131f348018b85df172a21476fef9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 4 Feb 2008 22:29:53 -0800
Subject: FRV: permit the memory to be located elsewhere in NOMMU mode

Permit the memory to be located somewhere other than address 0xC0000000 in
NOMMU mode.  The configuration options are already present, it just
requires wiring up in the linker script.

Note that only a limited set of locations of runtime addresses are available
because of the way the CPU protection registers work.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/Kconfig              | 9 +++++++++
 arch/frv/kernel/vmlinux.lds.S | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index bf0468c..96f7d70 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -138,6 +138,15 @@ config UCPAGE_OFFSET_C0000000
 
 endchoice
 
+config PAGE_OFFSET
+	hex
+	default 0x20000000 if UCPAGE_OFFSET_20000000
+	default 0x40000000 if UCPAGE_OFFSET_40000000
+	default 0x60000000 if UCPAGE_OFFSET_60000000
+	default 0x80000000 if UCPAGE_OFFSET_80000000
+	default 0xA0000000 if UCPAGE_OFFSET_A0000000
+	default 0xC0000000
+
 config PROTECT_KERNEL
 	bool "Protect core kernel against userspace"
 	depends on !MMU
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index f42b328..ef7527b 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -13,7 +13,7 @@ ENTRY(_start)
 
 jiffies = jiffies_64 + 4;
 
-__page_offset = 0xc0000000;		/* start of area covered by struct pages */
+__page_offset = CONFIG_PAGE_OFFSET;	/* start of area covered by struct pages */
 __kernel_image_start = __page_offset;	/* address at which kernel image resides */
 
 SECTIONS
-- 
cgit v1.1


From 82b12e232d2a71f566dca48df4ae4548e6d8d53d Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Mon, 4 Feb 2008 22:29:54 -0800
Subject: FRV: move DMA macros to scatterlist.h for consistency.

To be consistent with other architectures, these two DMA macros should
be defined in scatterlist.h as opposed to dma-mapping.h

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-frv/dma-mapping.h | 10 ----------
 include/asm-frv/scatterlist.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h
index bcb2df6..2e8966c 100644
--- a/include/asm-frv/dma-mapping.h
+++ b/include/asm-frv/dma-mapping.h
@@ -17,16 +17,6 @@ void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle
 void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle);
 
 /*
- * These macros should be used after a pci_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns, or alternatively stop on the first sg_dma_len(sg) which
- * is 0.
- */
-#define sg_dma_address(sg)	((sg)->dma_address)
-#define sg_dma_len(sg)		((sg)->length)
-
-/*
  * Map a single buffer of the indicated size for DMA in streaming mode.
  * The 32-bit bus address to use is returned.
  *
diff --git a/include/asm-frv/scatterlist.h b/include/asm-frv/scatterlist.h
index 2e7143b..4bca8a2 100644
--- a/include/asm-frv/scatterlist.h
+++ b/include/asm-frv/scatterlist.h
@@ -31,6 +31,16 @@ struct scatterlist {
 	unsigned int	length;
 };
 
+/*
+ * These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns, or alternatively stop on the first sg_dma_len(sg) which
+ * is 0.
+ */
+#define sg_dma_address(sg)	((sg)->dma_address)
+#define sg_dma_len(sg)		((sg)->length)
+
 #define ISA_DMA_THRESHOLD (0xffffffffUL)
 
 #endif /* !_ASM_SCATTERLIST_H */
-- 
cgit v1.1


From 8c5900b2d669c7aac1e5b3e8bb8e331a0e6cff61 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Mon, 4 Feb 2008 22:29:55 -0800
Subject: frv: remove dead config symbol from FRV code

Remove dead config symbol from FRV code.

Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-frv/page.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h
index 213d92f..bd9bd2d 100644
--- a/include/asm-frv/page.h
+++ b/include/asm-frv/page.h
@@ -76,10 +76,6 @@ extern unsigned long max_pfn;
 
 #endif /* __ASSEMBLY__ */
 
-#ifdef CONFIG_CONTIGUOUS_PAGE_ALLOC
-#define WANT_PAGE_VIRTUAL	1
-#endif
-
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>
 
-- 
cgit v1.1


From 540e3102f75dca9c5e614905527599de18294cc8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 4 Feb 2008 22:29:56 -0800
Subject: frv: use find_task_by_vpid in cxn_pin_by_pid

The function is question gets the pid from sysctl table, so this one is a
virtual pid, i.e.  the pid of a task as it is seen from inside a namespace.

So the find_task_by_vpid() must be used here.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/mm/mmu-context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c
index 1530a411..81757d5 100644
--- a/arch/frv/mm/mmu-context.c
+++ b/arch/frv/mm/mmu-context.c
@@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid)
 
 	/* get a handle on the mm_struct */
 	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
+	tsk = find_task_by_vpid(pid);
 	if (tsk) {
 		ret = -EINVAL;
 
-- 
cgit v1.1


From 16791963ff7dd6a108251f5fa4b273cf1ffe531f Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@snapgear.com>
Date: Mon, 4 Feb 2008 22:29:56 -0800
Subject: m68knommu: use ARRAY_SIZE in ColdFire serial driver

Use ARRAY_SIZE macroto get maximum ports in ColdFire serial driver.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/mcf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c
index 051fcc2..e76fc72 100644
--- a/drivers/serial/mcf.c
+++ b/drivers/serial/mcf.c
@@ -434,7 +434,7 @@ static struct uart_ops mcf_uart_ops = {
 
 static struct mcf_uart mcf_ports[3];
 
-#define	MCF_MAXPORTS	(sizeof(mcf_ports) / sizeof(struct mcf_uart))
+#define	MCF_MAXPORTS	ARRAY_SIZE(mcf_ports)
 
 /****************************************************************************/
 #if defined(CONFIG_SERIAL_MCF_CONSOLE)
-- 
cgit v1.1


From c155f3f9c54c602823c3970ec8a465ec3f9c2017 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Mon, 4 Feb 2008 22:29:58 -0800
Subject: m68knomu: remove dead config symbols from m68knomu code

remove dead config symbols from m68knommu code

Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68knommu/kernel/setup.c    |  3 ---
 include/asm-m68knommu/mcfne.h    | 27 ---------------------------
 include/asm-m68knommu/mcfsim.h   |  4 +---
 include/asm-m68knommu/mcftimer.h |  2 +-
 include/asm-m68knommu/mcfuart.h  |  2 +-
 include/asm-m68knommu/system.h   | 17 -----------------
 6 files changed, 3 insertions(+), 52 deletions(-)

diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index 332345d..81507c5 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -64,9 +64,6 @@ void (*mach_power_off)(void);
 #ifdef CONFIG_M68VZ328
 	#define CPU "MC68VZ328"
 #endif
-#ifdef CONFIG_M68332
-	#define CPU "MC68332"
-#endif
 #ifdef CONFIG_M68360
 	#define CPU "MC68360"
 #endif
diff --git a/include/asm-m68knommu/mcfne.h b/include/asm-m68knommu/mcfne.h
index c920ccd..431f63a 100644
--- a/include/asm-m68knommu/mcfne.h
+++ b/include/asm-m68knommu/mcfne.h
@@ -60,17 +60,6 @@
 #define	NE2000_BYTE		volatile unsigned char
 #endif
 
-#if defined(CONFIG_CFV240)
-#define NE2000_ADDR             0x40010000
-#define NE2000_ADDR1            0x40010001
-#define NE2000_ODDOFFSET        0x00000000
-#define NE2000_IRQ              1
-#define NE2000_IRQ_VECTOR       0x19
-#define NE2000_IRQ_PRIORITY     2
-#define NE2000_IRQ_LEVEL        1
-#define	NE2000_BYTE		volatile unsigned char
-#endif
-
 #if defined(CONFIG_M5307C3)
 #define NE2000_ADDR		0x40000300
 #define NE2000_ODDOFFSET	0x00010000
@@ -173,13 +162,8 @@ void ne2000_outsw(unsigned int addr, void *vbuf, unsigned long len);
  *	On most NE2000 implementations on ColdFire boards the chip is
  *	mapped in kinda funny, due to its ISA heritage.
  */
-#ifdef CONFIG_CFV240
-#define NE2000_PTR(addr)	(NE2000_ADDR + ((addr & 0x3f) << 1) + 1)
-#define NE2000_DATA_PTR(addr)	(NE2000_ADDR + ((addr & 0x3f) << 1))
-#else
 #define	NE2000_PTR(addr)	((addr&0x1)?(NE2000_ODDOFFSET+addr-1):(addr))
 #define	NE2000_DATA_PTR(addr)	(addr)
-#endif
 
 
 void ne2000_outb(unsigned int val, unsigned int addr)
@@ -285,17 +269,6 @@ void ne2000_irqsetup(int irq)
 }
 #endif
 
-#if defined(CONFIG_CFV240)
-void ne2000_irqsetup(int irq)
-{
-	volatile unsigned char  *icrp;
-
-	icrp = (volatile unsigned char *) (MCF_MBAR + MCFSIM_ICR1);
-	*icrp = MCFSIM_ICR_LEVEL1 | MCFSIM_ICR_PRI2 | MCFSIM_ICR_AUTOVEC;
-	mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_EINT1);
-}
-#endif
-
 #if defined(CONFIG_M5206e) && defined(CONFIG_NETtel)
 void ne2000_irqsetup(int irq)
 {
diff --git a/include/asm-m68knommu/mcfsim.h b/include/asm-m68knommu/mcfsim.h
index 1074ae7..da3f2ce 100644
--- a/include/asm-m68knommu/mcfsim.h
+++ b/include/asm-m68knommu/mcfsim.h
@@ -17,9 +17,7 @@
  *	Include 5204, 5206/e, 5235, 5249, 5270/5271, 5272, 5280/5282,
  *	5307 or 5407 specific addresses.
  */
-#if defined(CONFIG_M5204)
-#include <asm/m5204sim.h>
-#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e)
+#if defined(CONFIG_M5206) || defined(CONFIG_M5206e)
 #include <asm/m5206sim.h>
 #elif defined(CONFIG_M520x)
 #include <asm/m520xsim.h>
diff --git a/include/asm-m68knommu/mcftimer.h b/include/asm-m68knommu/mcftimer.h
index 6f4d796..0f90f6d 100644
--- a/include/asm-m68knommu/mcftimer.h
+++ b/include/asm-m68knommu/mcftimer.h
@@ -16,7 +16,7 @@
 /*
  *	Get address specific defines for this ColdFire member.
  */
-#if defined(CONFIG_M5204) || defined(CONFIG_M5206) || defined(CONFIG_M5206e)
+#if defined(CONFIG_M5206) || defined(CONFIG_M5206e)
 #define	MCFTIMER_BASE1		0x100		/* Base address of TIMER1 */
 #define	MCFTIMER_BASE2		0x120		/* Base address of TIMER2 */
 #elif defined(CONFIG_M5272)
diff --git a/include/asm-m68knommu/mcfuart.h b/include/asm-m68knommu/mcfuart.h
index 8a7a677..ef22938 100644
--- a/include/asm-m68knommu/mcfuart.h
+++ b/include/asm-m68knommu/mcfuart.h
@@ -19,7 +19,7 @@
 #if defined(CONFIG_M5272)
 #define	MCFUART_BASE1		0x100		/* Base address of UART1 */
 #define	MCFUART_BASE2		0x140		/* Base address of UART2 */
-#elif defined(CONFIG_M5204) || defined(CONFIG_M5206) || defined(CONFIG_M5206e)
+#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e)
 #if defined(CONFIG_NETtel)
 #define	MCFUART_BASE1		0x180		/* Base address of UART1 */
 #define	MCFUART_BASE2		0x140		/* Base address of UART2 */
diff --git a/include/asm-m68knommu/system.h b/include/asm-m68knommu/system.h
index 15b4c7d..ee2dc07 100644
--- a/include/asm-m68knommu/system.h
+++ b/include/asm-m68knommu/system.h
@@ -207,23 +207,6 @@ cmpxchg(volatile int *p, int old, int new)
 }
 
 
-#ifdef CONFIG_M68332
-#define HARD_RESET_NOW() ({		\
-        local_irq_disable();		\
-        asm("				\
-	movew   #0x0000, 0xfffa6a;	\
-        reset;				\
-        /*movew #0x1557, 0xfffa44;*/	\
-        /*movew #0x0155, 0xfffa46;*/	\
-        moveal #0, %a0;			\
-        movec %a0, %vbr;		\
-        moveal 0, %sp;			\
-        moveal 4, %a0;			\
-        jmp (%a0);			\
-        ");				\
-})
-#endif
-
 #if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \
 	defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 )
 #define HARD_RESET_NOW() ({		\
-- 
cgit v1.1


From f156ac8c7aeddb2d85294b7a3b849178625e15e2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Mon, 4 Feb 2008 22:29:58 -0800
Subject: m68knommu: removing config variable DUMPTOFLASH

Removing config variable DUMPTOFLASH, since it is not used

Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68knommu/Kconfig.debug | 7 -------
 arch/m68knommu/defconfig     | 1 -
 2 files changed, 8 deletions(-)

diff --git a/arch/m68knommu/Kconfig.debug b/arch/m68knommu/Kconfig.debug
index 9ff47bd..ed6d9a83 100644
--- a/arch/m68knommu/Kconfig.debug
+++ b/arch/m68knommu/Kconfig.debug
@@ -21,13 +21,6 @@ config BOOTPARAM_STRING
 	default 'console=ttyS0,19200'
 	depends on BOOTPARAM
 
-config DUMPTOFLASH
-	bool "Panic/Dump to FLASH"
-	depends on COLDFIRE
-	help
-	  Dump any panic of trap output into a flash memory segment
-	  for later analysis.
-
 config NO_KERNEL_MSG
 	bool "Suppress Kernel BUG Messages"
 	help
diff --git a/arch/m68knommu/defconfig b/arch/m68knommu/defconfig
index 5a0ecaa..6481130 100644
--- a/arch/m68knommu/defconfig
+++ b/arch/m68knommu/defconfig
@@ -597,7 +597,6 @@ CONFIG_MSDOS_PARTITION=y
 # CONFIG_FULLDEBUG is not set
 # CONFIG_HIGHPROFILE is not set
 # CONFIG_BOOTPARAM is not set
-# CONFIG_DUMPTOFLASH is not set
 # CONFIG_NO_KERNEL_MSG is not set
 # CONFIG_BDM_DISABLE is not set
 
-- 
cgit v1.1


From f905bc447c303fefcb180c7e8b641746ffa6cf87 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 4 Feb 2008 22:29:59 -0800
Subject: nommu: add new vmalloc_user() and remap_vmalloc_range() interfaces.

This builds on top of the earlier vmalloc_32_user() work introduced by
b50731732f926d6c49fd0724616a7344c31cd5cf, as we now have places in the nommu
allmodconfig that hit up against these missing APIs.

As vmalloc_32_user() is already implemented, this is moved over to
vmalloc_user() and simply made a wrapper.  As all current nommu platforms are
32-bit addressable, there's no special casing we have to do for ZONE_DMA and
things of that nature as per GFP_VMALLOC32.

remap_vmalloc_range() needs to check VM_USERMAP in order to figure out whether
we permit the remap or not, which means that we also have to rework the
vmalloc_user() code to grovel for the VMA and set the flag.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David McCullough <david_mccullough@securecomputing.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/mm/nommu.c b/mm/nommu.c
index f3bfd01..5d8ae08 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,6 +10,7 @@
  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
+ *  Copyright (c) 2007      Paul Mundt <lethal@linux-sh.org>
  */
 
 #include <linux/module.h>
@@ -183,6 +184,26 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *vmalloc_user(unsigned long size)
+{
+	void *ret;
+
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	if (ret) {
+		struct vm_area_struct *vma;
+
+		down_write(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, (unsigned long)ret);
+		if (vma)
+			vma->vm_flags |= VM_USERMAP;
+		up_write(&current->mm->mmap_sem);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
 struct page *vmalloc_to_page(const void *addr)
 {
 	return virt_to_page(addr);
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32);
  *
  * The resulting memory area is 32bit addressable and zeroed so it can be
  * mapped to userspace without leaking data.
+ *
+ * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
+ * remap_vmalloc_range() are permissible.
  */
 void *vmalloc_32_user(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	/*
+	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
+	 * but for now this can simply use vmalloc_user() directly.
+	 */
+	return vmalloc_user(size);
 }
 EXPORT_SYMBOL(vmalloc_32_user);
 
@@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+			unsigned long pgoff)
+{
+	unsigned int size = vma->vm_end - vma->vm_start;
+
+	if (!(vma->vm_flags & VM_USERMAP))
+		return -EINVAL;
+
+	vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
+	vma->vm_end = vma->vm_start + size;
+
+	return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
 void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
-- 
cgit v1.1


From 49eaf7d7f0ec493beb724fbb3c60f37d7b92bc86 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:00 -0800
Subject: m68knommu: remove duplicate exports

One EXPORT_SYMBOL should be enough for everyone.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68knommu/kernel/m68k_ksyms.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c
index f795062..53fad14 100644
--- a/arch/m68knommu/kernel/m68k_ksyms.c
+++ b/arch/m68knommu/kernel/m68k_ksyms.c
@@ -24,14 +24,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(strnlen);
-EXPORT_SYMBOL(strrchr);
-EXPORT_SYMBOL(strstr);
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strcat);
-EXPORT_SYMBOL(strlen);
-EXPORT_SYMBOL(strcmp);
-EXPORT_SYMBOL(strncmp);
 
 EXPORT_SYMBOL(ip_fast_csum);
 
@@ -46,9 +38,6 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck);
    it's OK to leave it out of version control.  */
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcmp);
-EXPORT_SYMBOL(memscan);
-EXPORT_SYMBOL(memmove);
 
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
-- 
cgit v1.1


From e820ce72d3aaadb8b53455cbdf213d3439f6c280 Mon Sep 17 00:00:00 2001
From: Lucas Woods <woodzy@gmail.com>
Date: Mon, 4 Feb 2008 22:30:01 -0800
Subject: arch/alpha: remove duplicate includes

Signed-off-by: Lucas Woods <woodzy@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/setup.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index bd5e68c..beff629 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -58,7 +58,6 @@ static struct notifier_block alpha_panic_block = {
 #include <asm/system.h>
 #include <asm/hwrpb.h>
 #include <asm/dma.h>
-#include <asm/io.h>
 #include <asm/mmu_context.h>
 #include <asm/console.h>
 
-- 
cgit v1.1


From 26a6e661b118b95422ccfcd10c9997db5967df58 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:30:02 -0800
Subject: alpha: atomic_add_return() should return int

Prevents stuff like

drivers/crypto/hifn_795x.c:2443: warning: format '%d' expects type 'int', but argument 4 has type 'long int'
drivers/crypto/hifn_795x.c:2443: warning: format '%d' expects type 'int', but argument 4 has type 'long int'

(at least).

Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-alpha/atomic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h
index f5cb7b8..ca88e54 100644
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -100,7 +100,7 @@ static __inline__ void atomic64_sub(long i, atomic64_t * v)
 /*
  * Same as above, but return the result value
  */
-static __inline__ long atomic_add_return(int i, atomic_t * v)
+static inline int atomic_add_return(int i, atomic_t *v)
 {
 	long temp, result;
 	smp_mb();
-- 
cgit v1.1


From fd2e2633c208e111348c9f84fd7eeb9c844aca02 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Mon, 4 Feb 2008 22:30:02 -0800
Subject: alpha: kill deprecated virt_to_bus

pci-noop.c doesn't use DMA mappings.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/pci-noop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index 468b76c..8ac0831 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -165,7 +165,7 @@ dma_alloc_coherent(struct device *dev, size_t size,
 	ret = (void *)__get_free_pages(gfp, get_order(size));
 	if (ret) {
 		memset(ret, 0, size);
-		*dma_handle = virt_to_bus(ret);
+		*dma_handle = virt_to_phys(ret);
 	}
 	return ret;
 }
@@ -184,7 +184,7 @@ dma_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 
 		BUG_ON(!sg_page(sg));
 		va = sg_virt(sg);
-		sg_dma_address(sg) = (dma_addr_t)virt_to_bus(va);
+		sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va);
 		sg_dma_len(sg) = sg->length;
 	}
 
-- 
cgit v1.1


From 2f78dcfd30955a48a2e738d08e4001ebd0e8402c Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@citrix.com>
Date: Mon, 4 Feb 2008 22:30:03 -0800
Subject: Alpha doesn't use socketcall

Alpha doesn't use socketcall and doesn't provide __NR_socketcall.

Signed-off-by: Samuel Thibault <samuel.thibault@citrix.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-alpha/unistd.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/asm-alpha/unistd.h b/include/asm-alpha/unistd.h
index 29bf2fd..5b5c174 100644
--- a/include/asm-alpha/unistd.h
+++ b/include/asm-alpha/unistd.h
@@ -442,7 +442,6 @@
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_SOCKETCALL
 #define __ARCH_WANT_SYS_FADVISE64
 #define __ARCH_WANT_SYS_GETPGRP
 #define __ARCH_WANT_SYS_OLD_GETRLIMIT
-- 
cgit v1.1


From 6d6f8d52fd0ff5d3660368232f71c3224d7508e1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 4 Feb 2008 22:30:04 -0800
Subject: agp: alpha nopage

Convert AGP alpha driver from nopage to fault.
NULL is NOPAGE_SIGBUS, so we aren't changing behaviour there.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/agp/alpha-agp.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index aa8f3a3..e77c178 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -11,29 +11,28 @@
 
 #include "agp.h"
 
-static struct page *alpha_core_agp_vm_nopage(struct vm_area_struct *vma,
-					     unsigned long address,
-					     int *type)
+static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
+					struct vm_fault *vmf)
 {
 	alpha_agp_info *agp = agp_bridge->dev_private_data;
 	dma_addr_t dma_addr;
 	unsigned long pa;
 	struct page *page;
 
-	dma_addr = address - vma->vm_start + agp->aperture.bus_base;
+	dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start
+						+ agp->aperture.bus_base;
 	pa = agp->ops->translate(agp, dma_addr);
 
 	if (pa == (unsigned long)-EINVAL)
-		return NULL;	/* no translation */
+		return VM_FAULT_SIGBUS;	/* no translation */
 
 	/*
 	 * Get the page, inc the use count, and return it
 	 */
 	page = virt_to_page(__va(pa));
 	get_page(page);
-	if (type)
-		*type = VM_FAULT_MINOR;
-	return page;
+	vmf->page = page;
+	return 0;
 }
 
 static struct aper_size_info_fixed alpha_core_agp_sizes[] =
@@ -42,7 +41,7 @@ static struct aper_size_info_fixed alpha_core_agp_sizes[] =
 };
 
 struct vm_operations_struct alpha_core_agp_vm_ops = {
-	.nopage = alpha_core_agp_vm_nopage,
+	.fault = alpha_core_agp_vm_fault,
 };
 
 
-- 
cgit v1.1


From cbed6c6e0f7a52859fd0207ef3ffcf4bfffdbf95 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 4 Feb 2008 22:30:05 -0800
Subject: alpha: fix warning by fixing flush_tlb_kernel_range()

mm/vmalloc.c: In function 'unmap_kernel_range':
mm/vmalloc.c:75: warning: unused variable 'start'

Macros are so horrid.

Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-alpha/tlbflush.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/asm-alpha/tlbflush.h b/include/asm-alpha/tlbflush.h
index b9e9147..9d87aaa 100644
--- a/include/asm-alpha/tlbflush.h
+++ b/include/asm-alpha/tlbflush.h
@@ -142,6 +142,10 @@ extern void flush_tlb_range(struct vm_area_struct *, unsigned long,
 
 #endif /* CONFIG_SMP */
 
-#define flush_tlb_kernel_range(start, end) flush_tlb_all()
+static inline void flush_tlb_kernel_range(unsigned long start,
+					unsigned long end)
+{
+	flush_tlb_all();
+}
 
 #endif /* _ALPHA_TLBFLUSH_H */
-- 
cgit v1.1


From 47a460d5a307e639d6c9cdf9bb4857e2f5f3cb76 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:06 -0800
Subject: kernel/power/disk.c: make code static

resume_file[] and create_image() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d09da08..859a8e5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,7 +26,7 @@
 
 
 static int noresume = 0;
-char resume_file[256] = CONFIG_PM_STD_PARTITION;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 
@@ -185,7 +185,7 @@ static void platform_restore_cleanup(int platform_mode)
  *	reappears in this routine after a restore.
  */
 
-int create_image(int platform_mode)
+static int create_image(int platform_mode)
 {
 	int error;
 
-- 
cgit v1.1


From 4ef7229ffa93695e346d510b871452811509ea65 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:06 -0800
Subject: make kernel_shutdown_prepare() static

kernel_shutdown_prepare() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/reboot.h | 2 --
 kernel/sys.c           | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 85ea63f..b93b541 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -59,8 +59,6 @@ extern void machine_crash_shutdown(struct pt_regs *);
  * Architecture independent implemenations of sys_reboot commands.
  */
 
-extern void kernel_shutdown_prepare(enum system_states state);
-
 extern void kernel_restart(char *cmd);
 extern void kernel_halt(void);
 extern void kernel_power_off(void);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4162d12..53de35f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -315,7 +315,7 @@ static void kernel_kexec(void)
 #endif
 }
 
-void kernel_shutdown_prepare(enum system_states state)
+static void kernel_shutdown_prepare(enum system_states state)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
-- 
cgit v1.1


From d82b35186eaa816267f044bd70cc0acb3c7971a3 Mon Sep 17 00:00:00 2001
From: Mark Gross <mgross@linux.intel.com>
Date: Mon, 4 Feb 2008 22:30:08 -0800
Subject: pm qos infrastructure and interface

The following patch is a generalization of the latency.c implementation done
by Arjan last year.  It provides infrastructure for more than one parameter,
and exposes a user mode interface for processes to register pm_qos
expectations of processes.

This interface provides a kernel and user mode interface for registering
performance expectations by drivers, subsystems and user space applications on
one of the parameters.

Currently we have {cpu_dma_latency, network_latency, network_throughput} as
the initial set of pm_qos parameters.

The infrastructure exposes multiple misc device nodes one per implemented
parameter.  The set of parameters implement is defined by pm_qos_power_init()
and pm_qos_params.h.  This is done because having the available parameters
being runtime configurable or changeable from a driver was seen as too easy to
abuse.

For each parameter a list of performance requirements is maintained along with
an aggregated target value.  The aggregated target value is updated with
changes to the requirement list or elements of the list.  Typically the
aggregated target value is simply the max or min of the requirement values
held in the parameter list elements.

>From kernel mode the use of this interface is simple:

pm_qos_add_requirement(param_id, name, target_value):

  Will insert a named element in the list for that identified PM_QOS
  parameter with the target value.  Upon change to this list the new target is
  recomputed and any registered notifiers are called only if the target value
  is now different.

pm_qos_update_requirement(param_id, name, new_target_value):

  Will search the list identified by the param_id for the named list element
  and then update its target value, calling the notification tree if the
  aggregated target is changed.  with that name is already registered.

pm_qos_remove_requirement(param_id, name):

  Will search the identified list for the named element and remove it, after
  removal it will update the aggregate target and call the notification tree
  if the target was changed as a result of removing the named requirement.

>From user mode:

  Only processes can register a pm_qos requirement.  To provide for
  automatic cleanup for process the interface requires the process to register
  its parameter requirements in the following way:

  To register the default pm_qos target for the specific parameter, the
  process must open one of /dev/[cpu_dma_latency, network_latency,
  network_throughput]

  As long as the device node is held open that process has a registered
  requirement on the parameter.  The name of the requirement is
  "process_<PID>" derived from the current->pid from within the open system
  call.

  To change the requested target value the process needs to write a s32
  value to the open device node.  This translates to a
  pm_qos_update_requirement call.

  To remove the user mode request for a target value simply close the device
  node.

[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: fix build again]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: mark gross <mgross@linux.intel.com>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Adam Belay <abelay@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/pm_qos_interface.txt |  59 +++++
 drivers/cpuidle/cpuidle.c          |   7 +-
 drivers/cpuidle/governors/ladder.c |   5 +-
 drivers/cpuidle/governors/menu.c   |   4 +-
 include/linux/pm_qos_params.h      |  25 +++
 kernel/Makefile                    |   2 +-
 kernel/pm_qos_params.c             | 425 +++++++++++++++++++++++++++++++++++++
 7 files changed, 520 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/pm_qos_interface.txt
 create mode 100644 include/linux/pm_qos_params.h
 create mode 100644 kernel/pm_qos_params.c

diff --git a/Documentation/pm_qos_interface.txt b/Documentation/pm_qos_interface.txt
new file mode 100644
index 0000000..49adb1a
--- /dev/null
+++ b/Documentation/pm_qos_interface.txt
@@ -0,0 +1,59 @@
+PM quality of Service interface.
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Currently we have {cpu_dma_latency, network_latency, network_throughput} as the
+initial set of pm_qos parameters.
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter.  The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h.  This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requirements is maintained along with
+an aggregated target value.  The aggregated target value is updated with
+changes to the requirement list or elements of the list.  Typically the
+aggregated target value is simply the max or min of the requirement values held
+in the parameter list elements.
+
+From kernel mode the use of this interface is simple:
+pm_qos_add_requirement(param_id, name, target_value):
+Will insert a named element in the list for that identified PM_QOS parameter
+with the target value.  Upon change to this list the new target is recomputed
+and any registered notifiers are called only if the target value is now
+different.
+
+pm_qos_update_requirement(param_id, name, new_target_value):
+Will search the list identified by the param_id for the named list element and
+then update its target value, calling the notification tree if the aggregated
+target is changed.  with that name is already registered.
+
+pm_qos_remove_requirement(param_id, name):
+Will search the identified list for the named element and remove it, after
+removal it will update the aggregate target and call the notification tree if
+the target was changed as a result of removing the named requirement.
+
+
+From user mode:
+Only processes can register a pm_qos requirement.  To provide for automatic
+cleanup for process the interface requires the process to register its
+parameter requirements in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+requirement on the parameter.  The name of the requirement is "process_<PID>"
+derived from the current->pid from within the open system call.
+
+To change the requested target value the process needs to write a s32 value to
+the open device node.  This translates to a pm_qos_update_requirement call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d2fabe7..2a98d99 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -12,7 +12,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/notifier.h>
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 
@@ -265,7 +265,10 @@ static struct notifier_block cpuidle_latency_notifier = {
 	.notifier_call = cpuidle_latency_notify,
 };
 
-#define latency_notifier_init(x) do { register_latency_notifier(x); } while (0)
+static inline void latency_notifier_init(struct notifier_block *n)
+{
+	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n);
+}
 
 #else /* CONFIG_SMP */
 
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index eb666ec..ba7b9a6 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -14,7 +14,7 @@
 
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 #include <linux/moduleparam.h>
 #include <linux/jiffies.h>
 
@@ -81,7 +81,8 @@ static int ladder_select_state(struct cpuidle_device *dev)
 	/* consider promotion */
 	if (last_idx < dev->state_count - 1 &&
 	    last_residency > last_state->threshold.promotion_time &&
-	    dev->states[last_idx + 1].exit_latency <= system_latency_constraint()) {
+	    dev->states[last_idx + 1].exit_latency <=
+			pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
 		last_state->stats.promotion_count++;
 		last_state->stats.demotion_count = 0;
 		if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 299d45c..78d77c5 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -8,7 +8,7 @@
 
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 #include <linux/time.h>
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
@@ -48,7 +48,7 @@ static int menu_select(struct cpuidle_device *dev)
 			break;
 		if (s->target_residency > data->predicted_us)
 			break;
-		if (s->exit_latency > system_latency_constraint())
+		if (s->exit_latency > pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY))
 			break;
 	}
 
diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
new file mode 100644
index 0000000..2e4e97b
--- /dev/null
+++ b/include/linux/pm_qos_params.h
@@ -0,0 +1,25 @@
+/* interface for the pm_qos_power infrastructure of the linux kernel.
+ *
+ * Mark Gross
+ */
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/miscdevice.h>
+
+#define PM_QOS_RESERVED 0
+#define PM_QOS_CPU_DMA_LATENCY 1
+#define PM_QOS_NETWORK_LATENCY 2
+#define PM_QOS_NETWORK_THROUGHPUT 3
+
+#define PM_QOS_NUM_CLASSES 4
+#define PM_QOS_DEFAULT_VALUE -1
+
+int pm_qos_add_requirement(int qos, char *name, s32 value);
+int pm_qos_update_requirement(int qos, char *name, s32 new_value);
+void pm_qos_remove_requirement(int qos, char *name);
+
+int pm_qos_requirement(int qos);
+
+int pm_qos_add_notifier(int qos, struct notifier_block *notifier);
+int pm_qos_remove_notifier(int qos, struct notifier_block *notifier);
+
diff --git a/kernel/Makefile b/kernel/Makefile
index db9af70..8331243 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-	    utsname.o notifier.o ksysfs.o
+	    utsname.o notifier.o ksysfs.o pm_qos_params.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
new file mode 100644
index 0000000..0afe32b
--- /dev/null
+++ b/kernel/pm_qos_params.c
@@ -0,0 +1,425 @@
+/*
+ * This module exposes the interface to kernel space for specifying
+ * QoS dependencies.  It provides infrastructure for registration of:
+ *
+ * Dependents on a QoS value : register requirements
+ * Watchers of QoS value : get notified when target QoS value changes
+ *
+ * This QoS design is best effort based.  Dependents register their QoS needs.
+ * Watchers register to keep track of the current QoS needs of the system.
+ *
+ * There are 3 basic classes of QoS parameter: latency, timeout, throughput
+ * each have defined units:
+ * latency: usec
+ * timeout: usec <-- currently not used.
+ * throughput: kbs (kilo byte / sec)
+ *
+ * There are lists of pm_qos_objects each one wrapping requirements, notifiers
+ *
+ * User mode requirements on a QOS parameter register themselves to the
+ * subsystem by opening the device node /dev/... and writing there request to
+ * the node.  As long as the process holds a file handle open to the node the
+ * client continues to be accounted for.  Upon file release the usermode
+ * requirement is removed and a new qos target is computed.  This way when the
+ * requirement that the application has is cleaned up when closes the file
+ * pointer or exits the pm_qos_object will get an opportunity to clean up.
+ *
+ * mark gross mgross@linux.intel.com
+ */
+
+#include <linux/pm_qos_params.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+
+#include <linux/uaccess.h>
+
+/*
+ * locking rule: all changes to target_value or requirements or notifiers lists
+ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
+ * held, taken with _irqsave.  One lock to rule them all
+ */
+struct requirement_list {
+	struct list_head list;
+	union {
+		s32 value;
+		s32 usec;
+		s32 kbps;
+	};
+	char *name;
+};
+
+static s32 max_compare(s32 v1, s32 v2);
+static s32 min_compare(s32 v1, s32 v2);
+
+struct pm_qos_object {
+	struct requirement_list requirements;
+	struct blocking_notifier_head *notifiers;
+	struct miscdevice pm_qos_power_miscdev;
+	char *name;
+	s32 default_value;
+	s32 target_value;
+	s32 (*comparitor)(s32, s32);
+};
+
+static struct pm_qos_object null_pm_qos;
+static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
+static struct pm_qos_object cpu_dma_pm_qos = {
+	.requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
+	.notifiers = &cpu_dma_lat_notifier,
+	.name = "cpu_dma_latency",
+	.default_value = 2000 * USEC_PER_SEC,
+	.target_value = 2000 * USEC_PER_SEC,
+	.comparitor = min_compare
+};
+
+static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
+static struct pm_qos_object network_lat_pm_qos = {
+	.requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
+	.notifiers = &network_lat_notifier,
+	.name = "network_latency",
+	.default_value = 2000 * USEC_PER_SEC,
+	.target_value = 2000 * USEC_PER_SEC,
+	.comparitor = min_compare
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
+static struct pm_qos_object network_throughput_pm_qos = {
+	.requirements =
+		{LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
+	.notifiers = &network_throughput_notifier,
+	.name = "network_throughput",
+	.default_value = 0,
+	.target_value = 0,
+	.comparitor = max_compare
+};
+
+
+static struct pm_qos_object *pm_qos_array[] = {
+	&null_pm_qos,
+	&cpu_dma_pm_qos,
+	&network_lat_pm_qos,
+	&network_throughput_pm_qos
+};
+
+static DEFINE_SPINLOCK(pm_qos_lock);
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+		size_t count, loff_t *f_pos);
+static int pm_qos_power_open(struct inode *inode, struct file *filp);
+static int pm_qos_power_release(struct inode *inode, struct file *filp);
+
+static const struct file_operations pm_qos_power_fops = {
+	.write = pm_qos_power_write,
+	.open = pm_qos_power_open,
+	.release = pm_qos_power_release,
+};
+
+/* static helper functions */
+static s32 max_compare(s32 v1, s32 v2)
+{
+	return max(v1, v2);
+}
+
+static s32 min_compare(s32 v1, s32 v2)
+{
+	return min(v1, v2);
+}
+
+
+static void update_target(int target)
+{
+	s32 extreme_value;
+	struct requirement_list *node;
+	unsigned long flags;
+	int call_notifier = 0;
+
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	extreme_value = pm_qos_array[target]->default_value;
+	list_for_each_entry(node,
+			&pm_qos_array[target]->requirements.list, list) {
+		extreme_value = pm_qos_array[target]->comparitor(
+				extreme_value, node->value);
+	}
+	if (pm_qos_array[target]->target_value != extreme_value) {
+		call_notifier = 1;
+		pm_qos_array[target]->target_value = extreme_value;
+		pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
+			pm_qos_array[target]->target_value);
+	}
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+	if (call_notifier)
+		blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
+			(unsigned long) extreme_value, NULL);
+}
+
+static int register_pm_qos_misc(struct pm_qos_object *qos)
+{
+	qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+	qos->pm_qos_power_miscdev.name = qos->name;
+	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+	return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+	int pm_qos_class;
+
+	for (pm_qos_class = 0;
+		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+		if (minor ==
+			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+			return pm_qos_class;
+	}
+	return -1;
+}
+
+/**
+ * pm_qos_requirement - returns current system wide qos expectation
+ * @pm_qos_class: identification of which qos value is requested
+ *
+ * This function returns the current target value in an atomic manner.
+ */
+int pm_qos_requirement(int pm_qos_class)
+{
+	int ret_val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	ret_val = pm_qos_array[pm_qos_class]->target_value;
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+	return ret_val;
+}
+EXPORT_SYMBOL_GPL(pm_qos_requirement);
+
+/**
+ * pm_qos_add_requirement - inserts new qos request into the list
+ * @pm_qos_class: identifies which list of qos request to us
+ * @name: identifies the request
+ * @value: defines the qos request
+ *
+ * This function inserts a new entry in the pm_qos_class list of requested qos
+ * performance charactoistics.  It recomputes the agregate QoS expectations for
+ * the pm_qos_class of parrameters.
+ */
+int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
+{
+	struct requirement_list *dep;
+	unsigned long flags;
+
+	dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
+	if (dep) {
+		if (value == PM_QOS_DEFAULT_VALUE)
+			dep->value = pm_qos_array[pm_qos_class]->default_value;
+		else
+			dep->value = value;
+		dep->name = kstrdup(name, GFP_KERNEL);
+		if (!dep->name)
+			goto cleanup;
+
+		spin_lock_irqsave(&pm_qos_lock, flags);
+		list_add(&dep->list,
+			&pm_qos_array[pm_qos_class]->requirements.list);
+		spin_unlock_irqrestore(&pm_qos_lock, flags);
+		update_target(pm_qos_class);
+
+		return 0;
+	}
+
+cleanup:
+	kfree(dep);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
+
+/**
+ * pm_qos_update_requirement - modifies an existing qos request
+ * @pm_qos_class: identifies which list of qos request to us
+ * @name: identifies the request
+ * @value: defines the qos request
+ *
+ * Updates an existing qos requierement for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * If the named request isn't in the lest then no change is made.
+ */
+int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
+{
+	unsigned long flags;
+	struct requirement_list *node;
+	int pending_update = 0;
+
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	list_for_each_entry(node,
+		&pm_qos_array[pm_qos_class]->requirements.list, list) {
+		if (strcmp(node->name, name) == 0) {
+			if (new_value == PM_QOS_DEFAULT_VALUE)
+				node->value =
+				pm_qos_array[pm_qos_class]->default_value;
+			else
+				node->value = new_value;
+			pending_update = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+	if (pending_update)
+		update_target(pm_qos_class);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
+
+/**
+ * pm_qos_remove_requirement - modifies an existing qos request
+ * @pm_qos_class: identifies which list of qos request to us
+ * @name: identifies the request
+ *
+ * Will remove named qos request from pm_qos_class list of parrameters and
+ * recompute the current target value for the pm_qos_class.
+ */
+void pm_qos_remove_requirement(int pm_qos_class, char *name)
+{
+	unsigned long flags;
+	struct requirement_list *node;
+	int pending_update = 0;
+
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	list_for_each_entry(node,
+		&pm_qos_array[pm_qos_class]->requirements.list, list) {
+		if (strcmp(node->name, name) == 0) {
+			kfree(node->name);
+			list_del(&node->list);
+			kfree(node);
+			pending_update = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+	if (pending_update)
+		update_target(pm_qos_class);
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
+
+/**
+ * pm_qos_add_notifier - sets notification entry for changes to target value
+ * @pm_qos_class: identifies which qos target changes should be notified.
+ * @notifier: notifier block managed by caller.
+ *
+ * will register the notifier into a notification chain that gets called
+ * uppon changes to the pm_qos_class target value.
+ */
+ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+	int retval;
+
+	retval = blocking_notifier_chain_register(
+			pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
+
+/**
+ * pm_qos_remove_notifier - deletes notification entry from chain.
+ * @pm_qos_class: identifies which qos target changes are notified.
+ * @notifier: notifier block to be removed.
+ *
+ * will remove the notifier from the notification chain that gets called
+ * uppon changes to the pm_qos_class target value.
+ */
+int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+	int retval;
+
+	retval = blocking_notifier_chain_unregister(
+			pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
+
+#define PID_NAME_LEN sizeof("process_1234567890")
+static char name[PID_NAME_LEN];
+
+static int pm_qos_power_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	long pm_qos_class;
+
+	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
+	if (pm_qos_class >= 0) {
+		filp->private_data = (void *)pm_qos_class;
+		sprintf(name, "process_%d", current->pid);
+		ret = pm_qos_add_requirement(pm_qos_class, name,
+					PM_QOS_DEFAULT_VALUE);
+		if (ret >= 0)
+			return 0;
+	}
+
+	return -EPERM;
+}
+
+static int pm_qos_power_release(struct inode *inode, struct file *filp)
+{
+	int pm_qos_class;
+
+	pm_qos_class = (long)filp->private_data;
+	sprintf(name, "process_%d", current->pid);
+	pm_qos_remove_requirement(pm_qos_class, name);
+
+	return 0;
+}
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+		size_t count, loff_t *f_pos)
+{
+	s32 value;
+	int pm_qos_class;
+
+	pm_qos_class = (long)filp->private_data;
+	if (count != sizeof(s32))
+		return -EINVAL;
+	if (copy_from_user(&value, buf, sizeof(s32)))
+		return -EFAULT;
+	sprintf(name, "process_%d", current->pid);
+	pm_qos_update_requirement(pm_qos_class, name, value);
+
+	return  sizeof(s32);
+}
+
+
+static int __init pm_qos_power_init(void)
+{
+	int ret = 0;
+
+	ret = register_pm_qos_misc(&cpu_dma_pm_qos);
+	if (ret < 0) {
+		printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
+		return ret;
+	}
+	ret = register_pm_qos_misc(&network_lat_pm_qos);
+	if (ret < 0) {
+		printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
+		return ret;
+	}
+	ret = register_pm_qos_misc(&network_throughput_pm_qos);
+	if (ret < 0)
+		printk(KERN_ERR
+			"pm_qos_param: network_throughput setup failed\n");
+
+	return ret;
+}
+
+late_initcall(pm_qos_power_init);
-- 
cgit v1.1


From f011e2e2df3393c16b0fdc48e855e909b7e021ee Mon Sep 17 00:00:00 2001
From: Mark Gross <mgross@linux.intel.com>
Date: Mon, 4 Feb 2008 22:30:09 -0800
Subject: latency.c: use QoS infrastructure

Replace latency.c use with pm_qos_params use.

Signed-off-by: mark gross <mgross@linux.intel.com>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/processor_idle.c  |  18 +--
 drivers/net/wireless/ipw2100.c |  12 +-
 include/linux/latency.h        |  25 ----
 kernel/Makefile                |   2 +-
 kernel/latency.c               | 280 -----------------------------------------
 sound/core/pcm_native.c        |  11 +-
 6 files changed, 26 insertions(+), 322 deletions(-)
 delete mode 100644 include/linux/latency.h
 delete mode 100644 kernel/latency.c

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index eb1f82f..199ea21 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -38,7 +38,7 @@
 #include <linux/dmi.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>	/* need_resched() */
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 #include <linux/clockchips.h>
 #include <linux/cpuidle.h>
 
@@ -648,7 +648,8 @@ static void acpi_processor_idle(void)
 	if (cx->promotion.state &&
 	    ((cx->promotion.state - pr->power.states) <= max_cstate)) {
 		if (sleep_ticks > cx->promotion.threshold.ticks &&
-		  cx->promotion.state->latency <= system_latency_constraint()) {
+		  cx->promotion.state->latency <=
+				pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
 			cx->promotion.count++;
 			cx->demotion.count = 0;
 			if (cx->promotion.count >=
@@ -692,7 +693,8 @@ static void acpi_processor_idle(void)
 	 * or if the latency of the current state is unacceptable
 	 */
 	if ((pr->power.state - pr->power.states) > max_cstate ||
-		pr->power.state->latency > system_latency_constraint()) {
+		pr->power.state->latency >
+				pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
 		if (cx->demotion.state)
 			next_state = cx->demotion.state;
 	}
@@ -1200,7 +1202,7 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
 		   "maximum allowed latency: %d usec\n",
 		   pr->power.state ? pr->power.state - pr->power.states : 0,
 		   max_cstate, (unsigned)pr->power.bm_activity,
-		   system_latency_constraint());
+		   pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY));
 
 	seq_puts(seq, "states:\n");
 
@@ -1718,8 +1720,9 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
 			       "ACPI: processor limited to max C-state %d\n",
 			       max_cstate);
 		first_run++;
-#if !defined (CONFIG_CPU_IDLE) && defined (CONFIG_SMP)
-		register_latency_notifier(&acpi_processor_latency_notifier);
+#if !defined(CONFIG_CPU_IDLE) && defined(CONFIG_SMP)
+		pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY,
+				&acpi_processor_latency_notifier);
 #endif
 	}
 
@@ -1806,7 +1809,8 @@ int acpi_processor_power_exit(struct acpi_processor *pr,
 		 */
 		cpu_idle_wait();
 #ifdef CONFIG_SMP
-		unregister_latency_notifier(&acpi_processor_latency_notifier);
+		pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY,
+				&acpi_processor_latency_notifier);
 #endif
 	}
 #endif
diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 2ab107f..5bf9e00 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -162,7 +162,7 @@ that only one external action is invoked at a time.
 #include <linux/firmware.h>
 #include <linux/acpi.h>
 #include <linux/ctype.h>
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 
 #include "ipw2100.h"
 
@@ -1701,7 +1701,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
 	/* the ipw2100 hardware really doesn't want power management delays
 	 * longer than 175usec
 	 */
-	modify_acceptable_latency("ipw2100", 175);
+	pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 175);
 
 	/* If the interrupt is enabled, turn it off... */
 	spin_lock_irqsave(&priv->low_lock, flags);
@@ -1856,7 +1856,8 @@ static void ipw2100_down(struct ipw2100_priv *priv)
 	ipw2100_disable_interrupts(priv);
 	spin_unlock_irqrestore(&priv->low_lock, flags);
 
-	modify_acceptable_latency("ipw2100", INFINITE_LATENCY);
+	pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100",
+			PM_QOS_DEFAULT_VALUE);
 
 	/* We have to signal any supplicant if we are disassociating */
 	if (associated)
@@ -6554,7 +6555,8 @@ static int __init ipw2100_init(void)
 	if (ret)
 		goto out;
 
-	set_acceptable_latency("ipw2100", INFINITE_LATENCY);
+	pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100",
+			PM_QOS_DEFAULT_VALUE);
 #ifdef CONFIG_IPW2100_DEBUG
 	ipw2100_debug_level = debug;
 	ret = driver_create_file(&ipw2100_pci_driver.driver,
@@ -6576,7 +6578,7 @@ static void __exit ipw2100_exit(void)
 			   &driver_attr_debug_level);
 #endif
 	pci_unregister_driver(&ipw2100_pci_driver);
-	remove_acceptable_latency("ipw2100");
+	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100");
 }
 
 module_init(ipw2100_init);
diff --git a/include/linux/latency.h b/include/linux/latency.h
deleted file mode 100644
index c08b52b..0000000
--- a/include/linux/latency.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * latency.h: Explicit system-wide latency-expectation infrastructure
- *
- * (C) Copyright 2006 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- */
-
-#ifndef _INCLUDE_GUARD_LATENCY_H_
-#define _INCLUDE_GUARD_LATENCY_H_
-
-#include <linux/notifier.h>
-
-void set_acceptable_latency(char *identifier, int usecs);
-void modify_acceptable_latency(char *identifier, int usecs);
-void remove_acceptable_latency(char *identifier);
-void synchronize_acceptable_latency(void);
-int system_latency_constraint(void);
-
-int register_latency_notifier(struct notifier_block * nb);
-int unregister_latency_notifier(struct notifier_block * nb);
-
-#define INFINITE_LATENCY 1000000
-
-#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 8331243..135a1b9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
+	    hrtimer.o rwsem.o nsproxy.o srcu.o \
 	    utsname.o notifier.o ksysfs.o pm_qos_params.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
diff --git a/kernel/latency.c b/kernel/latency.c
deleted file mode 100644
index e63fcac..0000000
--- a/kernel/latency.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * latency.c: Explicit system-wide latency-expectation infrastructure
- *
- * The purpose of this infrastructure is to allow device drivers to set
- * latency constraint they have and to collect and summarize these
- * expectations globally. The cummulated result can then be used by
- * power management and similar users to make decisions that have
- * tradoffs with a latency component.
- *
- * An example user of this are the x86 C-states; each higher C state saves
- * more power, but has a higher exit latency. For the idle loop power
- * code to make a good decision which C-state to use, information about
- * acceptable latencies is required.
- *
- * An example announcer of latency is an audio driver that knowns it
- * will get an interrupt when the hardware has 200 usec of samples
- * left in the DMA buffer; in that case the driver can set a latency
- * constraint of, say, 150 usec.
- *
- * Multiple drivers can each announce their maximum accepted latency,
- * to keep these appart, a string based identifier is used.
- *
- *
- * (C) Copyright 2006 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/latency.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <asm/atomic.h>
-
-struct latency_info {
-	struct list_head list;
-	int usecs;
-	char *identifier;
-};
-
-/*
- * locking rule: all modifications to current_max_latency and
- * latency_list need to be done while holding the latency_lock.
- * latency_lock needs to be taken _irqsave.
- */
-static atomic_t current_max_latency;
-static DEFINE_SPINLOCK(latency_lock);
-
-static LIST_HEAD(latency_list);
-static BLOCKING_NOTIFIER_HEAD(latency_notifier);
-
-/*
- * This function returns the maximum latency allowed, which
- * happens to be the minimum of all maximum latencies on the
- * list.
- */
-static int __find_max_latency(void)
-{
-	int min = INFINITE_LATENCY;
-	struct latency_info *info;
-
-	list_for_each_entry(info, &latency_list, list) {
-		if (info->usecs < min)
-			min = info->usecs;
-	}
-	return min;
-}
-
-/**
- * set_acceptable_latency - sets the maximum latency acceptable
- * @identifier: string that identifies this driver
- * @usecs: maximum acceptable latency for this driver
- *
- * This function informs the kernel that this device(driver)
- * can accept at most usecs latency. This setting is used for
- * power management and similar tradeoffs.
- *
- * This function sleeps and can only be called from process
- * context.
- * Calling this function with an existing identifier is valid
- * and will cause the existing latency setting to be changed.
- */
-void set_acceptable_latency(char *identifier, int usecs)
-{
-	struct latency_info *info, *iter;
-	unsigned long flags;
-	int found_old = 0;
-
-	info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
-	if (!info)
-		return;
-	info->usecs = usecs;
-	info->identifier = kstrdup(identifier, GFP_KERNEL);
-	if (!info->identifier)
-		goto free_info;
-
-	spin_lock_irqsave(&latency_lock, flags);
-	list_for_each_entry(iter, &latency_list, list) {
-		if (strcmp(iter->identifier, identifier)==0) {
-			found_old = 1;
-			iter->usecs = usecs;
-			break;
-		}
-	}
-	if (!found_old)
-		list_add(&info->list, &latency_list);
-
-	if (usecs < atomic_read(&current_max_latency))
-		atomic_set(&current_max_latency, usecs);
-
-	spin_unlock_irqrestore(&latency_lock, flags);
-
-	blocking_notifier_call_chain(&latency_notifier,
-		atomic_read(&current_max_latency), NULL);
-
-	/*
-	 * if we inserted the new one, we're done; otherwise there was
-	 * an existing one so we need to free the redundant data
-	 */
-	if (!found_old)
-		return;
-
-	kfree(info->identifier);
-free_info:
-	kfree(info);
-}
-EXPORT_SYMBOL_GPL(set_acceptable_latency);
-
-/**
- * modify_acceptable_latency - changes the maximum latency acceptable
- * @identifier: string that identifies this driver
- * @usecs: maximum acceptable latency for this driver
- *
- * This function informs the kernel that this device(driver)
- * can accept at most usecs latency. This setting is used for
- * power management and similar tradeoffs.
- *
- * This function does not sleep and can be called in any context.
- * Trying to use a non-existing identifier silently gets ignored.
- *
- * Due to the atomic nature of this function, the modified latency
- * value will only be used for future decisions; past decisions
- * can still lead to longer latencies in the near future.
- */
-void modify_acceptable_latency(char *identifier, int usecs)
-{
-	struct latency_info *iter;
-	unsigned long flags;
-
-	spin_lock_irqsave(&latency_lock, flags);
-	list_for_each_entry(iter, &latency_list, list) {
-		if (strcmp(iter->identifier, identifier) == 0) {
-			iter->usecs = usecs;
-			break;
-		}
-	}
-	if (usecs < atomic_read(&current_max_latency))
-		atomic_set(&current_max_latency, usecs);
-	spin_unlock_irqrestore(&latency_lock, flags);
-}
-EXPORT_SYMBOL_GPL(modify_acceptable_latency);
-
-/**
- * remove_acceptable_latency - removes the maximum latency acceptable
- * @identifier: string that identifies this driver
- *
- * This function removes a previously set maximum latency setting
- * for the driver and frees up any resources associated with the
- * bookkeeping needed for this.
- *
- * This function does not sleep and can be called in any context.
- * Trying to use a non-existing identifier silently gets ignored.
- */
-void remove_acceptable_latency(char *identifier)
-{
-	unsigned long flags;
-	int newmax = 0;
-	struct latency_info *iter, *temp;
-
-	spin_lock_irqsave(&latency_lock, flags);
-
-	list_for_each_entry_safe(iter,  temp, &latency_list, list) {
-		if (strcmp(iter->identifier, identifier) == 0) {
-			list_del(&iter->list);
-			newmax = iter->usecs;
-			kfree(iter->identifier);
-			kfree(iter);
-			break;
-		}
-	}
-
-	/* If we just deleted the system wide value, we need to
-	 * recalculate with a full search
-	 */
-	if (newmax == atomic_read(&current_max_latency)) {
-		newmax = __find_max_latency();
-		atomic_set(&current_max_latency, newmax);
-	}
-	spin_unlock_irqrestore(&latency_lock, flags);
-}
-EXPORT_SYMBOL_GPL(remove_acceptable_latency);
-
-/**
- * system_latency_constraint - queries the system wide latency maximum
- *
- * This function returns the system wide maximum latency in
- * microseconds.
- *
- * This function does not sleep and can be called in any context.
- */
-int system_latency_constraint(void)
-{
-	return atomic_read(&current_max_latency);
-}
-EXPORT_SYMBOL_GPL(system_latency_constraint);
-
-/**
- * synchronize_acceptable_latency - recalculates all latency decisions
- *
- * This function will cause a callback to various kernel pieces that
- * will make those pieces rethink their latency decisions. This implies
- * that if there are overlong latencies in hardware state already, those
- * latencies get taken right now. When this call completes no overlong
- * latency decisions should be active anymore.
- *
- * Typical usecase of this is after a modify_acceptable_latency() call,
- * which in itself is non-blocking and non-synchronizing.
- *
- * This function blocks and should not be called with locks held.
- */
-
-void synchronize_acceptable_latency(void)
-{
-	blocking_notifier_call_chain(&latency_notifier,
-		atomic_read(&current_max_latency), NULL);
-}
-EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
-
-/*
- * Latency notifier: this notifier gets called when a non-atomic new
- * latency value gets set. The expectation nof the caller of the
- * non-atomic set is that when the call returns, future latencies
- * are within bounds, so the functions on the notifier list are
- * expected to take the overlong latencies immediately, inside the
- * callback, and not make a overlong latency decision anymore.
- *
- * The callback gets called when the new latency value is made
- * active so system_latency_constraint() returns the new latency.
- */
-int register_latency_notifier(struct notifier_block * nb)
-{
-	return blocking_notifier_chain_register(&latency_notifier, nb);
-}
-EXPORT_SYMBOL_GPL(register_latency_notifier);
-
-int unregister_latency_notifier(struct notifier_block * nb)
-{
-	return blocking_notifier_chain_unregister(&latency_notifier, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_latency_notifier);
-
-static __init int latency_init(void)
-{
-	atomic_set(&current_max_latency, INFINITE_LATENCY);
-	/*
-	 * we don't want by default to have longer latencies than 2 ticks,
-	 * since that would cause lost ticks
-	 */
-	set_acceptable_latency("kernel", 2*1000000/HZ);
-	return 0;
-}
-
-module_init(latency_init);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 6244911..61f5d42 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -23,7 +23,7 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/latency.h>
+#include <linux/pm_qos_params.h>
 #include <linux/uio.h>
 #include <sound/core.h>
 #include <sound/control.h>
@@ -443,9 +443,11 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
 	snd_pcm_timer_resolution_change(substream);
 	runtime->status->state = SNDRV_PCM_STATE_SETUP;
 
-	remove_acceptable_latency(substream->latency_id);
+	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY,
+				substream->latency_id);
 	if ((usecs = period_to_usecs(runtime)) >= 0)
-		set_acceptable_latency(substream->latency_id, usecs);
+		pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY,
+					substream->latency_id, usecs);
 	return 0;
  _error:
 	/* hardware might be unuseable from this time,
@@ -505,7 +507,8 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
 	if (substream->ops->hw_free)
 		result = substream->ops->hw_free(substream);
 	runtime->status->state = SNDRV_PCM_STATE_OPEN;
-	remove_acceptable_latency(substream->latency_id);
+	pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY,
+		substream->latency_id);
 	return result;
 }
 
-- 
cgit v1.1


From 533354d4ac48b7df99f18f952e5d51c3f59ba56c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 4 Feb 2008 22:30:11 -0800
Subject: Misc: Add possibility to remove misc devices during suspend/resume

Make it possible to unregister a misc device object in a safe way during a
suspend/resume cycle.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <greg@kroah.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/misc.c        | 13 +++++++++----
 include/linux/miscdevice.h | 10 +++++++++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 71c8cd7..a39101f 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -232,8 +232,9 @@ int misc_register(struct miscdevice * misc)
 }
 
 /**
- *	misc_deregister - unregister a miscellaneous device
+ *	__misc_deregister - unregister a miscellaneous device
  *	@misc: device to unregister
+ *	@suspended: to be set if the function is used during suspend/resume
  *
  *	Unregister a miscellaneous device that was previously
  *	successfully registered with misc_register(). Success
@@ -241,7 +242,7 @@ int misc_register(struct miscdevice * misc)
  *	indicates an error.
  */
 
-int misc_deregister(struct miscdevice * misc)
+int __misc_deregister(struct miscdevice *misc, bool suspended)
 {
 	int i = misc->minor;
 
@@ -250,7 +251,11 @@ int misc_deregister(struct miscdevice * misc)
 
 	mutex_lock(&misc_mtx);
 	list_del(&misc->list);
-	device_destroy(misc_class, MKDEV(MISC_MAJOR, misc->minor));
+	if (suspended)
+		destroy_suspended_device(misc_class,
+					MKDEV(MISC_MAJOR, misc->minor));
+	else
+		device_destroy(misc_class, MKDEV(MISC_MAJOR, misc->minor));
 	if (i < DYNAMIC_MINORS && i>0) {
 		misc_minors[i>>3] &= ~(1 << (misc->minor & 7));
 	}
@@ -259,7 +264,7 @@ int misc_deregister(struct miscdevice * misc)
 }
 
 EXPORT_SYMBOL(misc_register);
-EXPORT_SYMBOL(misc_deregister);
+EXPORT_SYMBOL(__misc_deregister);
 
 static int __init misc_init(void)
 {
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index dff9ea3..24b30b9 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -43,7 +43,15 @@ struct miscdevice  {
 };
 
 extern int misc_register(struct miscdevice * misc);
-extern int misc_deregister(struct miscdevice * misc);
+extern int __misc_deregister(struct miscdevice *misc, bool suspended);
+static inline int misc_deregister(struct miscdevice *misc)
+{
+	return __misc_deregister(misc, false);
+}
+static inline int misc_deregister_suspended(struct miscdevice *misc)
+{
+	return __misc_deregister(misc, true);
+}
 
 #define MODULE_ALIAS_MISCDEV(minor)				\
 	MODULE_ALIAS("char-major-" __stringify(MISC_MAJOR)	\
-- 
cgit v1.1


From a41e3dc4060cca2599afa14fbd4c745763746ba8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 4 Feb 2008 22:30:13 -0800
Subject: HWRNG: add possibility to remove hwrng devices during suspend/resume

Make it possible to unregister a Hardware Random Number Generator
device object in a safe way during a suspend/resume cycle.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Michael Buesch <mb@bu3sch.de>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <greg@kroah.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/hw_random/core.c | 10 +++++-----
 include/linux/hw_random.h     | 10 +++++++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 0118b98..84cdf90 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -234,11 +234,11 @@ static DEVICE_ATTR(rng_available, S_IRUGO,
 		   NULL);
 
 
-static void unregister_miscdev(void)
+static void unregister_miscdev(bool suspended)
 {
 	device_remove_file(rng_miscdev.this_device, &dev_attr_rng_available);
 	device_remove_file(rng_miscdev.this_device, &dev_attr_rng_current);
-	misc_deregister(&rng_miscdev);
+	__misc_deregister(&rng_miscdev, suspended);
 }
 
 static int register_miscdev(void)
@@ -313,7 +313,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(hwrng_register);
 
-void hwrng_unregister(struct hwrng *rng)
+void __hwrng_unregister(struct hwrng *rng, bool suspended)
 {
 	int err;
 
@@ -332,11 +332,11 @@ void hwrng_unregister(struct hwrng *rng)
 		}
 	}
 	if (list_empty(&rng_list))
-		unregister_miscdev();
+		unregister_miscdev(suspended);
 
 	mutex_unlock(&rng_mutex);
 }
-EXPORT_SYMBOL_GPL(hwrng_unregister);
+EXPORT_SYMBOL_GPL(__hwrng_unregister);
 
 
 MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver");
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 85d1191..4213182 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -44,7 +44,15 @@ struct hwrng {
 /** Register a new Hardware Random Number Generator driver. */
 extern int hwrng_register(struct hwrng *rng);
 /** Unregister a Hardware Random Number Generator driver. */
-extern void hwrng_unregister(struct hwrng *rng);
+extern void __hwrng_unregister(struct hwrng *rng, bool suspended);
+static inline void hwrng_unregister(struct hwrng *rng)
+{
+	__hwrng_unregister(rng, false);
+}
+static inline void hwrng_unregister_suspended(struct hwrng *rng)
+{
+	__hwrng_unregister(rng, true);
+}
 
 #endif /* __KERNEL__ */
 #endif /* LINUX_HWRANDOM_H_ */
-- 
cgit v1.1


From fa23f5cce8cda2095013afc837ccf74b352f9f7b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 4 Feb 2008 22:30:14 -0800
Subject: leds: add possibility to remove leds classdevs during suspend/resume

Make it possible to unregister a led classdev object in a safe way during a
suspend/resume cycle.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <greg@kroah.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/leds/led-class.c | 13 +++++++++----
 include/linux/leds.h     | 10 +++++++++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 64c66b3..4a93878 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -137,12 +137,14 @@ err_out:
 EXPORT_SYMBOL_GPL(led_classdev_register);
 
 /**
- * led_classdev_unregister - unregisters a object of led_properties class.
+ * __led_classdev_unregister - unregisters a object of led_properties class.
  * @led_cdev: the led device to unregister
+ * @suspended: indicates whether system-wide suspend or resume is in progress
  *
  * Unregisters a previously registered via led_classdev_register object.
  */
-void led_classdev_unregister(struct led_classdev *led_cdev)
+void __led_classdev_unregister(struct led_classdev *led_cdev,
+				      bool suspended)
 {
 	device_remove_file(led_cdev->dev, &dev_attr_brightness);
 #ifdef CONFIG_LEDS_TRIGGERS
@@ -153,13 +155,16 @@ void led_classdev_unregister(struct led_classdev *led_cdev)
 	up_write(&led_cdev->trigger_lock);
 #endif
 
-	device_unregister(led_cdev->dev);
+	if (suspended)
+		device_pm_schedule_removal(led_cdev->dev);
+	else
+		device_unregister(led_cdev->dev);
 
 	down_write(&leds_list_lock);
 	list_del(&led_cdev->node);
 	up_write(&leds_list_lock);
 }
-EXPORT_SYMBOL_GPL(led_classdev_unregister);
+EXPORT_SYMBOL_GPL(__led_classdev_unregister);
 
 static int __init leds_init(void)
 {
diff --git a/include/linux/leds.h b/include/linux/leds.h
index b4130ff..00f89fd 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -54,7 +54,15 @@ struct led_classdev {
 
 extern int led_classdev_register(struct device *parent,
 				 struct led_classdev *led_cdev);
-extern void led_classdev_unregister(struct led_classdev *led_cdev);
+extern void __led_classdev_unregister(struct led_classdev *led_cdev, bool sus);
+static inline void led_classdev_unregister(struct led_classdev *lcd)
+{
+	__led_classdev_unregister(lcd, false);
+}
+static inline void led_classdev_unregister_suspended(struct led_classdev *lcd)
+{
+	__led_classdev_unregister(lcd, true);
+}
 extern void led_classdev_suspend(struct led_classdev *led_cdev);
 extern void led_classdev_resume(struct led_classdev *led_cdev);
 
-- 
cgit v1.1


From 3506e0c49a5ceba72c0405d1a470184c2d6705f7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 4 Feb 2008 22:30:15 -0800
Subject: b43: avoid unregistering device objects during suspend

Modify the b43 driver to avoid deadlocking suspend and resume, which happens
as a result of attempting to unregister device objects locked by the PM core
during suspend/resume cycles.  Also, make it use a suspend-safe method of
unregistering device object in the resume error path.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Michael Buesch <mb@bu3sch.de>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <greg@kroah.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/wireless/b43/b43.h  |  1 +
 drivers/net/wireless/b43/leds.c |  5 ++++-
 drivers/net/wireless/b43/main.c | 25 ++++++++++++++++---------
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h
index 32a24f5..08a011f 100644
--- a/drivers/net/wireless/b43/b43.h
+++ b/drivers/net/wireless/b43/b43.h
@@ -724,6 +724,7 @@ struct b43_wldev {
 	bool short_preamble;	/* TRUE, if short preamble is enabled. */
 	bool short_slot;	/* TRUE, if short slot timing is enabled. */
 	bool radio_hw_enable;	/* saved state of radio hardware enabled state */
+	bool suspend_in_progress;	/* TRUE, if we are in a suspend/resume cycle */
 
 	/* PHY/Radio device. */
 	struct b43_phy phy;
diff --git a/drivers/net/wireless/b43/leds.c b/drivers/net/wireless/b43/leds.c
index 4b590d8..0908335 100644
--- a/drivers/net/wireless/b43/leds.c
+++ b/drivers/net/wireless/b43/leds.c
@@ -116,7 +116,10 @@ static void b43_unregister_led(struct b43_led *led)
 {
 	if (!led->dev)
 		return;
-	led_classdev_unregister(&led->led_dev);
+	if (led->dev->suspend_in_progress)
+		led_classdev_unregister_suspended(&led->led_dev);
+	else
+		led_classdev_unregister(&led->led_dev);
 	b43_led_turn_off(led->dev, led->index, led->activelow);
 	led->dev = NULL;
 }
diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c
index 6faa57a..ef65c41 100644
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -2555,10 +2555,10 @@ static int b43_rng_read(struct hwrng *rng, u32 * data)
 	return (sizeof(u16));
 }
 
-static void b43_rng_exit(struct b43_wl *wl)
+static void b43_rng_exit(struct b43_wl *wl, bool suspended)
 {
 	if (wl->rng_initialized)
-		hwrng_unregister(&wl->rng);
+		__hwrng_unregister(&wl->rng, suspended);
 }
 
 static int b43_rng_init(struct b43_wl *wl)
@@ -3418,8 +3418,10 @@ static void b43_wireless_core_exit(struct b43_wldev *dev)
 	macctl |= B43_MACCTL_PSM_JMP0;
 	b43_write32(dev, B43_MMIO_MACCTL, macctl);
 
-	b43_leds_exit(dev);
-	b43_rng_exit(dev->wl);
+	if (!dev->suspend_in_progress) {
+		b43_leds_exit(dev);
+		b43_rng_exit(dev->wl, false);
+	}
 	b43_dma_free(dev);
 	b43_chip_exit(dev);
 	b43_radio_turn_off(dev, 1);
@@ -3535,11 +3537,13 @@ static int b43_wireless_core_init(struct b43_wldev *dev)
 	ssb_bus_powerup(bus, 1);	/* Enable dynamic PCTL */
 	b43_upload_card_macaddress(dev);
 	b43_security_init(dev);
-	b43_rng_init(wl);
+	if (!dev->suspend_in_progress)
+		b43_rng_init(wl);
 
 	b43_set_status(dev, B43_STAT_INITIALIZED);
 
-	b43_leds_init(dev);
+	if (!dev->suspend_in_progress)
+		b43_leds_init(dev);
 out:
 	return err;
 
@@ -4136,6 +4140,7 @@ static int b43_suspend(struct ssb_device *dev, pm_message_t state)
 	b43dbg(wl, "Suspending...\n");
 
 	mutex_lock(&wl->mutex);
+	wldev->suspend_in_progress = true;
 	wldev->suspend_init_status = b43_status(wldev);
 	if (wldev->suspend_init_status >= B43_STAT_STARTED)
 		b43_wireless_core_stop(wldev);
@@ -4167,15 +4172,17 @@ static int b43_resume(struct ssb_device *dev)
 	if (wldev->suspend_init_status >= B43_STAT_STARTED) {
 		err = b43_wireless_core_start(wldev);
 		if (err) {
+			b43_leds_exit(wldev);
+			b43_rng_exit(wldev->wl, true);
 			b43_wireless_core_exit(wldev);
 			b43err(wl, "Resume failed at core start\n");
 			goto out;
 		}
 	}
-	mutex_unlock(&wl->mutex);
-
 	b43dbg(wl, "Device resumed.\n");
-      out:
+ out:
+	wldev->suspend_in_progress = false;
+	mutex_unlock(&wl->mutex);
 	return err;
 }
 
-- 
cgit v1.1


From 6f7127db311fe3e0ea6a2a332833f85abab6cb4b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 4 Feb 2008 22:30:17 -0800
Subject: m68k: Use cc-cross-prefix

The cc-cross-prefix is new and developed on request from Geert Uytterhoeven.

With cc-cross-prefix it is now much easier to have a few default cross compile
prefixes and defaulting to none - if none of them were present.  ARCH
maintainers are expected to pick up this feature soon.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Andreas Schwab <schwab@suse.de>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/Makefile | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 4a1bd44..2cba605 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -13,16 +13,15 @@
 # Copyright (C) 1994 by Hamish Macdonald
 #
 
-# test for cross compiling
-COMPILE_ARCH = $(shell uname -m)
-
 # override top level makefile
 AS += -m68020
 LDFLAGS := -m m68kelf
 LDFLAGS_MODULE += -T $(srctree)/arch/m68k/kernel/module.lds
-ifneq ($(COMPILE_ARCH),$(ARCH))
-	# prefix for cross-compiling binaries
-	CROSS_COMPILE = m68k-linux-gnu-
+ifneq ($(SUBARCH),$(ARCH))
+	ifeq ($(CROSS_COMPILE),)
+		CROSS_COMPILE := $(call cc-cross-prefix, \
+			m68k-linux-gnu- m68k-linux- m68k-unknown-linux-gnu-)
+	endif
 endif
 
 ifdef CONFIG_SUN3
-- 
cgit v1.1


From b9e5e119fd9905c2a06f73a96f9c18aa760a8a65 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <alex@flawedcode.org>
Date: Mon, 4 Feb 2008 22:30:17 -0800
Subject: m68k: ARRAY_SIZE() cleanup

Signed-off-by: Alejandro Martinez Ruiz <alex@flawedcode.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/amiga/amisound.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c
index 1f5bfb5..92a92fb 100644
--- a/arch/m68k/amiga/amisound.c
+++ b/arch/m68k/amiga/amisound.c
@@ -21,7 +21,7 @@ static const signed char sine_data[] = {
 	0,  39,  75,  103,  121,  127,  121,  103,  75,  39,
 	0, -39, -75, -103, -121, -127, -121, -103, -75, -39
 };
-#define DATA_SIZE	(sizeof(sine_data)/sizeof(sine_data[0]))
+#define DATA_SIZE	ARRAY_SIZE(sine_data)
 
 #define custom amiga_custom
 
-- 
cgit v1.1


From 0a8320b04c0762a1c65b5800b84023b454c2da54 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <alex@flawedcode.org>
Date: Mon, 4 Feb 2008 22:30:19 -0800
Subject: dio: ARRAY_SIZE() cleanup

[Geert: eliminate NUMNAMES, as suggested by Richard Knutsson ]
[akpm@linux-foundation.org: coding-syle fixes]
Signed-off-by: Alejandro Martinez Ruiz <alex@flawedcode.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Richard Knutsson <ricknu-0@student.ltu.se>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dio/dio.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c
index 17502d6..07f274f 100644
--- a/drivers/dio/dio.c
+++ b/drivers/dio/dio.c
@@ -88,8 +88,6 @@ static struct dioname names[] =
 #undef DIONAME
 #undef DIOFBNAME
 
-#define NUMNAMES (sizeof(names) / sizeof(struct dioname))
-
 static const char *unknowndioname 
         = "unknown DIO board -- please email <linux-m68k@lists.linux-m68k.org>!";
 
@@ -97,7 +95,7 @@ static const char *dio_getname(int id)
 {
         /* return pointer to a constant string describing the board with given ID */
 	unsigned int i;
-        for (i = 0; i < NUMNAMES; i++)
+	for (i = 0; i < ARRAY_SIZE(names); i++)
                 if (names[i].id == id) 
                         return names[i].name;
 
-- 
cgit v1.1


From 96762379915f0c46bfac566038e66dee3dd0e7b5 Mon Sep 17 00:00:00 2001
From: Roel Kluin <12o3l@tiscali.nl>
Date: Mon, 4 Feb 2008 22:30:22 -0800
Subject: m68k: Balance ioremap and iounmap in m68k/atari/hades-pci.c

Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/atari/hades-pci.c | 54 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/m68k/atari/hades-pci.c b/arch/m68k/atari/hades-pci.c
index bee2b14..2bbabc0 100644
--- a/arch/m68k/atari/hades-pci.c
+++ b/arch/m68k/atari/hades-pci.c
@@ -376,8 +376,8 @@ struct pci_bus_info * __init init_hades_pci(void)
 	 */
 
 	bus = kzalloc(sizeof(struct pci_bus_info), GFP_KERNEL);
-	if (!bus)
-		return NULL;
+	if (unlikely(!bus))
+		goto iounmap_base_virt;
 
 	/*
 	 * Claim resources. The m68k has no separate I/O space, both
@@ -385,43 +385,25 @@ struct pci_bus_info * __init init_hades_pci(void)
 	 * the I/O resources are requested in memory space as well.
 	 */
 
-	if (request_resource(&iomem_resource, &config_space) != 0)
-	{
-		kfree(bus);
-		return NULL;
-	}
+	if (unlikely(request_resource(&iomem_resource, &config_space) != 0))
+		goto free_bus;
 
-	if (request_resource(&iomem_resource, &io_space) != 0)
-	{
-		release_resource(&config_space);
-		kfree(bus);
-		return NULL;
-	}
+	if (unlikely(request_resource(&iomem_resource, &io_space) != 0))
+		goto release_config_space;
 
 	bus->mem_space.start = HADES_MEM_BASE;
 	bus->mem_space.end = HADES_MEM_BASE + HADES_MEM_SIZE - 1;
 	bus->mem_space.name = pci_mem_name;
 #if 1
-	if (request_resource(&iomem_resource, &bus->mem_space) != 0)
-	{
-		release_resource(&io_space);
-		release_resource(&config_space);
-		kfree(bus);
-		return NULL;
-	}
+	if (unlikely(request_resource(&iomem_resource, &bus->mem_space) != 0))
+		goto release_io_space;
 #endif
 	bus->io_space.start = pci_io_base_virt;
 	bus->io_space.end = pci_io_base_virt + HADES_VIRT_IO_SIZE - 1;
 	bus->io_space.name = pci_io_name;
 #if 1
-	if (request_resource(&ioport_resource, &bus->io_space) != 0)
-	{
-		release_resource(&bus->mem_space);
-		release_resource(&io_space);
-		release_resource(&config_space);
-		kfree(bus);
-		return NULL;
-	}
+	if (unlikely(request_resource(&ioport_resource, &bus->io_space) != 0))
+		goto release_bus_mem_space;
 #endif
 	/*
 	 * Set hardware dependent functions.
@@ -438,5 +420,21 @@ struct pci_bus_info * __init init_hades_pci(void)
 	tt_mfp.active_edge &= ~0x27;
 
 	return bus;
+
+release_bus_mem_space:
+	release_resource(&bus->mem_space);
+release_io_space:
+	release_resource(&io_space);
+release_config_space:
+	release_resource(&config_space);
+free_bus:
+	kfree(bus);
+iounmap_base_virt:
+	iounmap((void *)pci_io_base_virt);
+
+	for (i = 0; i < N_SLOTS; i++)
+		iounmap((void *)pci_conf_base_virt[i]);
+
+	return NULL;
 }
 #endif
-- 
cgit v1.1


From 99ffab81071b7088ddebd4be9bbf1ad03c2a9e98 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:23 -0800
Subject: nubus: kill drivers/nubus/nubus_syms.c

nubus: kill drivers/nubus/nubus_syms.c

EXPORT_SYMBOL's belong to the actual code.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/nubus/Makefile     |  1 -
 drivers/nubus/nubus.c      | 13 +++++++++++++
 drivers/nubus/nubus_syms.c | 28 ----------------------------
 drivers/nubus/proc.c       |  4 ++++
 4 files changed, 17 insertions(+), 29 deletions(-)
 delete mode 100644 drivers/nubus/nubus_syms.c

diff --git a/drivers/nubus/Makefile b/drivers/nubus/Makefile
index f5ef03c..21bda20 100644
--- a/drivers/nubus/Makefile
+++ b/drivers/nubus/Makefile
@@ -4,5 +4,4 @@
 
 obj-y   := nubus.o
 
-obj-$(CONFIG_MODULES) += nubus_syms.o 
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c
index f4076ae..2f047e5 100644
--- a/drivers/nubus/nubus.c
+++ b/drivers/nubus/nubus.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <asm/setup.h>
 #include <asm/system.h>
 #include <asm/page.h>
@@ -186,6 +187,7 @@ void nubus_get_rsrc_mem(void *dest, const struct nubus_dirent* dirent,
 		len--;
 	}
 }
+EXPORT_SYMBOL(nubus_get_rsrc_mem);
 
 void nubus_get_rsrc_str(void *dest, const struct nubus_dirent* dirent,
 			int len)
@@ -200,6 +202,7 @@ void nubus_get_rsrc_str(void *dest, const struct nubus_dirent* dirent,
 		len--;
 	}
 }
+EXPORT_SYMBOL(nubus_get_rsrc_str);
 
 int nubus_get_root_dir(const struct nubus_board* board,
 		       struct nubus_dir* dir)
@@ -209,6 +212,7 @@ int nubus_get_root_dir(const struct nubus_board* board,
 	dir->mask = board->lanes;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_get_root_dir);
 
 /* This is a slyly renamed version of the above */
 int nubus_get_func_dir(const struct nubus_dev* dev,
@@ -219,6 +223,7 @@ int nubus_get_func_dir(const struct nubus_dev* dev,
 	dir->mask = dev->board->lanes;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_get_func_dir);
 
 int nubus_get_board_dir(const struct nubus_board* board,
 			struct nubus_dir* dir)
@@ -237,6 +242,7 @@ int nubus_get_board_dir(const struct nubus_board* board,
 		return -1;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_get_board_dir);
 
 int nubus_get_subdir(const struct nubus_dirent *ent,
 		     struct nubus_dir *dir)
@@ -246,6 +252,7 @@ int nubus_get_subdir(const struct nubus_dirent *ent,
 	dir->mask = ent->mask;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_get_subdir);
 
 int nubus_readdir(struct nubus_dir *nd, struct nubus_dirent *ent)
 {
@@ -274,12 +281,14 @@ int nubus_readdir(struct nubus_dir *nd, struct nubus_dirent *ent)
 	ent->mask  = nd->mask;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_readdir);
 
 int nubus_rewinddir(struct nubus_dir* dir)
 {
 	dir->ptr = dir->base;
 	return 0;
 }
+EXPORT_SYMBOL(nubus_rewinddir);
 
 /* Driver interface functions, more or less like in pci.c */
 
@@ -303,6 +312,7 @@ nubus_find_device(unsigned short category,
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(nubus_find_device);
 
 struct nubus_dev*
 nubus_find_type(unsigned short category,
@@ -320,6 +330,7 @@ nubus_find_type(unsigned short category,
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(nubus_find_type);
 
 struct nubus_dev*
 nubus_find_slot(unsigned int slot,
@@ -335,6 +346,7 @@ nubus_find_slot(unsigned int slot,
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(nubus_find_slot);
 
 int
 nubus_find_rsrc(struct nubus_dir* dir, unsigned char rsrc_type,
@@ -346,6 +358,7 @@ nubus_find_rsrc(struct nubus_dir* dir, unsigned char rsrc_type,
 	}	
 	return -1;
 }
+EXPORT_SYMBOL(nubus_find_rsrc);
 
 /* Initialization functions - decide which slots contain stuff worth
    looking at, and print out lots and lots of information from the
diff --git a/drivers/nubus/nubus_syms.c b/drivers/nubus/nubus_syms.c
deleted file mode 100644
index 9204f04..0000000
--- a/drivers/nubus/nubus_syms.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Exported symbols for NuBus services
-
-   (c) 1999 David Huggins-Daines <dhd@debian.org> */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/nubus.h>
-
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(nubus_proc_attach_device);
-EXPORT_SYMBOL(nubus_proc_detach_device);
-#endif
-
-MODULE_LICENSE("GPL");
-
-EXPORT_SYMBOL(nubus_find_device);
-EXPORT_SYMBOL(nubus_find_type);
-EXPORT_SYMBOL(nubus_find_slot);
-EXPORT_SYMBOL(nubus_get_root_dir);
-EXPORT_SYMBOL(nubus_get_board_dir);
-EXPORT_SYMBOL(nubus_get_func_dir);
-EXPORT_SYMBOL(nubus_readdir);
-EXPORT_SYMBOL(nubus_find_rsrc);
-EXPORT_SYMBOL(nubus_rewinddir);
-EXPORT_SYMBOL(nubus_get_subdir);
-EXPORT_SYMBOL(nubus_get_rsrc_mem);
-EXPORT_SYMBOL(nubus_get_rsrc_str);
-
diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c
index 5271a4a..e07492b 100644
--- a/drivers/nubus/proc.c
+++ b/drivers/nubus/proc.c
@@ -22,6 +22,8 @@
 #include <linux/nubus.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
+#include <linux/module.h>
+
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 
@@ -140,6 +142,7 @@ int nubus_proc_attach_device(struct nubus_dev *dev)
 
 	return 0;
 }
+EXPORT_SYMBOL(nubus_proc_attach_device);
 
 /* FIXME: this is certainly broken! */
 int nubus_proc_detach_device(struct nubus_dev *dev)
@@ -154,6 +157,7 @@ int nubus_proc_detach_device(struct nubus_dev *dev)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(nubus_proc_detach_device);
 
 void __init proc_bus_nubus_add_devices(void)
 {
-- 
cgit v1.1


From deea7775e2ae460aace0bf7b5bb2ff3b412017f1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:24 -0800
Subject: m68k: kill arch/m68k/mac/mac_ksyms.c

EXPORT_SYMBOL's belong to the actual code.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/mac/Makefile    | 2 +-
 arch/m68k/mac/mac_ksyms.c | 8 --------
 arch/m68k/mac/via.c       | 5 ++++-
 3 files changed, 5 insertions(+), 10 deletions(-)
 delete mode 100644 arch/m68k/mac/mac_ksyms.c

diff --git a/arch/m68k/mac/Makefile b/arch/m68k/mac/Makefile
index 995a09d9..1d265ba 100644
--- a/arch/m68k/mac/Makefile
+++ b/arch/m68k/mac/Makefile
@@ -3,4 +3,4 @@
 #
 
 obj-y		:= config.o bootparse.o macints.o iop.o via.o oss.o psc.o \
-			baboon.o macboing.o debug.o misc.o mac_ksyms.o
+			baboon.o macboing.o debug.o misc.o
diff --git a/arch/m68k/mac/mac_ksyms.c b/arch/m68k/mac/mac_ksyms.c
deleted file mode 100644
index 6e37ceb..0000000
--- a/arch/m68k/mac/mac_ksyms.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <linux/module.h>
-#include <asm/ptrace.h>
-#include <asm/traps.h>
-
-/* Says whether we're using A/UX interrupts or not */
-extern int via_alt_mapping;
-
-EXPORT_SYMBOL(via_alt_mapping);
diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c
index 8df270e..fa485df 100644
--- a/arch/m68k/mac/via.c
+++ b/arch/m68k/mac/via.c
@@ -28,6 +28,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/ide.h>
+#include <linux/module.h>
 
 #include <asm/bootinfo.h>
 #include <asm/macintosh.h>
@@ -41,7 +42,9 @@ volatile __u8 *via1, *via2;
 /* See note in mac_via.h about how this is possibly not useful */
 volatile long *via_memory_bogon=(long *)&via_memory_bogon;
 #endif
-int rbv_present, via_alt_mapping;
+int rbv_present;
+int via_alt_mapping;
+EXPORT_SYMBOL(via_alt_mapping);
 __u8 rbv_clear;
 
 /*
-- 
cgit v1.1


From 3456fef5d988e9014cd318a3ae7991c9a03c8cf2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:25 -0800
Subject: m68k: kill arch/m68k/hp300/ksyms.c

It was empty.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/hp300/Makefile | 2 +-
 arch/m68k/hp300/ksyms.c  | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)
 delete mode 100644 arch/m68k/hp300/ksyms.c

diff --git a/arch/m68k/hp300/Makefile b/arch/m68k/hp300/Makefile
index 288b9c6..96d4244 100644
--- a/arch/m68k/hp300/Makefile
+++ b/arch/m68k/hp300/Makefile
@@ -2,4 +2,4 @@
 # Makefile for Linux arch/m68k/hp300 source directory
 #
 
-obj-y		:= ksyms.o config.o time.o reboot.o
+obj-y		:= config.o time.o reboot.o
diff --git a/arch/m68k/hp300/ksyms.c b/arch/m68k/hp300/ksyms.c
deleted file mode 100644
index 8202830..0000000
--- a/arch/m68k/hp300/ksyms.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- *  linux/arch/m68k/hp300/ksyms.c
- *
- *  Copyright (C) 1998 Philip Blundell <philb@gnu.org>
- *
- *  This file contains the HP300-specific kernel symbols.  None yet. :-)
- */
-
-#include <linux/module.h>
-- 
cgit v1.1


From 8b169fa2c942bc2a579da3f33986bd3fc48d9684 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:25 -0800
Subject: m68k: kill arch/m68k/amiga/amiga_ksyms.c

EXPORT_SYMBOL's belong to the actual code.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/amiga/Makefile      |  2 +-
 arch/m68k/amiga/amiga_ksyms.c | 33 ---------------------------------
 arch/m68k/amiga/amisound.c    |  3 +++
 arch/m68k/amiga/chipram.c     |  7 +++++++
 arch/m68k/amiga/config.c      | 12 ++++++++++++
 arch/m68k/amiga/pcmcia.c      |  9 +++++++++
 6 files changed, 32 insertions(+), 34 deletions(-)
 delete mode 100644 arch/m68k/amiga/amiga_ksyms.c

diff --git a/arch/m68k/amiga/Makefile b/arch/m68k/amiga/Makefile
index 8b41565..6a0d765 100644
--- a/arch/m68k/amiga/Makefile
+++ b/arch/m68k/amiga/Makefile
@@ -2,6 +2,6 @@
 # Makefile for Linux arch/m68k/amiga source directory
 #
 
-obj-y		:= config.o amiints.o cia.o chipram.o amisound.o amiga_ksyms.o
+obj-y		:= config.o amiints.o cia.o chipram.o amisound.o
 
 obj-$(CONFIG_AMIGA_PCMCIA)	+= pcmcia.o
diff --git a/arch/m68k/amiga/amiga_ksyms.c b/arch/m68k/amiga/amiga_ksyms.c
deleted file mode 100644
index 7fdcf6b..0000000
--- a/arch/m68k/amiga/amiga_ksyms.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <linux/module.h>
-#include <linux/types.h>
-#include <asm/ptrace.h>
-#include <asm/amigahw.h>
-#include <asm/amigaints.h>
-#include <asm/amipcmcia.h>
-
-extern volatile u_short amiga_audio_min_period;
-extern u_short amiga_audio_period;
-
-/*
- * Add things here when you find the need for it.
- */
-EXPORT_SYMBOL(amiga_model);
-EXPORT_SYMBOL(amiga_chipset);
-EXPORT_SYMBOL(amiga_hw_present);
-EXPORT_SYMBOL(amiga_eclock);
-EXPORT_SYMBOL(amiga_colorclock);
-EXPORT_SYMBOL(amiga_chip_alloc);
-EXPORT_SYMBOL(amiga_chip_free);
-EXPORT_SYMBOL(amiga_chip_avail);
-EXPORT_SYMBOL(amiga_chip_size);
-EXPORT_SYMBOL(amiga_audio_period);
-EXPORT_SYMBOL(amiga_audio_min_period);
-
-#ifdef CONFIG_AMIGA_PCMCIA
-  EXPORT_SYMBOL(pcmcia_reset);
-  EXPORT_SYMBOL(pcmcia_copy_tuple);
-  EXPORT_SYMBOL(pcmcia_program_voltage);
-  EXPORT_SYMBOL(pcmcia_access_speed);
-  EXPORT_SYMBOL(pcmcia_write_enable);
-  EXPORT_SYMBOL(pcmcia_write_disable);
-#endif
diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c
index 92a92fb..61e5c54 100644
--- a/arch/m68k/amiga/amisound.c
+++ b/arch/m68k/amiga/amisound.c
@@ -12,6 +12,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/string.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/amigahw.h>
@@ -31,6 +32,7 @@ static const signed char sine_data[] = {
      */
 
 volatile unsigned short amiga_audio_min_period = 124; /* Default for pre-OCS */
+EXPORT_SYMBOL(amiga_audio_min_period);
 
 #define MAX_PERIOD	(65535)
 
@@ -40,6 +42,7 @@ volatile unsigned short amiga_audio_min_period = 124; /* Default for pre-OCS */
      */
 
 unsigned short amiga_audio_period = MAX_PERIOD;
+EXPORT_SYMBOL(amiga_audio_period);
 
 static unsigned long clock_constant;
 
diff --git a/arch/m68k/amiga/chipram.c b/arch/m68k/amiga/chipram.c
index fa015d8..d10726f 100644
--- a/arch/m68k/amiga/chipram.c
+++ b/arch/m68k/amiga/chipram.c
@@ -13,10 +13,13 @@
 #include <linux/ioport.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/module.h>
+
 #include <asm/page.h>
 #include <asm/amigahw.h>
 
 unsigned long amiga_chip_size;
+EXPORT_SYMBOL(amiga_chip_size);
 
 static struct resource chipram_res = {
     .name = "Chip RAM", .start = CHIP_PHYSADDR
@@ -67,6 +70,7 @@ void *amiga_chip_alloc(unsigned long size, const char *name)
 #endif
     return (void *)ZTWO_VADDR(res->start);
 }
+EXPORT_SYMBOL(amiga_chip_alloc);
 
 
     /*
@@ -120,6 +124,7 @@ void amiga_chip_free(void *ptr)
     }
     printk("amiga_chip_free: trying to free nonexistent region at %p\n", ptr);
 }
+EXPORT_SYMBOL(amiga_chip_free);
 
 
 unsigned long amiga_chip_avail(void)
@@ -129,3 +134,5 @@ unsigned long amiga_chip_avail(void)
 #endif
 	return chipavail;
 }
+EXPORT_SYMBOL(amiga_chip_avail);
+
diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c
index 3574853..50f5daa 100644
--- a/arch/m68k/amiga/config.c
+++ b/arch/m68k/amiga/config.c
@@ -23,6 +23,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/zorro.h>
+#include <linux/module.h>
 
 #include <asm/bootinfo.h>
 #include <asm/setup.h>
@@ -36,13 +37,24 @@
 #include <asm/io.h>
 
 unsigned long amiga_model;
+EXPORT_SYMBOL(amiga_model);
+
 unsigned long amiga_eclock;
+EXPORT_SYMBOL(amiga_eclock);
+
 unsigned long amiga_masterclock;
+
 unsigned long amiga_colorclock;
+EXPORT_SYMBOL(amiga_colorclock);
+
 unsigned long amiga_chipset;
+EXPORT_SYMBOL(amiga_chipset);
+
 unsigned char amiga_vblank;
 unsigned char amiga_psfreq;
+
 struct amiga_hw_present amiga_hw_present;
+EXPORT_SYMBOL(amiga_hw_present);
 
 static char s_a500[] __initdata = "A500";
 static char s_a500p[] __initdata = "A500+";
diff --git a/arch/m68k/amiga/pcmcia.c b/arch/m68k/amiga/pcmcia.c
index 186662c..7106f0c 100644
--- a/arch/m68k/amiga/pcmcia.c
+++ b/arch/m68k/amiga/pcmcia.c
@@ -15,6 +15,8 @@
 #include <linux/types.h>
 #include <linux/jiffies.h>
 #include <linux/timer.h>
+#include <linux/module.h>
+
 #include <asm/amigayle.h>
 #include <asm/amipcmcia.h>
 
@@ -30,6 +32,7 @@ void pcmcia_reset(void)
 	while (time_before(jiffies, reset_start_time + 1*HZ/100));
 	b = gayle_reset;
 }
+EXPORT_SYMBOL(pcmcia_reset);
 
 
 /* copy a tuple, including tuple header. return nb bytes copied */
@@ -61,6 +64,7 @@ int pcmcia_copy_tuple(unsigned char tuple_id, void *tuple, int max_len)
 
 	return 0;
 }
+EXPORT_SYMBOL(pcmcia_copy_tuple);
 
 void pcmcia_program_voltage(int voltage)
 {
@@ -84,6 +88,7 @@ void pcmcia_program_voltage(int voltage)
 	gayle.config = cfg_byte;
 
 }
+EXPORT_SYMBOL(pcmcia_program_voltage);
 
 void pcmcia_access_speed(int speed)
 {
@@ -101,13 +106,17 @@ void pcmcia_access_speed(int speed)
 	cfg_byte = (cfg_byte & 0xf3) | s;
 	gayle.config = cfg_byte;
 }
+EXPORT_SYMBOL(pcmcia_access_speed);
 
 void pcmcia_write_enable(void)
 {
 	gayle.cardstatus = GAYLE_CS_WR|GAYLE_CS_DA;
 }
+EXPORT_SYMBOL(pcmcia_write_enable);
 
 void pcmcia_write_disable(void)
 {
 	gayle.cardstatus = 0;
 }
+EXPORT_SYMBOL(pcmcia_write_disable);
+
-- 
cgit v1.1


From a3b2004a2671455ee7aef1d9aee5a24381999ddb Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:26 -0800
Subject: m68k: kill arch/m68k/atari/atari_ksyms.c

EXPORT_SYMBOL's belong to the actual code.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/atari/Makefile      |  2 +-
 arch/m68k/atari/ataints.c     |  3 +++
 arch/m68k/atari/atari_ksyms.c | 35 -----------------------------------
 arch/m68k/atari/atasound.c    |  2 ++
 arch/m68k/atari/config.c      | 11 +++++++++++
 arch/m68k/atari/debug.c       |  6 ++++++
 arch/m68k/atari/stdma.c       |  5 +++++
 arch/m68k/atari/stram.c       |  3 +++
 8 files changed, 31 insertions(+), 36 deletions(-)
 delete mode 100644 arch/m68k/atari/atari_ksyms.c

diff --git a/arch/m68k/atari/Makefile b/arch/m68k/atari/Makefile
index 2cb8619..2cd905e 100644
--- a/arch/m68k/atari/Makefile
+++ b/arch/m68k/atari/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y		:= config.o time.o debug.o ataints.o stdma.o \
-			atasound.o stram.o atari_ksyms.o
+			atasound.o stram.o
 
 ifeq ($(CONFIG_PCI),y)
 obj-$(CONFIG_HADES)	+= hades-pci.o
diff --git a/arch/m68k/atari/ataints.c b/arch/m68k/atari/ataints.c
index b85ca22..b45593a 100644
--- a/arch/m68k/atari/ataints.c
+++ b/arch/m68k/atari/ataints.c
@@ -40,6 +40,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/init.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/traps.h>
@@ -446,6 +447,7 @@ unsigned long atari_register_vme_int(void)
 	free_vme_vec_bitmap |= 1 << i;
 	return VME_SOURCE_BASE + i;
 }
+EXPORT_SYMBOL(atari_register_vme_int);
 
 
 void atari_unregister_vme_int(unsigned long irq)
@@ -455,5 +457,6 @@ void atari_unregister_vme_int(unsigned long irq)
 		free_vme_vec_bitmap &= ~(1 << irq);
 	}
 }
+EXPORT_SYMBOL(atari_unregister_vme_int);
 
 
diff --git a/arch/m68k/atari/atari_ksyms.c b/arch/m68k/atari/atari_ksyms.c
deleted file mode 100644
index a047571..0000000
--- a/arch/m68k/atari/atari_ksyms.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <linux/module.h>
-
-#include <asm/ptrace.h>
-#include <asm/traps.h>
-#include <asm/atarihw.h>
-#include <asm/atariints.h>
-#include <asm/atarikb.h>
-#include <asm/atari_joystick.h>
-#include <asm/atari_stdma.h>
-#include <asm/atari_stram.h>
-
-extern void atari_microwire_cmd( int cmd );
-extern int atari_MFP_init_done;
-extern int atari_SCC_init_done;
-extern int atari_SCC_reset_done;
-
-EXPORT_SYMBOL(atari_mch_cookie);
-EXPORT_SYMBOL(atari_mch_type);
-EXPORT_SYMBOL(atari_hw_present);
-EXPORT_SYMBOL(atari_switches);
-EXPORT_SYMBOL(atari_dont_touch_floppy_select);
-EXPORT_SYMBOL(atari_register_vme_int);
-EXPORT_SYMBOL(atari_unregister_vme_int);
-EXPORT_SYMBOL(stdma_lock);
-EXPORT_SYMBOL(stdma_release);
-EXPORT_SYMBOL(stdma_others_waiting);
-EXPORT_SYMBOL(stdma_islocked);
-EXPORT_SYMBOL(atari_stram_alloc);
-EXPORT_SYMBOL(atari_stram_free);
-
-EXPORT_SYMBOL(atari_MFP_init_done);
-EXPORT_SYMBOL(atari_SCC_init_done);
-EXPORT_SYMBOL(atari_SCC_reset_done);
-
-EXPORT_SYMBOL(atari_microwire_cmd);
diff --git a/arch/m68k/atari/atasound.c b/arch/m68k/atari/atasound.c
index ee04250..d266fe8 100644
--- a/arch/m68k/atari/atasound.c
+++ b/arch/m68k/atari/atasound.c
@@ -22,6 +22,7 @@
 #include <linux/fcntl.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/atarihw.h>
 #include <asm/system.h>
@@ -43,6 +44,7 @@ void atari_microwire_cmd (int cmd)
 	while( tt_microwire.mask != 0x7ff)
 		;
 }
+EXPORT_SYMBOL(atari_microwire_cmd);
 
 
 /* PSG base frequency */
diff --git a/arch/m68k/atari/config.c b/arch/m68k/atari/config.c
index e40e5dc..5945e15 100644
--- a/arch/m68k/atari/config.c
+++ b/arch/m68k/atari/config.c
@@ -31,6 +31,7 @@
 #include <linux/delay.h>
 #include <linux/ioport.h>
 #include <linux/vt_kern.h>
+#include <linux/module.h>
 
 #include <asm/bootinfo.h>
 #include <asm/setup.h>
@@ -43,10 +44,20 @@
 #include <asm/io.h>
 
 u_long atari_mch_cookie;
+EXPORT_SYMBOL(atari_mch_cookie);
+
 u_long atari_mch_type;
+EXPORT_SYMBOL(atari_mch_type);
+
 struct atari_hw_present atari_hw_present;
+EXPORT_SYMBOL(atari_hw_present);
+
 u_long atari_switches;
+EXPORT_SYMBOL(atari_switches);
+
 int atari_dont_touch_floppy_select;
+EXPORT_SYMBOL(atari_dont_touch_floppy_select);
+
 int atari_rtc_year_offset;
 
 /* local function prototypes */
diff --git a/arch/m68k/atari/debug.c b/arch/m68k/atari/debug.c
index fbeed8c..043ddbc 100644
--- a/arch/m68k/atari/debug.c
+++ b/arch/m68k/atari/debug.c
@@ -15,17 +15,23 @@
 #include <linux/console.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 
 #include <asm/atarihw.h>
 #include <asm/atariints.h>
 
 /* Flag that Modem1 port is already initialized and used */
 int atari_MFP_init_done;
+EXPORT_SYMBOL(atari_MFP_init_done);
+
 /* Flag that Modem1 port is already initialized and used */
 int atari_SCC_init_done;
+EXPORT_SYMBOL(atari_SCC_init_done);
+
 /* Can be set somewhere, if a SCC master reset has already be done and should
  * not be repeated; used by kgdb */
 int atari_SCC_reset_done;
+EXPORT_SYMBOL(atari_SCC_reset_done);
 
 static struct console atari_console_driver = {
 	.name	= "debug",
diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c
index ab3fd52..d1bd029 100644
--- a/arch/m68k/atari/stdma.c
+++ b/arch/m68k/atari/stdma.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 
 #include <asm/atari_stdma.h>
 #include <asm/atariints.h>
@@ -91,6 +92,7 @@ void stdma_lock(irq_handler_t handler, void *data)
 	stdma_isr_data = data;
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL(stdma_lock);
 
 
 /*
@@ -117,6 +119,7 @@ void stdma_release(void)
 
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL(stdma_release);
 
 
 /*
@@ -134,6 +137,7 @@ int stdma_others_waiting(void)
 {
 	return waitqueue_active(&stdma_wait);
 }
+EXPORT_SYMBOL(stdma_others_waiting);
 
 
 /*
@@ -155,6 +159,7 @@ int stdma_islocked(void)
 {
 	return stdma_locked;
 }
+EXPORT_SYMBOL(stdma_islocked);
 
 
 /*
diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c
index bf4588c..8dda651 100644
--- a/arch/m68k/atari/stram.c
+++ b/arch/m68k/atari/stram.c
@@ -20,6 +20,7 @@
 #include <linux/bootmem.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
+#include <linux/module.h>
 
 #include <asm/setup.h>
 #include <asm/machdep.h>
@@ -208,6 +209,7 @@ void *atari_stram_alloc(long size, const char *owner)
 	}
 	return( addr );
 }
+EXPORT_SYMBOL(atari_stram_alloc);
 
 void atari_stram_free( void *addr )
 
@@ -237,6 +239,7 @@ void atari_stram_free( void *addr )
 	printk( KERN_ERR "atari_stram_free: cannot free block at %p "
 			"(called from %p)\n", addr, __builtin_return_address(0) );
 }
+EXPORT_SYMBOL(atari_stram_free);
 
 
 /* ------------------------------------------------------------------------ */
-- 
cgit v1.1


From 612e484fdb8802ffee84218cb35f3cff61a9c8c6 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 4 Feb 2008 22:30:27 -0800
Subject: m68k: kill arch/m68k/mvme16x/mvme16x_ksyms.c

EXPORT_SYMBOL's belong to the actual code.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/mvme16x/Makefile        | 2 +-
 arch/m68k/mvme16x/config.c        | 2 ++
 arch/m68k/mvme16x/mvme16x_ksyms.c | 6 ------
 3 files changed, 3 insertions(+), 7 deletions(-)
 delete mode 100644 arch/m68k/mvme16x/mvme16x_ksyms.c

diff --git a/arch/m68k/mvme16x/Makefile b/arch/m68k/mvme16x/Makefile
index 950e82f..edb3f6e 100644
--- a/arch/m68k/mvme16x/Makefile
+++ b/arch/m68k/mvme16x/Makefile
@@ -2,4 +2,4 @@
 # Makefile for Linux arch/m68k/mvme16x source directory
 #
 
-obj-y		:= config.o rtc.o mvme16x_ksyms.o
+obj-y		:= config.o rtc.o
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index daa7851..24cbc30 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -25,6 +25,7 @@
 #include <linux/genhd.h>
 #include <linux/rtc.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
 
 #include <asm/bootinfo.h>
 #include <asm/system.h>
@@ -58,6 +59,7 @@ static irq_handler_t tick_handler;
 
 
 unsigned short mvme16x_config;
+EXPORT_SYMBOL(mvme16x_config);
 
 
 int mvme16x_parse_bootinfo(const struct bi_record *bi)
diff --git a/arch/m68k/mvme16x/mvme16x_ksyms.c b/arch/m68k/mvme16x/mvme16x_ksyms.c
deleted file mode 100644
index 4a8a363..0000000
--- a/arch/m68k/mvme16x/mvme16x_ksyms.c
+++ /dev/null
@@ -1,6 +0,0 @@
-#include <linux/module.h>
-#include <linux/types.h>
-#include <asm/ptrace.h>
-#include <asm/mvme16xhw.h>
-
-EXPORT_SYMBOL(mvme16x_config);
-- 
cgit v1.1


From eb4da4cec30309e3b3d584616e32318cc8f6d63c Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 4 Feb 2008 22:30:27 -0800
Subject: mac68k: macii adb comment correction

Corrects a mistake I made in a comment.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/macintosh/via-macii.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c
index 01b8eca..6e6dd17 100644
--- a/drivers/macintosh/via-macii.c
+++ b/drivers/macintosh/via-macii.c
@@ -111,7 +111,7 @@ static enum macii_state {
 static struct adb_request *current_req; /* first request struct in the queue */
 static struct adb_request *last_req;     /* last request struct in the queue */
 static unsigned char reply_buf[16];        /* storage for autopolled replies */
-static unsigned char *reply_ptr;      /* next byte in req->data or reply_buf */
+static unsigned char *reply_ptr;     /* next byte in reply_buf or req->reply */
 static int reading_reply;        /* store reply in reply_buf else req->reply */
 static int data_index;      /* index of the next byte to send from req->data */
 static int reply_len; /* number of bytes received in reply_buf or req->reply */
-- 
cgit v1.1


From 00bf59ca9bfe4085e6ee546128a71b82d061ab04 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 4 Feb 2008 22:30:29 -0800
Subject: mac68k: remove dead code

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/mac/config.c       | 2 --
 include/asm-m68k/macintosh.h | 2 --
 2 files changed, 4 deletions(-)

diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index 01b468b..735a49b 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -58,8 +58,6 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
 
 extern struct mem_info m68k_ramdisk;
 
-extern char m68k_command_line[CL_SIZE];
-
 void *mac_env;					/* Loaded by the boot asm */
 
 /* The phys. video addr. - might be bogus on some machines */
diff --git a/include/asm-m68k/macintosh.h b/include/asm-m68k/macintosh.h
index 27d11da..28b0f49 100644
--- a/include/asm-m68k/macintosh.h
+++ b/include/asm-m68k/macintosh.h
@@ -14,8 +14,6 @@ extern void mac_init_IRQ(void);
 extern int mac_irq_pending(unsigned int);
 extern void mac_identify(void);
 extern void mac_report_hardware(void);
-extern void mac_debugging_penguin(int);
-extern void mac_boom(int);
 
 /*
  *	Floppy driver magic hook - probably shouldnt be here
-- 
cgit v1.1


From 57dfee7c3f1fcac24e986b69bdfdccc8ea025813 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 4 Feb 2008 22:30:30 -0800
Subject: mac68k: add nubus card definitions and a typo fix

Add some new card definitions and fix a typo (from Eugen Paiuc).

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nubus.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index cdb3e9b..c435507 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -132,10 +132,12 @@ enum nubus_drhw {
 	NUBUS_DRHW_RDIUS_DCGX     = 0x027C, /* Radius DirectColor/GX */
 	NUBUS_DRHW_RDIUS_PC8      = 0x0291, /* Radius PrecisionColor 8 */
 	NUBUS_DRHW_LAPIS_PCS8     = 0x0292, /* Lapis ProColorServer 8 */
-	NUBUS_DRHW_RASTER_24LXI   = 0x02A0, /* RasterOps 8/24 XLi */
+	NUBUS_DRHW_RASTER_24XLI   = 0x02A0, /* RasterOps 8/24 XLi */
 	NUBUS_DRHW_RASTER_PBPGT   = 0x02A5, /* RasterOps PaintBoard Prism GT */
 	NUBUS_DRHW_EMACH_FSX      = 0x02AE, /* E-Machines Futura SX */
+	NUBUS_DRHW_RASTER_24XLTV  = 0x02B7, /* RasterOps 24XLTV */
 	NUBUS_DRHW_SMAC_THUND24   = 0x02CB, /* SuperMac Thunder/24 */
+	NUBUS_DRHW_SMAC_THUNDLGHT = 0x03D9, /* SuperMac ThunderLight */
 	NUBUS_DRHW_RDIUS_PC24XP   = 0x0406, /* Radius PrecisionColor 24Xp */
 	NUBUS_DRHW_RDIUS_PC24X    = 0x040A, /* Radius PrecisionColor 24X */
 	NUBUS_DRHW_RDIUS_PC8XJ    = 0x040B, /* Radius PrecisionColor 8XJ */
-- 
cgit v1.1


From 7e02dbf7249b1eadeb8b225f1af98701cd1320cc Mon Sep 17 00:00:00 2001
From: Stanislav Brabec <sbrabec@suse.cz>
Date: Mon, 4 Feb 2008 22:30:30 -0800
Subject: mac68k: remove dead MAC_ADBKEYCODES

It seems, that current kernel source code contains no traces of
MAC_ADBKEYCODES and no reference to keyboard_sends_linux_keycodes any more.

Remove them from configuration files.

Signed-off-by: Stanislav Brabec <sbrabec@suse.cz>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/Kconfig               | 14 --------------
 arch/m68k/configs/mac_defconfig |  1 -
 2 files changed, 15 deletions(-)

diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 8236e42..ffabd01 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -577,20 +577,6 @@ config MAC_HID
 	depends on INPUT_ADBHID
 	default y
 
-config MAC_ADBKEYCODES
-	bool "Support for ADB raw keycodes"
-	depends on INPUT_ADBHID
-	help
-	  This provides support for sending raw ADB keycodes to console
-	  devices.  This is the default up to 2.4.0, but in future this may be
-	  phased out in favor of generic Linux keycodes.  If you say Y here,
-	  you can dynamically switch via the
-	  /proc/sys/dev/mac_hid/keyboard_sends_linux_keycodes
-	  sysctl and with the "keyboard_sends_linux_keycodes=" kernel
-	  argument.
-
-	  If unsure, say Y here.
-
 config ADB_KEYBOARD
 	bool "Support for ADB keyboard (old driver)"
 	depends on MAC && !INPUT_ADBHID
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 15b80ab..ff9dffa 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -678,7 +678,6 @@ CONFIG_LOGO_MAC_CLUT224=y
 #
 CONFIG_MAC_SCC=y
 CONFIG_MAC_HID=y
-CONFIG_MAC_ADBKEYCODES=y
 CONFIG_SERIAL_CONSOLE=y
 
 #
-- 
cgit v1.1


From 2d33d563b1e2b4748c585e3169f46481e897c829 Mon Sep 17 00:00:00 2001
From: Jesper Nilsson <jesper.nilsson@axis.com>
Date: Mon, 4 Feb 2008 22:30:31 -0800
Subject: CRIS: avoid using arch links in Kconfig

Improve including of architecture dependent Kconfig files.

- Always include the architecture dependent Kconfig files.
- Wrap architecture dependent Kconfig files inside an appropriate
  "if ETRAX_ARCH_Vxx" block.

This makes it possible to run the configuration even without the arch links,
which are created later in the build process.

Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/cris/Kconfig                  | 5 +++--
 arch/cris/arch-v10/Kconfig         | 4 ++++
 arch/cris/arch-v10/drivers/Kconfig | 4 ++++
 arch/cris/arch-v32/Kconfig         | 4 ++++
 arch/cris/arch-v32/drivers/Kconfig | 4 ++++
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 7f0be4c..27b082a 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -150,6 +150,7 @@ config ETRAX_FLASH_BUSWIDTH
 	  Width in bytes of the Flash bus (1, 2 or 4). Is usually 2.
 
 source arch/cris/arch-v10/Kconfig
+source arch/cris/arch-v32/Kconfig
 
 endmenu
 
@@ -157,8 +158,8 @@ source "net/Kconfig"
 
 # bring in ETRAX built-in drivers
 menu "Drivers for built-in interfaces"
-# arch/cris/arch is a symlink to correct arch (arch-v10 or arch-v32)
-source arch/cris/arch/drivers/Kconfig
+source arch/cris/arch-v10/drivers/Kconfig
+source arch/cris/arch-v32/drivers/Kconfig
 
 endmenu
 
diff --git a/arch/cris/arch-v10/Kconfig b/arch/cris/arch-v10/Kconfig
index f1ce6f6..1d61fae 100644
--- a/arch/cris/arch-v10/Kconfig
+++ b/arch/cris/arch-v10/Kconfig
@@ -1,3 +1,5 @@
+if ETRAX_ARCH_V10
+
 # ETRAX 100LX v1 has a MMU "feature" requiring a low mapping
 config CRIS_LOW_MAP
 	bool
@@ -451,3 +453,5 @@ config ETRAX_POWERBUTTON_BIT
 	default "25"
 	help
 	  Configure where power button is connected.
+
+endif
diff --git a/arch/cris/arch-v10/drivers/Kconfig b/arch/cris/arch-v10/drivers/Kconfig
index e3c0f29..96740ef 100644
--- a/arch/cris/arch-v10/drivers/Kconfig
+++ b/arch/cris/arch-v10/drivers/Kconfig
@@ -1,3 +1,5 @@
+if ETRAX_ARCH_V10
+
 config ETRAX_ETHERNET
 	bool "Ethernet support"
 	depends on ETRAX_ARCH_V10
@@ -806,3 +808,5 @@ config ETRAX_DS1302_TRICKLE_CHARGE
 	  1 = 2kohm, 2 = 4kohm, 3 = 4kohm
 	  4 = 1 diode, 8 = 2 diodes
 	  Allowed values are (increasing current): 0, 11, 10, 9, 7, 6, 5
+
+endif
diff --git a/arch/cris/arch-v32/Kconfig b/arch/cris/arch-v32/Kconfig
index 4f79d8e..d8acaa9 100644
--- a/arch/cris/arch-v32/Kconfig
+++ b/arch/cris/arch-v32/Kconfig
@@ -1,3 +1,5 @@
+if ETRAX_ARCH_V32
+
 config ETRAX_DRAM_VIRTUAL_BASE
 	hex
 	depends on ETRAX_ARCH_V32
@@ -294,3 +296,5 @@ config ETRAX_DEF_GIO_PE_OUT
 	help
 	  Configures the initial data for the general port E bits.  Most
 	  products should use 00000 here.
+
+endif
diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig
index 9bccb5e2..c329cce 100644
--- a/arch/cris/arch-v32/drivers/Kconfig
+++ b/arch/cris/arch-v32/drivers/Kconfig
@@ -1,3 +1,5 @@
+if ETRAX_ARCH_V32
+
 config ETRAX_ETHERNET
 	bool "Ethernet support"
 	depends on ETRAX_ARCH_V32
@@ -610,3 +612,5 @@ config ETRAX_STREAMCOPROC
 	help
 	  This option enables a driver for the stream co-processor
 	  for cryptographic operations.
+
+endif
-- 
cgit v1.1


From 6d9f4c5cfb6084c16a800e8a2ca9db8d0859611c Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 4 Feb 2008 22:30:32 -0800
Subject: arch/cris: add a missing iounmap

An extra error handling label is needed for the case where the ioremap has
succeeded.

The problem was detected using the following semantic match
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@@
type T,T1,T2;
identifier E;
statement S;
expression x1,x2;
constant C;
int ret;
@@

  T E;
  ...
* E = ioremap(...);
  if (E == NULL) S
  ... when != iounmap(E)
      when != if (E != NULL) { ... iounmap(E); ...}
      when != x1 = (T1)E
  if (...) {
    ... when != iounmap(E)
        when != if (E != NULL) { ... iounmap(E); ...}
        when != x2 = (T2)E
(
*   return;
|
*   return C;
|
*   return ret;
)
  }
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: Mikael Starvik <starvik@axis.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/cris/arch-v32/drivers/pci/dma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/cris/arch-v32/drivers/pci/dma.c b/arch/cris/arch-v32/drivers/pci/dma.c
index 66f9500..e036465 100644
--- a/arch/cris/arch-v32/drivers/pci/dma.c
+++ b/arch/cris/arch-v32/drivers/pci/dma.c
@@ -93,7 +93,7 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
 
 	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
 	if (!dev->dma_mem)
-		goto out;
+		goto iounmap_out;
 	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
 	if (!dev->dma_mem->bitmap)
 		goto free1_out;
@@ -110,6 +110,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
 
  free1_out:
 	kfree(dev->dma_mem);
+ iounmap_out:
+	iounmap(mem_base);
  out:
 	return 0;
 }
-- 
cgit v1.1


From 747646a4d9d9ddfc7952f8c6d25ea1f323e689a0 Mon Sep 17 00:00:00 2001
From: Jesper Nilsson <jesper.nilsson@axis.com>
Date: Mon, 4 Feb 2008 22:30:33 -0800
Subject: cris: remove unused __dummy, CONST_ADDR and ADDR from bitops.h

This is very old code, it hasn't changed since 2001 and it is not used
anywhere.  Noticed by Clemens Koller.

Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-cris/bitops.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index e2f49c2..75ea6e0 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -24,13 +24,6 @@
 #include <linux/compiler.h>
 
 /*
- * Some hacks to defeat gcc over-optimizations..
- */
-struct __dummy { unsigned long a[100]; };
-#define ADDR (*(struct __dummy *) addr)
-#define CONST_ADDR (*(const struct __dummy *) addr)
-
-/*
  * set_bit - Atomically set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
-- 
cgit v1.1


From 99c9f502cb3498b02b1e5df60c9ea0ee2e535373 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:34 -0800
Subject: uml: arch/um/include/init.h needs a definition of __used

init.h started breaking now for some reason.  It turns out that there wasn't a
definition of __used.  Fixed this by copying the relevant stuff from
compiler.h in the userspace case, and including compiler.h in the kernel case.

[xiyou.wangcong@gmail.com: added definition of __section]
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/init.h | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/um/include/init.h b/arch/um/include/init.h
index cebc6ca..b00a957 100644
--- a/arch/um/include/init.h
+++ b/arch/um/include/init.h
@@ -40,6 +40,20 @@
 typedef int (*initcall_t)(void);
 typedef void (*exitcall_t)(void);
 
+#ifndef __KERNEL__
+#ifndef __section
+# define __section(S) __attribute__ ((__section__(#S)))
+#endif
+
+#if __GNUC_MINOR__ >= 3
+# define __used			__attribute__((__used__))
+#else
+# define __used			__attribute__((__unused__))
+#endif
+
+#else
+#include <linux/compiler.h>
+#endif
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
 #define __init		__section(.init.text)
@@ -127,14 +141,3 @@ extern struct uml_param __uml_setup_start, __uml_setup_end;
 #endif
 
 #endif /* _LINUX_UML_INIT_H */
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
-- 
cgit v1.1


From 4d18de45fa921600e1b3770c3a7cb106ab48cd9d Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:35 -0800
Subject: uml: remove xmm checking on x86

This patch removes some code which ran at every boot, but does not seem to do
anything anymore.  Please test.  It works for me but mistakes can happen.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/sys-i386/bugs.c         | 3 ---
 include/asm-um/processor-i386.h | 1 -
 2 files changed, 4 deletions(-)

diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index 806895d..5ee4c36 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -15,7 +15,6 @@
 
 /* Set during early boot */
 int host_has_cmov = 1;
-int host_has_xmm = 0;
 
 static char token(int fd, char *buf, int len, char stop)
 {
@@ -163,8 +162,6 @@ void arch_check_bugs(void)
 	}
 	if (check_cpu_flag("cmov", &have_it))
 		host_has_cmov = have_it;
-	if (check_cpu_flag("xmm", &have_it))
-		host_has_xmm = have_it;
 }
 
 int arch_handle_signal(int sig, struct uml_pt_regs *regs)
diff --git a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h
index 595f1c3..a2b7fe1 100644
--- a/include/asm-um/processor-i386.h
+++ b/include/asm-um/processor-i386.h
@@ -10,7 +10,6 @@
 #include "asm/host_ldt.h"
 #include "asm/segment.h"
 
-extern int host_has_xmm;
 extern int host_has_cmov;
 
 /* include faultinfo structure */
-- 
cgit v1.1


From c9a3072d13e4b8a6549ecc1db6390a55c7ee2ddf Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 4 Feb 2008 22:30:35 -0800
Subject: uml: code tidying under arch/um/os-Linux

This patch contains varied fixes and improvements for some files under
arch/um/os-Linux/, such as a typo fix in a perror message, a missing
argument fix for a printf, some constifying for pointers and so on.

[ jdike - made sigprocmask failure return -errno instead of -1 ]

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h        | 12 ++++++------
 arch/um/os-Linux/file.c     | 14 +++++++-------
 arch/um/os-Linux/main.c     | 12 ++++++++----
 arch/um/os-Linux/mem.c      |  5 ++++-
 arch/um/os-Linux/signal.c   |  3 ++-
 arch/um/os-Linux/start_up.c |  2 +-
 6 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 6f0d1c7..82e5aea 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -137,24 +137,24 @@ extern int os_set_owner(int fd, int pid);
 extern int os_mode_fd(int fd, int mode);
 
 extern int os_seek_file(int fd, unsigned long long offset);
-extern int os_open_file(char *file, struct openflags flags, int mode);
+extern int os_open_file(const char *file, struct openflags flags, int mode);
 extern int os_read_file(int fd, void *buf, int len);
 extern int os_write_file(int fd, const void *buf, int count);
-extern int os_file_size(char *file, unsigned long long *size_out);
-extern int os_file_modtime(char *file, unsigned long *modtime);
+extern int os_file_size(const char *file, unsigned long long *size_out);
+extern int os_file_modtime(const char *file, unsigned long *modtime);
 extern int os_pipe(int *fd, int stream, int close_on_exec);
 extern int os_set_fd_async(int fd, int owner);
 extern int os_clear_fd_async(int fd);
 extern int os_set_fd_block(int fd, int blocking);
 extern int os_accept_connection(int fd);
-extern int os_create_unix_socket(char *file, int len, int close_on_exec);
+extern int os_create_unix_socket(const char *file, int len, int close_on_exec);
 extern int os_shutdown_socket(int fd, int r, int w);
 extern void os_close_file(int fd);
 extern int os_rcv_fd(int fd, int *helper_pid_out);
 extern int create_unix_socket(char *file, int len, int close_on_exec);
-extern int os_connect_socket(char *name);
+extern int os_connect_socket(const char *name);
 extern int os_file_type(char *file);
-extern int os_file_mode(char *file, struct openflags *mode_out);
+extern int os_file_mode(const char *file, struct openflags *mode_out);
 extern int os_lock_file(int fd, int excl);
 extern void os_flush_stdout(void);
 extern int os_stat_filesystem(char *path, long *bsize_out,
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index f834627..c3bb5ce 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -19,7 +19,7 @@
 #include "user.h"
 #include "kern_util.h"
 
-static void copy_stat(struct uml_stat *dst, struct stat64 *src)
+static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
 {
 	*dst = ((struct uml_stat) {
 		.ust_dev     = src->st_dev,     /* device */
@@ -168,7 +168,7 @@ int os_file_type(char *file)
 	else return OS_TYPE_FILE;
 }
 
-int os_file_mode(char *file, struct openflags *mode_out)
+int os_file_mode(const char *file, struct openflags *mode_out)
 {
 	int err;
 
@@ -189,7 +189,7 @@ int os_file_mode(char *file, struct openflags *mode_out)
 	return err;
 }
 
-int os_open_file(char *file, struct openflags flags, int mode)
+int os_open_file(const char *file, struct openflags flags, int mode)
 {
 	int fd, err, f = 0;
 
@@ -216,7 +216,7 @@ int os_open_file(char *file, struct openflags flags, int mode)
 	return fd;
 }
 
-int os_connect_socket(char *name)
+int os_connect_socket(const char *name)
 {
 	struct sockaddr_un sock;
 	int fd, err;
@@ -277,7 +277,7 @@ int os_write_file(int fd, const void *buf, int len)
 	return n;
 }
 
-int os_file_size(char *file, unsigned long long *size_out)
+int os_file_size(const char *file, unsigned long long *size_out)
 {
 	struct uml_stat buf;
 	int err;
@@ -314,7 +314,7 @@ int os_file_size(char *file, unsigned long long *size_out)
 	return 0;
 }
 
-int os_file_modtime(char *file, unsigned long *modtime)
+int os_file_modtime(const char *file, unsigned long *modtime)
 {
 	struct uml_stat buf;
 	int err;
@@ -514,7 +514,7 @@ int os_rcv_fd(int fd, int *helper_pid_out)
 	return new;
 }
 
-int os_create_unix_socket(char *file, int len, int close_on_exec)
+int os_create_unix_socket(const char *file, int len, int close_on_exec)
 {
 	struct sockaddr_un addr;
 	int sock, err;
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index 82c3778..de664e7 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -73,7 +73,7 @@ static void install_fatal_handler(int sig)
 	action.sa_handler = last_ditch_exit;
 	if (sigaction(sig, &action, NULL) < 0) {
 		printf("failed to install handler for signal %d - errno = %d\n",
-		       errno);
+		       sig, errno);
 		exit(1);
 	}
 }
@@ -92,7 +92,8 @@ static void setup_env_path(void)
 	 * just use the default + /usr/lib/uml
 	 */
 	if (!old_path || (path_len = strlen(old_path)) == 0) {
-		putenv("PATH=:/bin:/usr/bin/" UML_LIB_PATH);
+		if (putenv("PATH=:/bin:/usr/bin/" UML_LIB_PATH))
+			perror("couldn't putenv");
 		return;
 	}
 
@@ -100,11 +101,14 @@ static void setup_env_path(void)
 	path_len += strlen("PATH=" UML_LIB_PATH) + 1;
 	new_path = malloc(path_len);
 	if (!new_path) {
-		perror("coudn't malloc to set a new PATH");
+		perror("couldn't malloc to set a new PATH");
 		return;
 	}
 	snprintf(new_path, path_len, "PATH=%s" UML_LIB_PATH, old_path);
-	putenv(new_path);
+	if (putenv(new_path)) {
+		perror("couldn't putenv to set a new PATH");
+		free(new_path);
+	}
 }
 
 extern int uml_exitcode;
diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
index 436f8d2..c3b736a 100644
--- a/arch/um/os-Linux/mem.c
+++ b/arch/um/os-Linux/mem.c
@@ -172,13 +172,15 @@ int __init make_tempfile(const char *template, char **out_tempname,
 
 	which_tmpdir();
 	tempname = malloc(MAXPATHLEN);
+	if (!tempname)
+		goto out;
 
 	find_tempdir();
 	if (template[0] != '/')
 		strcpy(tempname, tempdir);
 	else
 		tempname[0] = '\0';
-	strcat(tempname, template);
+	strncat(tempname, template, MAXPATHLEN-1-strlen(tempname));
 	fd = mkstemp(tempname);
 	if(fd < 0){
 		fprintf(stderr, "open - cannot create %s: %s\n", tempname,
@@ -268,6 +270,7 @@ void __init check_tmpexec(void)
 	if(addr == MAP_FAILED){
 		err = errno;
 		perror("failed");
+		close(fd);
 		if(err == EPERM)
 			printf("%s must be not mounted noexec\n",tempdir);
 		exit(1);
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index e9800b0..37302e8 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -183,7 +183,8 @@ int change_sig(int signal, int on)
 
 	sigemptyset(&sigset);
 	sigaddset(&sigset, signal);
-	sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old);
+	if (sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old) < 0)
+		return -errno;
 	return !sigismember(&old, signal);
 }
 
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 7b81f6c..c6cf648 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -63,7 +63,7 @@ static int ptrace_child(void)
 	_exit(ret);
 }
 
-static void fatal_perror(char *str)
+static void fatal_perror(const char *str)
 {
 	perror(str);
 	exit(1);
-- 
cgit v1.1


From c11274655558e72d8d4a598c0077874c094d97d5 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:36 -0800
Subject: uml: implement get_wchan

Implement get_wchan - the algorithm is similar to x86.  It starts with the
stack pointer of the process in question and looks above that for addresses
that are kernel text.  The second one which isn't in the scheduler is the one
that's returned.  The first one is ignored because that will be UML's own
context switching routine.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/process.c           | 35 +++++++++++++++++++++++++++++++++++
 include/asm-um/processor-generic.h |  2 +-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 0eae00b..c7ea7f2 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -459,3 +459,38 @@ unsigned long arch_align_stack(unsigned long sp)
 	return sp & ~0xf;
 }
 #endif
+
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long stack_page, sp, ip;
+	bool seen_sched = 0;
+
+	if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING))
+		return 0;
+
+	stack_page = (unsigned long) task_stack_page(p);
+	/* Bail if the process has no kernel stack for some reason */
+	if (stack_page == 0)
+		return 0;
+
+	sp = p->thread.switch_buf->JB_SP;
+	/*
+	 * Bail if the stack pointer is below the bottom of the kernel
+	 * stack for some reason
+	 */
+	if (sp < stack_page)
+		return 0;
+
+	while (sp < stack_page + THREAD_SIZE) {
+		ip = *((unsigned long *) sp);
+		if (in_sched_functions(ip))
+			/* Ignore everything until we're above the scheduler */
+			seen_sched = 1;
+		else if (kernel_text_address(ip) && seen_sched)
+			return ip;
+
+		sp += sizeof(unsigned long);
+	}
+
+	return 0;
+}
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index 78c0599..057a76d 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -128,6 +128,6 @@ extern struct cpuinfo_um cpu_data[];
 
 
 #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf)
-#define get_wchan(p) (0)
+extern unsigned long get_wchan(struct task_struct *p);
 
 #endif
-- 
cgit v1.1


From b2cf770758cfd9dceb62afcb86e56e96ff37234a Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:37 -0800
Subject: uml: get rid of asmlinkage

Get rid of asmlinkage and remove some old cruft from asm/linkage.h.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/sys-i386/tls.c   | 9 +++++----
 include/asm-um/linkage.h | 6 ------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/um/sys-i386/tls.c b/arch/um/sys-i386/tls.c
index fcaff86..027e86a 100644
--- a/arch/um/sys-i386/tls.c
+++ b/arch/um/sys-i386/tls.c
@@ -225,7 +225,8 @@ out:
 }
 
 /* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */
-static int get_tls_entry(struct task_struct* task, struct user_desc *info, int idx)
+static int get_tls_entry(struct task_struct *task, struct user_desc *info,
+			 int idx)
 {
 	struct thread_struct *t = &task->thread;
 
@@ -263,7 +264,7 @@ clear:
 	goto out;
 }
 
-asmlinkage int sys_set_thread_area(struct user_desc __user *user_desc)
+int sys_set_thread_area(struct user_desc __user *user_desc)
 {
 	struct user_desc info;
 	int idx, ret;
@@ -298,7 +299,7 @@ asmlinkage int sys_set_thread_area(struct user_desc __user *user_desc)
  * i386. However the only possible error are caused by bugs.
  */
 int ptrace_set_thread_area(struct task_struct *child, int idx,
-		struct user_desc __user *user_desc)
+			   struct user_desc __user *user_desc)
 {
 	struct user_desc info;
 
@@ -311,7 +312,7 @@ int ptrace_set_thread_area(struct task_struct *child, int idx,
 	return set_tls_entry(child, &info, idx, 0);
 }
 
-asmlinkage int sys_get_thread_area(struct user_desc __user *user_desc)
+int sys_get_thread_area(struct user_desc __user *user_desc)
 {
 	struct user_desc info;
 	int idx, ret;
diff --git a/include/asm-um/linkage.h b/include/asm-um/linkage.h
index cdb3024..7dfce37 100644
--- a/include/asm-um/linkage.h
+++ b/include/asm-um/linkage.h
@@ -3,10 +3,4 @@
 
 #include "asm/arch/linkage.h"
 
-
-/* <linux/linkage.h> will pick sane defaults */
-#ifdef CONFIG_GPROF
-#undef fastcall
-#endif
-
 #endif
-- 
cgit v1.1


From 20ede453af796bbaaf46a2bb0ee0bcc90ab505b3 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:37 -0800
Subject: uml: document new ubd flag

The ubd help message didn't document the 'c' flag.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/ubd_kern.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 99f9f96..9793e3d 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -437,7 +437,10 @@ __uml_help(ubd_setup,
 "    machine by running 'dd' on the device. <n> must be in the range\n"
 "    0 to 7. Appending an 'r' to the number will cause that device\n"
 "    to be mounted read-only. For example ubd1r=./ext_fs. Appending\n"
-"    an 's' will cause data to be written to disk on the host immediately.\n\n"
+"    an 's' will cause data to be written to disk on the host immediately.\n"
+"    'c' will cause the device to be treated as being shared between multiple\n"
+"    UMLs and file locking will be turned off - this is appropriate for a\n"
+"    cluster filesystem and inappropriate at almost all other times.\n\n"
 );
 
 static int udb_setup(char *str)
-- 
cgit v1.1


From 0ba9d3f91d213f6d07c84230a0b3e2b16a0bb176 Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:38 -0800
Subject: uml: fix URLs in Kconfig and help strings

This patch updates links which broke during the transition to the new UML
website.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Kconfig             |  2 +-
 arch/um/Kconfig.char        |  2 +-
 arch/um/Kconfig.debug       |  4 ++--
 arch/um/Kconfig.net         | 12 ++++++------
 arch/um/include/chan_user.h |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 55945db..5261db8 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -145,7 +145,7 @@ config HPPFS
 	  by removing or changing anything in /proc which gives away the
 	  identity of a UML.
 
-	  See <http://user-mode-linux.sf.net/hppfs.html> for more information.
+	  See <http://user-mode-linux.sf.net/old/hppfs.html> for more information.
 
 	  You only need this if you are setting up a UML honeypot.  Otherwise,
 	  it is safe to say 'N' here.
diff --git a/arch/um/Kconfig.char b/arch/um/Kconfig.char
index 9a78d35..3a4b396 100644
--- a/arch/um/Kconfig.char
+++ b/arch/um/Kconfig.char
@@ -18,7 +18,7 @@ config SSL
           lines on the UML that are usually made to show up on the host as
           ttys or ptys.
 
-          See <http://user-mode-linux.sourceforge.net/input.html> for more
+          See <http://user-mode-linux.sourceforge.net/old/input.html> for more
           information and command line examples of how to use this facility.
 
           Unless you have a specific reason for disabling this, say Y.
diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug
index 1f6462f..40456f4 100644
--- a/arch/um/Kconfig.debug
+++ b/arch/um/Kconfig.debug
@@ -9,7 +9,7 @@ config GPROF
 	  This allows profiling of a User-Mode Linux kernel with the gprof
 	  utility.
 
-	  See <http://user-mode-linux.sourceforge.net/gprof.html> for more
+	  See <http://user-mode-linux.sourceforge.net/old/gprof.html> for more
 	  details.
 
 	  If you're involved in UML kernel development and want to use gprof,
@@ -22,7 +22,7 @@ config GCOV
 	  This option allows developers to retrieve coverage data from a UML
 	  session.
 
-	  See <http://user-mode-linux.sourceforge.net/gprof.html> for more
+	  See <http://user-mode-linux.sourceforge.net/old/gprof.html> for more
 	  details.
 
 	  If you're involved in UML kernel development and want to use gcov,
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 66e5002..9e9a4aa 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -14,7 +14,7 @@ config UML_NET
 
         For more information, including explanations of the networking and
         sample configurations, see
-        <http://user-mode-linux.sourceforge.net/networking.html>.
+        <http://user-mode-linux.sourceforge.net/old/networking.html>.
 
         If you'd like to be able to enable networking in the User-Mode
         linux environment, say Y; otherwise say N.  Note that you must
@@ -38,7 +38,7 @@ config UML_NET_ETHERTAP
         CONFIG_NETLINK_DEV configured as Y or M.
 
         For more information, see
-        <http://user-mode-linux.sourceforge.net/networking.html>  That site
+        <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
         has examples of the UML command line to use to enable Ethertap
         networking.
 
@@ -72,7 +72,7 @@ config UML_NET_SLIP
         To use this, your host must support slip devices.
 
         For more information, see
-        <http://user-mode-linux.sourceforge.net/networking.html>.  That site
+        <http://user-mode-linux.sourceforge.net/old/networking.html>.
         has examples of the UML command line to use to enable slip
         networking, and details of a few quirks with it.
 
@@ -96,7 +96,7 @@ config UML_NET_DAEMON
         networking daemon on the host.
 
         For more information, see
-        <http://user-mode-linux.sourceforge.net/networking.html>  That site
+        <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
         has examples of the UML command line to use to enable Daemon
         networking.
 
@@ -144,7 +144,7 @@ config UML_NET_MCAST
         To use this, your host kernel(s) must support IP Multicasting.
 
         For more information, see
-        <http://user-mode-linux.sourceforge.net/networking.html>  That site
+        <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
         has examples of the UML command line to use to enable Multicast
         networking, and notes about the security of this approach.
 
@@ -165,7 +165,7 @@ config UML_NET_PCAP
 	installed in order to build the pcap transport into UML.
 
         For more information, see
-        <http://user-mode-linux.sourceforge.net/networking.html>  That site
+        <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
         has examples of the UML command line to use to enable this option.
 
 	If you intend to use UML as a network monitor for the host, say
diff --git a/arch/um/include/chan_user.h b/arch/um/include/chan_user.h
index 5a2263e..9b9ced8 100644
--- a/arch/um/include/chan_user.h
+++ b/arch/um/include/chan_user.h
@@ -48,7 +48,7 @@ extern void register_winch_irq(int fd, int tty_fd, int pid,
 #define __channel_help(fn, prefix) \
 __uml_help(fn, prefix "[0-9]*=<channel description>\n" \
 "    Attach a console or serial line to a host channel.  See\n" \
-"    http://user-mode-linux.sourceforge.net/input.html for a complete\n" \
+"    http://user-mode-linux.sourceforge.net/old/input.html for a complete\n" \
 "    description of this switch.\n\n" \
 );
 
-- 
cgit v1.1


From 235a6f06eb5571db600a743cda7c73fd4f74127f Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:38 -0800
Subject: uml: improve detection of host cmov

This patch introduces a new way of checking for the cmov instruction.  I use
signal handling instead of reading /proc/cpuinfo.

[ jdike - Fiddled the asm to make it obvious that it didn't mess with
any in-use registers and made test_for_host_cmov void ]

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/common-offsets.h |  1 +
 arch/um/sys-i386/bugs.c          | 40 +++++++++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/arch/um/include/common-offsets.h b/arch/um/include/common-offsets.h
index 0edab69..b54bd35 100644
--- a/arch/um/include/common-offsets.h
+++ b/arch/um/include/common-offsets.h
@@ -18,6 +18,7 @@ DEFINE_STR(UM_KERN_WARNING, KERN_WARNING);
 DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE);
 DEFINE_STR(UM_KERN_INFO, KERN_INFO);
 DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG);
+DEFINE_STR(UM_KERN_CONT, KERN_CONT);
 
 DEFINE(UM_ELF_CLASS, ELF_CLASS);
 DEFINE(UM_ELFCLASS32, ELFCLASS32);
diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index 5ee4c36..797945c 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -10,11 +10,41 @@
 #include "os.h"
 #include "task.h"
 #include "user.h"
+#include "sysdep/archsetjmp.h"
 
 #define MAXTOKEN 64
 
 /* Set during early boot */
 int host_has_cmov = 1;
+static jmp_buf cmov_test_return;
+
+static void cmov_sigill_test_handler(int sig)
+{
+	host_has_cmov = 0;
+	longjmp(cmov_test_return, 1);
+}
+
+static void test_for_host_cmov(void)
+{
+	struct sigaction old, new;
+
+	printk(UM_KERN_INFO "Checking for host processor cmov support...");
+	new.sa_handler = cmov_sigill_test_handler;
+
+	/* Make sure that SIGILL is enabled after the handler longjmps back */
+	new.sa_flags = SA_NODEFER;
+	sigemptyset(&new.sa_mask);
+	sigaction(SIGILL, &new, &old);
+
+	if (setjmp(cmov_test_return) == 0) {
+		unsigned long foo = 0;
+		__asm__ __volatile__("cmovz %0, %1" : "=r" (foo) : "0" (foo));
+		printk(UM_KERN_CONT "Yes\n");
+	} else
+		printk(UM_KERN_CONT "No\n");
+
+	sigaction(SIGILL, &old, &new);
+}
 
 static char token(int fd, char *buf, int len, char stop)
 {
@@ -153,15 +183,7 @@ void arch_init_thread(void)
 
 void arch_check_bugs(void)
 {
-	int have_it;
-
-	if (os_access("/proc/cpuinfo", OS_ACC_R_OK) < 0) {
-		printk(UM_KERN_ERR "/proc/cpuinfo not available - skipping CPU "
-		       "capability checks\n");
-		return;
-	}
-	if (check_cpu_flag("cmov", &have_it))
-		host_has_cmov = have_it;
+	test_for_host_cmov();
 }
 
 int arch_handle_signal(int sig, struct uml_pt_regs *regs)
-- 
cgit v1.1


From 06d9bd3ad6da2422f838fd11d5d5348fa579bdb2 Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:39 -0800
Subject: uml: remove now unused code

This patch finishes what the previous one started.  The code was not used
after my first patch, and now can be removed with ease.

[ jdike - also deleted the #if 0 lcall stuff ]

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/sys-i386/bugs.c | 137 ------------------------------------------------
 1 file changed, 137 deletions(-)

diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index 797945c..fc99118 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -12,8 +12,6 @@
 #include "user.h"
 #include "sysdep/archsetjmp.h"
 
-#define MAXTOKEN 64
-
 /* Set during early boot */
 int host_has_cmov = 1;
 static jmp_buf cmov_test_return;
@@ -46,139 +44,8 @@ static void test_for_host_cmov(void)
 	sigaction(SIGILL, &old, &new);
 }
 
-static char token(int fd, char *buf, int len, char stop)
-{
-	int n;
-	char *ptr, *end, c;
-
-	ptr = buf;
-	end = &buf[len];
-	do {
-		n = os_read_file(fd, ptr, sizeof(*ptr));
-		c = *ptr++;
-		if (n != sizeof(*ptr)) {
-			if (n == 0)
-				return 0;
-			printk(UM_KERN_ERR "Reading /proc/cpuinfo failed, "
-			       "err = %d\n", -n);
-			if (n < 0)
-				return n;
-			else return -EIO;
-		}
-	} while ((c != '\n') && (c != stop) && (ptr < end));
-
-	if (ptr == end) {
-		printk(UM_KERN_ERR "Failed to find '%c' in /proc/cpuinfo\n",
-		       stop);
-		return -1;
-	}
-	*(ptr - 1) = '\0';
-	return c;
-}
-
-static int find_cpuinfo_line(int fd, char *key, char *scratch, int len)
-{
-	int n;
-	char c;
-
-	scratch[len - 1] = '\0';
-	while (1) {
-		c = token(fd, scratch, len - 1, ':');
-		if (c <= 0)
-			return 0;
-		else if (c != ':') {
-			printk(UM_KERN_ERR "Failed to find ':' in "
-			       "/proc/cpuinfo\n");
-			return 0;
-		}
-
-		if (!strncmp(scratch, key, strlen(key)))
-			return 1;
-
-		do {
-			n = os_read_file(fd, &c, sizeof(c));
-			if (n != sizeof(c)) {
-				printk(UM_KERN_ERR "Failed to find newline in "
-				       "/proc/cpuinfo, err = %d\n", -n);
-				return 0;
-			}
-		} while (c != '\n');
-	}
-	return 0;
-}
-
-static int check_cpu_flag(char *feature, int *have_it)
-{
-	char buf[MAXTOKEN], c;
-	int fd, len = ARRAY_SIZE(buf);
-
-	printk(UM_KERN_INFO "Checking for host processor %s support...",
-	       feature);
-	fd = os_open_file("/proc/cpuinfo", of_read(OPENFLAGS()), 0);
-	if (fd < 0) {
-		printk(UM_KERN_ERR "Couldn't open /proc/cpuinfo, err = %d\n",
-		       -fd);
-		return 0;
-	}
-
-	*have_it = 0;
-	if (!find_cpuinfo_line(fd, "flags", buf, ARRAY_SIZE(buf)))
-		goto out;
-
-	c = token(fd, buf, len - 1, ' ');
-	if (c < 0)
-		goto out;
-	else if (c != ' ') {
-		printk(UM_KERN_ERR "Failed to find ' ' in /proc/cpuinfo\n");
-		goto out;
-	}
-
-	while (1) {
-		c = token(fd, buf, len - 1, ' ');
-		if (c < 0)
-			goto out;
-		else if (c == '\n')
-			break;
-
-		if (!strcmp(buf, feature)) {
-			*have_it = 1;
-			goto out;
-		}
-	}
- out:
-	if (*have_it == 0)
-		printk("No\n");
-	else if (*have_it == 1)
-		printk("Yes\n");
-	os_close_file(fd);
-	return 1;
-}
-
-#if 0 /*
-       * This doesn't work in tt mode, plus it's causing compilation problems
-       * for some people.
-       */
-static void disable_lcall(void)
-{
-	struct modify_ldt_ldt_s ldt;
-	int err;
-
-	bzero(&ldt, sizeof(ldt));
-	ldt.entry_number = 7;
-	ldt.base_addr = 0;
-	ldt.limit = 0;
-	err = modify_ldt(1, &ldt, sizeof(ldt));
-	if (err)
-		printk(UM_KERN_ERR "Failed to disable lcall7 - errno = %d\n",
-		       errno);
-}
-#endif
-
 void arch_init_thread(void)
 {
-#if 0
-	disable_lcall();
-#endif
 }
 
 void arch_check_bugs(void)
@@ -209,10 +76,6 @@ int arch_handle_signal(int sig, struct uml_pt_regs *regs)
 	else if (host_has_cmov == 1)
 		panic("SIGILL caused by cmov, which this processor claims to "
 		      "implement");
-	else if (host_has_cmov == -1)
-		panic("SIGILL caused by cmov, couldn't tell if this processor "
-		      "implements it, boot a filesystem compiled for older "
-		      "processors");
 	else panic("Bad value for host_has_cmov (%d)", host_has_cmov);
 	return 0;
 }
-- 
cgit v1.1


From 9226b8384776798986640ce07764d17ba66aa54f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:40 -0800
Subject: uml: further bugs.c tidying

bugs.c, for both i386 and x86_64, can undergo further cleaning -
	The i386 arch_check_bugs only does one thing, so we might as
well inline the cmov checking.
	The i386 includes can be trimmed down a bit.
	arch_init_thread wasn't used, so it is deleted.
	The panics in arch_handle_signal are turned into printks
because the process is about to get segfaulted anyway, so something is
dying no matter what happens here.  Also, the return value was always
the same, so it contained no information, so it can be void instead.
The name is changed to arch_examine_signal because it doesn't handle
anything.
	The caller of arch_handle_signal, relay_signal, does things in
a different order.  The kernel-mode signal check is now first, which
puts everything else together, making things a bit clearer conceptually.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/arch.h    |  2 +-
 arch/um/kernel/trap.c     |  5 ++---
 arch/um/sys-i386/bugs.c   | 46 ++++++++++++++++++++--------------------------
 arch/um/sys-x86_64/bugs.c |  7 +------
 4 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/arch/um/include/arch.h b/arch/um/include/arch.h
index 49c601ff..2de92a0 100644
--- a/arch/um/include/arch.h
+++ b/arch/um/include/arch.h
@@ -10,6 +10,6 @@
 
 extern void arch_check_bugs(void);
 extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs);
-extern int arch_handle_signal(int sig, struct uml_pt_regs *regs);
+extern void arch_examine_signal(int sig, struct uml_pt_regs *regs);
 
 #endif
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index cb3321f..e3a3ab8 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -216,9 +216,6 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 
 void relay_signal(int sig, struct uml_pt_regs *regs)
 {
-	if (arch_handle_signal(sig, regs))
-		return;
-
 	if (!UPT_IS_USER(regs)) {
 		if (sig == SIGBUS)
 			printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
@@ -226,6 +223,8 @@ void relay_signal(int sig, struct uml_pt_regs *regs)
 		panic("Kernel mode signal %d", sig);
 	}
 
+	arch_examine_signal(sig, regs);
+
 	current->thread.arch.faultinfo = *UPT_FAULTINFO(regs);
 	force_sig(sig, current);
 }
diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index fc99118..b0cb052 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -3,14 +3,12 @@
  * Licensed under the GPL
  */
 
-#include <errno.h>
 #include <signal.h>
-#include <string.h>
 #include "kern_constants.h"
-#include "os.h"
+#include "longjmp.h"
 #include "task.h"
 #include "user.h"
-#include "sysdep/archsetjmp.h"
+#include "sysdep/ptrace.h"
 
 /* Set during early boot */
 int host_has_cmov = 1;
@@ -22,7 +20,7 @@ static void cmov_sigill_test_handler(int sig)
 	longjmp(cmov_test_return, 1);
 }
 
-static void test_for_host_cmov(void)
+void arch_check_bugs(void)
 {
 	struct sigaction old, new;
 
@@ -44,16 +42,7 @@ static void test_for_host_cmov(void)
 	sigaction(SIGILL, &old, &new);
 }
 
-void arch_init_thread(void)
-{
-}
-
-void arch_check_bugs(void)
-{
-	test_for_host_cmov();
-}
-
-int arch_handle_signal(int sig, struct uml_pt_regs *regs)
+void arch_examine_signal(int sig, struct uml_pt_regs *regs)
 {
 	unsigned char tmp[2];
 
@@ -62,20 +51,25 @@ int arch_handle_signal(int sig, struct uml_pt_regs *regs)
 	 * SIGILL in init.
 	 */
 	if ((sig != SIGILL) || (TASK_PID(get_current()) != 1))
-		return 0;
+		return;
+
+	if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) {
+		printk(UM_KERN_ERR "SIGILL in init, could not read "
+		       "instructions!\n");
+		return;
+	}
 
-	if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2))
-		panic("SIGILL in init, could not read instructions!\n");
 	if ((tmp[0] != 0x0f) || ((tmp[1] & 0xf0) != 0x40))
-		return 0;
+		return;
 
 	if (host_has_cmov == 0)
-		panic("SIGILL caused by cmov, which this processor doesn't "
-		      "implement, boot a filesystem compiled for older "
-		      "processors");
+		printk(UM_KERN_ERR "SIGILL caused by cmov, which this "
+		       "processor doesn't implement.  Boot a filesystem "
+		       "compiled for older processors");
 	else if (host_has_cmov == 1)
-		panic("SIGILL caused by cmov, which this processor claims to "
-		      "implement");
-	else panic("Bad value for host_has_cmov (%d)", host_has_cmov);
-	return 0;
+		printk(UM_KERN_ERR "SIGILL caused by cmov, which this "
+		       "processor claims to implement");
+	else
+		printk(UM_KERN_ERR "Bad value for host_has_cmov (%d)",
+			host_has_cmov);
 }
diff --git a/arch/um/sys-x86_64/bugs.c b/arch/um/sys-x86_64/bugs.c
index 506b676..44e02ba 100644
--- a/arch/um/sys-x86_64/bugs.c
+++ b/arch/um/sys-x86_64/bugs.c
@@ -6,15 +6,10 @@
 
 #include "sysdep/ptrace.h"
 
-void arch_init_thread(void)
-{
-}
-
 void arch_check_bugs(void)
 {
 }
 
-int arch_handle_signal(int sig, struct uml_pt_regs *regs)
+void arch_examine_signal(int sig, struct uml_pt_regs *regs)
 {
-	return 0;
 }
-- 
cgit v1.1


From c0a9290ecf0dbb89958cb3a3f78964015a7de402 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 4 Feb 2008 22:30:41 -0800
Subject: uml: const and other tidying

This patch also does some improvements for uml code.  Improvements include
dropping unnecessary cast, killing some unnecessary code and still some
constifying for pointers etc..

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/ubd_kern.c             | 6 +++---
 arch/um/include/kern_util.h            | 2 +-
 arch/um/kernel/mem.c                   | 2 +-
 arch/um/kernel/process.c               | 4 +---
 arch/um/os-Linux/drivers/tuntap_user.c | 2 +-
 arch/um/os-Linux/mem.c                 | 7 ++++---
 arch/um/os-Linux/sigio.c               | 2 +-
 7 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 9793e3d..7a252ab 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -229,7 +229,7 @@ static int proc_ide_read_media(char *page, char **start, off_t off, int count,
 	return len;
 }
 
-static void make_ide_entries(char *dev_name)
+static void make_ide_entries(const char *dev_name)
 {
 	struct proc_dir_entry *dir, *ent;
 	char name[64];
@@ -244,7 +244,7 @@ static void make_ide_entries(char *dev_name)
 	ent->data = NULL;
 	ent->read_proc = proc_ide_read_media;
 	ent->write_proc = NULL;
-	sprintf(name,"ide0/%s", dev_name);
+	snprintf(name, sizeof(name), "ide0/%s", dev_name);
 	proc_symlink(dev_name, proc_ide_root, name);
 }
 
@@ -443,7 +443,7 @@ __uml_help(ubd_setup,
 "    cluster filesystem and inappropriate at almost all other times.\n\n"
 );
 
-static int udb_setup(char *str)
+static int udb_setup(const char *str)
 {
 	printk("udb%s specified on command line is almost certainly a ubd -> "
 	       "udb TYPO\n", str);
diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index 74ce8e5..aa27eb0 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -81,7 +81,7 @@ extern void do_uml_exitcalls(void);
 extern int attach_debugger(int idle_pid, int pid, int stop);
 extern int config_gdb(char *str);
 extern int remove_gdb(void);
-extern char *uml_strdup(char *string);
+extern char *uml_strdup(const char *string);
 extern void unprotect_kernel_mem(void);
 extern void protect_kernel_mem(void);
 extern void uml_cleanup(void);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index e3e7241..96072e2 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -65,7 +65,7 @@ static void setup_highmem(unsigned long highmem_start,
 void __init mem_init(void)
 {
 	/* clear the zero-page */
-	memset((void *) empty_zero_page, 0, PAGE_SIZE);
+	memset(empty_zero_page, 0, PAGE_SIZE);
 
 	/* Map in the area just after the brk now that kmalloc is about
 	 * to be turned on.
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index c7ea7f2..91bd68e 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -60,8 +60,6 @@ unsigned long alloc_stack(int order, int atomic)
 	if (atomic)
 		flags = GFP_ATOMIC;
 	page = __get_free_pages(flags, order);
-	if (page == 0)
-		return 0;
 
 	return page;
 }
@@ -331,7 +329,7 @@ void do_uml_exitcalls(void)
 		(*call)();
 }
 
-char *uml_strdup(char *string)
+char *uml_strdup(const char *string)
 {
 	return kstrdup(string, GFP_KERNEL);
 }
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
index 1037a3b6..1d84795 100644
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ b/arch/um/os-Linux/drivers/tuntap_user.c
@@ -148,7 +148,7 @@ static int tuntap_open(void *data)
 		memset(&ifr, 0, sizeof(ifr));
 		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
 		strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name));
-		if (ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0) {
+		if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) {
 			err = -errno;
 			printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n",
 			       errno);
diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
index c3b736a..9674ed1 100644
--- a/arch/um/os-Linux/mem.c
+++ b/arch/um/os-Linux/mem.c
@@ -30,7 +30,7 @@ static char *tempdir = NULL;
 
 static void __init find_tempdir(void)
 {
-	char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL };
+	const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL };
 	int i;
 	char *dir = NULL;
 
@@ -59,9 +59,10 @@ static void __init find_tempdir(void)
  * read the file as needed.  If there's an error, -errno is returned;
  * if the end of the file is reached, 0 is returned.
  */
-static int next(int fd, char *buf, int size, char c)
+static int next(int fd, char *buf, size_t size, char c)
 {
-	int n, len;
+	ssize_t n;
+	size_t len;
 	char *ptr;
 
 	while((ptr = strchr(buf, c)) == NULL){
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index dc03e9c..7243f57 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -407,7 +407,7 @@ static int async_pty(int master, int slave)
 	if((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0))
 		return -errno;
 
-	return(0);
+	return 0;
 }
 
 static void __init check_one_sigio(void (*proc)(int, int))
-- 
cgit v1.1


From 02bff1f091db446b07a9275cbcd048264dcf2260 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:41 -0800
Subject: uml: SMP needs to depend on BROKEN for now

SMP still needs to depend on BROKEN for now.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 5261db8..58f5a14 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -189,8 +189,7 @@ config MAGIC_SYSRQ
 config SMP
 	bool "Symmetric multi-processing support (EXPERIMENTAL)"
 	default n
-	#SMP_BROKEN is for x86_64.
-	depends on EXPERIMENTAL && (!SMP_BROKEN || (BROKEN && SMP_BROKEN))
+	depends on BROKEN
 	help
 	  This option enables UML SMP support.
 	  It is NOT related to having a real SMP box. Not directly, at least.
-- 
cgit v1.1


From 054211acad0cb911177507a039d8e7a25bff084d Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:42 -0800
Subject: uml: GPROF needs to depend on FRAME_POINTER

This is a short Kconfig fix for a problem in User Mode Linux.  Frame pointers
are required for gprof support to work.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug
index 40456f4..8fce5e5 100644
--- a/arch/um/Kconfig.debug
+++ b/arch/um/Kconfig.debug
@@ -4,7 +4,7 @@ source "lib/Kconfig.debug"
 
 config GPROF
 	bool "Enable gprof support"
-	depends on DEBUG_INFO
+	depends on DEBUG_INFO && FRAME_POINTER
 	help
 	  This allows profiling of a User-Mode Linux kernel with the gprof
 	  utility.
-- 
cgit v1.1


From acb2cf34cf624b9b3ab20c2c83543146128a4dad Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:42 -0800
Subject: uml: console driver cleanups

Console driver cleanups -
	Changed an instance of foo = bar + foo to foo += bar
	Removed checks of tty->stopped - I don't think the low-level
driver has any business looking at that
	Removed an annoying warning

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/line.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 83bf15a..5cff653 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -48,7 +48,7 @@ static int write_room(struct line *line)
 	n = line->head - line->tail;
 
 	if (n <= 0)
-		n = LINE_BUFSIZE + n; /* The other case */
+		n += LINE_BUFSIZE; /* The other case */
 	return n - 1;
 }
 
@@ -58,17 +58,10 @@ int line_write_room(struct tty_struct *tty)
 	unsigned long flags;
 	int room;
 
-	if (tty->stopped)
-		return 0;
-
 	spin_lock_irqsave(&line->lock, flags);
 	room = write_room(line);
 	spin_unlock_irqrestore(&line->lock, flags);
 
-	/*XXX: Warning to remove */
-	if (0 == room)
-		printk(KERN_DEBUG "%s: %s: no room left in buffer\n",
-		       __FUNCTION__,tty->name);
 	return room;
 }
 
@@ -79,8 +72,7 @@ int line_chars_in_buffer(struct tty_struct *tty)
 	int ret;
 
 	spin_lock_irqsave(&line->lock, flags);
-
-	/*write_room subtracts 1 for the needed NULL, so we readd it.*/
+	/* write_room subtracts 1 for the needed NULL, so we readd it.*/
 	ret = LINE_BUFSIZE - (write_room(line) + 1);
 	spin_unlock_irqrestore(&line->lock, flags);
 
@@ -184,10 +176,6 @@ void line_flush_buffer(struct tty_struct *tty)
 	unsigned long flags;
 	int err;
 
-	/*XXX: copied from line_write, verify if it is correct!*/
-	if (tty->stopped)
-		return;
-
 	spin_lock_irqsave(&line->lock, flags);
 	err = flush_buffer(line);
 	spin_unlock_irqrestore(&line->lock, flags);
@@ -213,9 +201,6 @@ int line_write(struct tty_struct *tty, const unsigned char *buf, int len)
 	unsigned long flags;
 	int n, ret = 0;
 
-	if (tty->stopped)
-		return 0;
-
 	spin_lock_irqsave(&line->lock, flags);
 	if (line->head != line->tail)
 		ret = buffer_data(line, buf, len);
-- 
cgit v1.1


From ee56314b79039b669396ee04aac3e342cd2e5a1f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:43 -0800
Subject: uml: clone.c tidying

clone.c needed some style attention -
	updated copyright
	include trimming
	coding style

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/skas/clone.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c
index 8d07a7a..2c8583c 100644
--- a/arch/um/kernel/skas/clone.c
+++ b/arch/um/kernel/skas/clone.c
@@ -1,17 +1,20 @@
-#include <sched.h>
+/*
+ * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
 #include <signal.h>
-#include <sys/mman.h>
-#include <sys/time.h>
+#include <sched.h>
 #include <asm/unistd.h>
+#include <sys/time.h>
 #include "as-layout.h"
+#include "kern_constants.h"
 #include "ptrace_user.h"
-#include "skas.h"
 #include "stub-data.h"
-#include "uml-config.h"
 #include "sysdep/stub.h"
-#include "kern_constants.h"
 
-/* This is in a separate file because it needs to be compiled with any
+/*
+ * This is in a separate file because it needs to be compiled with any
  * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled
  *
  * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize
@@ -26,25 +29,26 @@ stub_clone_handler(void)
 
 	err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD,
 			    STUB_DATA + UM_KERN_PAGE_SIZE / 2 - sizeof(void *));
-	if(err != 0)
+	if (err != 0)
 		goto out;
 
 	err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
-	if(err)
+	if (err)
 		goto out;
 
-	err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL, 
+	err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL,
 			    (long) &data->timer, 0);
-	if(err)
+	if (err)
 		goto out;
 
 	remap_stack(data->fd, data->offset);
 	goto done;
 
  out:
-	/* save current result. 
-	 * Parent: pid; 
-	 * child: retcode of mmap already saved and it jumps around this 
+	/*
+	 * save current result.
+	 * Parent: pid;
+	 * child: retcode of mmap already saved and it jumps around this
 	 * assignment
 	 */
 	data->err = err;
-- 
cgit v1.1


From 4bdf8bc4a15d4540d71db9fa01955db5edcf89ec Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:44 -0800
Subject: uml: borrow const.h techniques

Suggested by Geert Uytterhoeven - use const.h to get constants that are usable
in both C and assembly.  I can't include it directly since this code can't
include kernel headers.  const.h is also for numeric constants that can be
typed by tacking a "UL" or similar on the end.  The constants here have to be
typed by casting them.

So, the relevant parts of const.h are copied here and modified in order to
allow the constants to be uncasted in assembly and casted in C.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/as-layout.h | 34 +++++++++++++++++++++-------------
 arch/um/sys-i386/stub.S     |  8 ++++----
 arch/um/sys-x86_64/stub.S   |  8 ++++----
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/arch/um/include/as-layout.h b/arch/um/include/as-layout.h
index a5cdf95..2b859e0 100644
--- a/arch/um/include/as-layout.h
+++ b/arch/um/include/as-layout.h
@@ -10,23 +10,31 @@
 #include "kern_constants.h"
 
 /*
- * Assembly doesn't want any casting, but C does, so define these
- * without casts here, and define new symbols with casts inside the C
- * section.
+ * Stolen from linux/const.h, which can't be directly included since
+ * this is used in userspace code, which has no access to the kernel
+ * headers.  Changed to be suitable for adding casts to the start,
+ * rather than "UL" to the end.
  */
-#define ASM_STUB_CODE (UML_CONFIG_TOP_ADDR - 2 * UM_KERN_PAGE_SIZE)
-#define ASM_STUB_DATA (UML_CONFIG_TOP_ADDR - UM_KERN_PAGE_SIZE)
-#define ASM_STUB_START ASM_STUB_CODE
 
-/*
- * This file is included by the assembly stubs, which just want the
- * definitions above.
+/* Some constant macros are used in both assembler and
+ * C code.  Therefore we cannot annotate them always with
+ * 'UL' and other type specifiers unilaterally.  We
+ * use the following macros to deal with this.
  */
-#ifndef __ASSEMBLY__
 
-#define STUB_CODE ((unsigned long) ASM_STUB_CODE)
-#define STUB_DATA ((unsigned long) ASM_STUB_DATA)
-#define STUB_START ((unsigned long) ASM_STUB_START)
+#ifdef __ASSEMBLY__
+#define _AC(X, Y)	(Y)
+#else
+#define __AC(X, Y)	(X (Y))
+#define _AC(X, Y)	__AC(X, Y)
+#endif
+
+#define STUB_CODE _AC((unsigned long), \
+		      UML_CONFIG_TOP_ADDR - 2 * UM_KERN_PAGE_SIZE)
+#define STUB_DATA _AC((unsigned long), UML_CONFIG_TOP_ADDR - UM_KERN_PAGE_SIZE)
+#define STUB_START _AC(, STUB_CODE)
+
+#ifndef __ASSEMBLY__
 
 #include "sysdep/ptrace.h"
 
diff --git a/arch/um/sys-i386/stub.S b/arch/um/sys-i386/stub.S
index e730772..7699e89 100644
--- a/arch/um/sys-i386/stub.S
+++ b/arch/um/sys-i386/stub.S
@@ -7,7 +7,7 @@
 	.globl batch_syscall_stub
 batch_syscall_stub:
 	/* load pointer to first operation */
-	mov	$(ASM_STUB_DATA+8), %esp
+	mov	$(STUB_DATA+8), %esp
 
 again:
 	/* load length of additional data */
@@ -15,12 +15,12 @@ again:
 
 	/* if(length == 0) : end of list */
 	/* write possible 0 to header */
-	mov	%eax, ASM_STUB_DATA+4
+	mov	%eax, STUB_DATA+4
 	cmpl	$0, %eax
 	jz	done
 
 	/* save current pointer */
-	mov	%esp, ASM_STUB_DATA+4
+	mov	%esp, STUB_DATA+4
 
 	/* skip additional data */
 	add	%eax, %esp
@@ -46,7 +46,7 @@ again:
 
 done:
 	/* save return value */
-	mov	%eax, ASM_STUB_DATA
+	mov	%eax, STUB_DATA
 
 	/* stop */
 	int3
diff --git a/arch/um/sys-x86_64/stub.S b/arch/um/sys-x86_64/stub.S
index 4afe204..5687687 100644
--- a/arch/um/sys-x86_64/stub.S
+++ b/arch/um/sys-x86_64/stub.S
@@ -8,18 +8,18 @@ syscall_stub:
 	/* We don't have 64-bit constants, so this constructs the address
 	 * we need.
 	 */
-	movq	$(ASM_STUB_DATA >> 32), %rbx
+	movq	$(STUB_DATA >> 32), %rbx
 	salq	$32, %rbx
-	movq	$(ASM_STUB_DATA & 0xffffffff), %rcx
+	movq	$(STUB_DATA & 0xffffffff), %rcx
 	or	%rcx, %rbx
 	movq	%rax, (%rbx)
 	int3
 
 	.globl batch_syscall_stub
 batch_syscall_stub:
-	mov	$(ASM_STUB_DATA >> 32), %rbx
+	mov	$(STUB_DATA >> 32), %rbx
 	sal	$32, %rbx
-	mov	$(ASM_STUB_DATA & 0xffffffff), %rax
+	mov	$(STUB_DATA & 0xffffffff), %rax
 	or	%rax, %rbx
 	/* load pointer to first operation */
 	mov	%rbx, %rsp
-- 
cgit v1.1


From e725a9b0f5db0fa511d7667da6c4bfb1a6d3c7e4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:44 -0800
Subject: uml: delete some unused headers

Robert Day noticed a few unused headers in UML, so this gets rid of them.

Cc: "Robert P. J. Day" <rpjday@crashcourse.ca>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/misc_constants.h |  6 ------
 arch/um/include/signal_kern.h    | 22 ----------------------
 arch/um/include/skas/mode-skas.h | 11 -----------
 3 files changed, 39 deletions(-)
 delete mode 100644 arch/um/include/misc_constants.h
 delete mode 100644 arch/um/include/signal_kern.h
 delete mode 100644 arch/um/include/skas/mode-skas.h

diff --git a/arch/um/include/misc_constants.h b/arch/um/include/misc_constants.h
deleted file mode 100644
index 989bc08..0000000
--- a/arch/um/include/misc_constants.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __MISC_CONSTANT_H_
-#define __MISC_CONSTANT_H_
-
-#include <user_constants.h>
-
-#endif
diff --git a/arch/um/include/signal_kern.h b/arch/um/include/signal_kern.h
deleted file mode 100644
index aeb5d5a..0000000
--- a/arch/um/include/signal_kern.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* 
- * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
- */
-
-#ifndef __SIGNAL_KERN_H__
-#define __SIGNAL_KERN_H__
-
-extern int have_signals(void *t);
-
-#endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/skas/mode-skas.h b/arch/um/include/skas/mode-skas.h
deleted file mode 100644
index e065feb..0000000
--- a/arch/um/include/skas/mode-skas.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com)
- * Licensed under the GPL
- */
-
-#ifndef __MODE_SKAS_H__
-#define __MODE_SKAS_H__
-
-extern void kill_off_processes_skas(void);
-
-#endif
-- 
cgit v1.1


From 0ba7fe03b638a084a4e15e21d2e585ba321ad9c8 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:45 -0800
Subject: uml: allow LFLAGS on command line

Allow LFLAGS to be given to make and have the expected effect.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/um/Makefile b/arch/um/Makefile
index ba6813a..e44fd6a 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -130,7 +130,9 @@ CPPFLAGS_vmlinux.lds = -U$(SUBARCH) -DSTART=$(START) -DELF_ARCH=$(ELF_ARCH) \
 # The wrappers will select whether using "malloc" or the kernel allocator.
 LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
 
-CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS)
+LD_FLAGS_CMDLINE = $(foreach opt,$(LFLAGS),-Wl,$(opt))
+
+CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE)
 define cmd_vmlinux__
 	$(CC) $(CFLAGS_vmlinux) -o $@ \
 	-Wl,-T,$(vmlinux-lds) $(vmlinux-init) \
-- 
cgit v1.1


From edea138584d7586a3b93b6d5ab5ec021d18e11e9 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:46 -0800
Subject: uml: tidy kern_util.h

Tidy kern_util.h.  It turns out that most of the function declarations
aren't used, so they can go away.  os.h no longer includes
kern_util.h, so files which got it through os.h now need to include it
directly.  A number of other files never needed it, so these includes
are deleted.

The structure which was used to pass signal handlers from the kernel
side to the userspace side is gone.  Instead, the handlers are
declared here, and used directly from libc code.  This allows
arch/um/os-Linux/trap.c to be deleted, with its remnants being moved
to arch/um/os-Linux/skas/trap.c.

arch/um/os-Linux/tty.c had its inclusions changed, and it needed some
style attention, so it got tidied.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/line.c                 |   1 +
 arch/um/drivers/ssl.c                  |   1 -
 arch/um/drivers/stdio_console.c        |   1 -
 arch/um/drivers/ubd_kern.c             |   1 +
 arch/um/drivers/ubd_user.c             |   1 -
 arch/um/include/kern_util.h            | 118 ++++++++++-----------------------
 arch/um/include/os.h                   |   4 --
 arch/um/kernel/initrd.c                |   1 -
 arch/um/kernel/reboot.c                |   1 +
 arch/um/kernel/smp.c                   |   1 -
 arch/um/kernel/trap.c                  |  15 +----
 arch/um/kernel/um_arch.c               |   6 +-
 arch/um/os-Linux/Makefile              |   4 +-
 arch/um/os-Linux/aio.c                 |   1 +
 arch/um/os-Linux/drivers/tuntap_user.c |   1 +
 arch/um/os-Linux/file.c                |   1 -
 arch/um/os-Linux/irq.c                 |   1 -
 arch/um/os-Linux/mem.c                 |   1 -
 arch/um/os-Linux/signal.c              |   1 +
 arch/um/os-Linux/skas/process.c        |   1 +
 arch/um/os-Linux/skas/trap.c           |  18 +++--
 arch/um/os-Linux/trap.c                |  23 -------
 arch/um/os-Linux/tty.c                 |  57 ++++++++--------
 arch/um/os-Linux/tty_log.c             |   1 -
 arch/um/sys-i386/bugs.c                |   1 +
 25 files changed, 89 insertions(+), 173 deletions(-)
 delete mode 100644 arch/um/os-Linux/trap.c

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 5cff653..fac058b 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -8,6 +8,7 @@
 #include "chan_kern.h"
 #include "irq_kern.h"
 #include "irq_user.h"
+#include "kern_util.h"
 #include "os.h"
 
 #define LINE_BUFSIZE 4096
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 875d60d..f1786e6 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -15,7 +15,6 @@
 #include "line.h"
 #include "ssl.h"
 #include "chan_kern.h"
-#include "kern_util.h"
 #include "kern.h"
 #include "init.h"
 #include "irq_user.h"
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 656036e..cec0c33 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -22,7 +22,6 @@
 #include "stdio_console.h"
 #include "line.h"
 #include "chan_kern.h"
-#include "kern_util.h"
 #include "irq_user.h"
 #include "mconsole_kern.h"
 #include "init.h"
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 7a252ab..4fe4d6b 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -49,6 +49,7 @@
 #include "irq_user.h"
 #include "irq_kern.h"
 #include "ubd_user.h"
+#include "kern_util.h"
 #include "os.h"
 #include "mem.h"
 #include "mem_kern.h"
diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c
index 48fc745..b591bb9 100644
--- a/arch/um/drivers/ubd_user.c
+++ b/arch/um/drivers/ubd_user.c
@@ -16,7 +16,6 @@
 #include <sys/mman.h>
 #include <sys/param.h>
 #include "asm/types.h"
-#include "kern_util.h"
 #include "user.h"
 #include "ubd_user.h"
 #include "os.h"
diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index aa27eb0..8fadf89 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -9,107 +9,59 @@
 #include "sysdep/ptrace.h"
 #include "sysdep/faultinfo.h"
 
-typedef void (*kern_hndl)(int, struct uml_pt_regs *);
-
-struct kern_handlers {
-	kern_hndl relay_signal;
-	kern_hndl winch;
-	kern_hndl bus_handler;
-	kern_hndl page_fault;
-	kern_hndl sigio_handler;
-	kern_hndl timer_handler;
-};
-
-extern const struct kern_handlers handlinfo_kern;
-
 extern int ncpus;
-extern char *gdb_init;
 extern int kmalloc_ok;
-extern int jail;
 extern int nsyscalls;
 
-#define UML_ROUND_DOWN(addr) ((void *)(((unsigned long) addr) & PAGE_MASK))
 #define UML_ROUND_UP(addr) \
-	UML_ROUND_DOWN(((unsigned long) addr) + PAGE_SIZE - 1)
+	((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK)
 
-extern int kernel_fork(unsigned long flags, int (*fn)(void *), void * arg);
-extern int kernel_thread_proc(void *data);
-extern void syscall_segv(int sig);
-extern int current_pid(void);
 extern unsigned long alloc_stack(int order, int atomic);
+extern void free_stack(unsigned long stack, int order);
+
 extern int do_signal(void);
-extern int is_stack_fault(unsigned long sp);
+extern void copy_sc(struct uml_pt_regs *regs, void *from);
+extern void interrupt_end(void);
+extern void relay_signal(int sig, struct uml_pt_regs *regs);
+
 extern unsigned long segv(struct faultinfo fi, unsigned long ip,
 			  int is_user, struct uml_pt_regs *regs);
 extern int handle_page_fault(unsigned long address, unsigned long ip,
 			     int is_write, int is_user, int *code_out);
-extern void syscall_ready(void);
-extern void set_tracing(void *t, int tracing);
-extern int is_tracing(void *task);
-extern int segv_syscall(void);
-extern void kern_finish_exec(void *task, int new_pid, unsigned long stack);
-extern unsigned long page_mask(void);
-extern int need_finish_fork(void);
-extern void free_stack(unsigned long stack, int order);
-extern void add_input_request(int op, void (*proc)(int), void *arg);
-extern char *current_cmd(void);
-extern void timer_handler(int sig, struct uml_pt_regs *regs);
-extern int set_signals(int enable);
-extern int pid_to_processor_id(int pid);
-extern void deliver_signals(void *t);
-extern int next_trap_index(int max);
-extern void default_idle(void);
-extern void finish_fork(void);
-extern void paging_init(void);
-extern void init_flush_vm(void);
-extern void *syscall_sp(void *t);
-extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
+
 extern unsigned int do_IRQ(int irq, struct uml_pt_regs *regs);
-extern void interrupt_end(void);
-extern void initial_thread_cb(void (*proc)(void *), void *arg);
-extern int debugger_signal(int status, int pid);
-extern void debugger_parent_signal(int status, int pid);
-extern void child_signal(int pid, int status);
-extern int init_ptrace_proxy(int idle_pid, int startup, int stop);
-extern int init_parent_proxy(int pid);
-extern int singlestepping(void *t);
-extern void check_stack_overflow(void *ptr);
-extern void relay_signal(int sig, struct uml_pt_regs *regs);
-extern int user_context(unsigned long sp);
-extern void timer_irq(struct uml_pt_regs *regs);
-extern void do_uml_exitcalls(void);
-extern int attach_debugger(int idle_pid, int pid, int stop);
-extern int config_gdb(char *str);
-extern int remove_gdb(void);
-extern char *uml_strdup(const char *string);
-extern void unprotect_kernel_mem(void);
-extern void protect_kernel_mem(void);
-extern void uml_cleanup(void);
-extern void lock_signalled_task(void *t);
-extern void IPI_handler(int cpu);
-extern int jail_setup(char *line, int *add);
-extern void *get_init_task(void);
-extern int clear_user_proc(void *buf, int size);
-extern int copy_to_user_proc(void *to, void *from, int size);
-extern int copy_from_user_proc(void *to, void *from, int size);
-extern int strlen_user_proc(char *str);
-extern long execute_syscall(void *r);
 extern int smp_sigio_handler(void);
-extern void *get_current(void);
-extern struct task_struct *get_task(int pid, int require);
-extern void machine_halt(void);
+extern void initial_thread_cb(void (*proc)(void *), void *arg);
 extern int is_syscall(unsigned long addr);
+extern void timer_handler(int sig, struct uml_pt_regs *regs);
 
-extern void free_irq(unsigned int, void *);
-extern int cpu(void);
+extern void timer_handler(int sig, struct uml_pt_regs *regs);
+
+extern int start_uml(void);
+extern void paging_init(void);
 
-extern void time_init_kern(void);
+extern void uml_cleanup(void);
+extern void do_uml_exitcalls(void);
 
-/* Are we disallowed to sleep? Used to choose between GFP_KERNEL and GFP_ATOMIC. */
+/*
+ * Are we disallowed to sleep? Used to choose between GFP_KERNEL and
+ * GFP_ATOMIC.
+ */
 extern int __cant_sleep(void);
-extern void sigio_handler(int sig, struct uml_pt_regs *regs);
-extern void copy_sc(struct uml_pt_regs *regs, void *from);
+extern void *get_current(void);
+extern int copy_from_user_proc(void *to, void *from, int size);
+extern int cpu(void);
+extern char *uml_strdup(const char *string);
+
 extern unsigned long to_irq_stack(unsigned long *mask_out);
-unsigned long from_irq_stack(int nested);
-extern int start_uml(void);
+extern unsigned long from_irq_stack(int nested);
+
+extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
+extern int singlestepping(void *t);
+
+extern void segv_handler(int sig, struct uml_pt_regs *regs);
+extern void bus_handler(int sig, struct uml_pt_regs *regs);
+extern void winch(int sig, struct uml_pt_regs *regs);
+
+
 #endif
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 82e5aea..55ca073 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -8,7 +8,6 @@
 
 #include <stdarg.h>
 #include "irq_user.h"
-#include "kern_util.h"
 #include "longjmp.h"
 #include "mm_id.h"
 #include "sysdep/tls.h"
@@ -237,9 +236,6 @@ extern void unblock_signals(void);
 extern int get_signals(void);
 extern int set_signals(int enable);
 
-/* trap.c */
-extern void os_fill_handlinfo(struct kern_handlers h);
-
 /* util.c */
 extern void stack_protections(unsigned long address);
 extern int raw(int fd);
diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c
index 16dc43e..ae31c62 100644
--- a/arch/um/kernel/initrd.c
+++ b/arch/um/kernel/initrd.c
@@ -7,7 +7,6 @@
 #include "linux/bootmem.h"
 #include "linux/initrd.h"
 #include "asm/types.h"
-#include "kern_util.h"
 #include "initrd.h"
 #include "init.h"
 #include "os.h"
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index 04cebcf..1ce49cd 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -4,6 +4,7 @@
  */
 
 #include "linux/sched.h"
+#include "kern_util.h"
 #include "os.h"
 #include "skas.h"
 
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 36d89cf..a12e5bd 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -21,7 +21,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 #include "asm/smp.h"
 #include "asm/processor.h"
 #include "asm/spinlock.h"
-#include "kern_util.h"
 #include "kern.h"
 #include "irq_user.h"
 #include "os.h"
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index e3a3ab8..ff405a4 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -128,7 +128,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
 	force_sig_info(SIGSEGV, &si, current);
 }
 
-static void segv_handler(int sig, struct uml_pt_regs *regs)
+void segv_handler(int sig, struct uml_pt_regs *regs)
 {
 	struct faultinfo * fi = UPT_FAULTINFO(regs);
 
@@ -229,27 +229,18 @@ void relay_signal(int sig, struct uml_pt_regs *regs)
 	force_sig(sig, current);
 }
 
-static void bus_handler(int sig, struct uml_pt_regs *regs)
+void bus_handler(int sig, struct uml_pt_regs *regs)
 {
 	if (current->thread.fault_catcher != NULL)
 		UML_LONGJMP(current->thread.fault_catcher, 1);
 	else relay_signal(sig, regs);
 }
 
-static void winch(int sig, struct uml_pt_regs *regs)
+void winch(int sig, struct uml_pt_regs *regs)
 {
 	do_IRQ(WINCH_IRQ, regs);
 }
 
-const struct kern_handlers handlinfo_kern = {
-	.relay_signal = relay_signal,
-	.winch = winch,
-	.bus_handler = bus_handler,
-	.page_fault = segv_handler,
-	.sigio_handler = sigio_handler,
-	.timer_handler = timer_handler
-};
-
 void trap_init(void)
 {
 }
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index f1c7139..9b5d2cd 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -16,6 +16,7 @@
 #include "as-layout.h"
 #include "init.h"
 #include "kern.h"
+#include "kern_util.h"
 #include "mem_user.h"
 #include "os.h"
 #include "skas.h"
@@ -280,11 +281,6 @@ int __init linux_main(int argc, char **argv)
 
 	host_task_size = set_task_sizes_skas(&task_size);
 
-	/*
-	 * Setting up handlers to 'sig_info' struct
-	 */
-	os_fill_handlinfo(handlinfo_kern);
-
 	brk_start = (unsigned long) sbrk(0);
 
 	/*
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index 8e129af..8a48d6a 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -4,7 +4,7 @@
 #
 
 obj-y = aio.o elf_aux.o execvp.o file.o helper.o irq.o main.o mem.o process.o \
-	registers.o sigio.o signal.o start_up.o time.o trap.o tty.o uaccess.o \
+	registers.o sigio.o signal.o start_up.o time.o tty.o uaccess.o \
 	umid.o tls.o user_syms.o util.o drivers/ sys-$(SUBARCH)/ skas/
 
 obj-$(CONFIG_TTY_LOG) += tty_log.o
@@ -12,7 +12,7 @@ user-objs-$(CONFIG_TTY_LOG) += tty_log.o
 
 USER_OBJS := $(user-objs-y) aio.o elf_aux.o execvp.o file.o helper.o irq.o \
 	main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \
-	trap.o tty.o tls.o uaccess.o umid.o util.o
+	tty.o tls.o uaccess.o umid.o util.o
 
 CFLAGS_user_syms.o += -DSUBARCH_$(SUBARCH)
 
diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c
index 93dc0c8..b8d8c9c 100644
--- a/arch/um/os-Linux/aio.c
+++ b/arch/um/os-Linux/aio.c
@@ -12,6 +12,7 @@
 #include "aio.h"
 #include "init.h"
 #include "kern_constants.h"
+#include "kern_util.h"
 #include "os.h"
 #include "user.h"
 
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
index 1d84795..7739e29 100644
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ b/arch/um/os-Linux/drivers/tuntap_user.c
@@ -14,6 +14,7 @@
 #include <sys/wait.h>
 #include <sys/uio.h>
 #include "kern_constants.h"
+#include "kern_util.h"
 #include "os.h"
 #include "tuntap.h"
 #include "user.h"
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index c3bb5ce..9387cb1 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -17,7 +17,6 @@
 #include <sys/uio.h>
 #include "os.h"
 #include "user.h"
-#include "kern_util.h"
 
 static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
 {
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index 6aa6f95..a26e066 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -11,7 +11,6 @@
 #include <sys/poll.h>
 #include <sys/types.h>
 #include <sys/time.h>
-#include "kern_util.h"
 #include "user.h"
 #include "process.h"
 #include "sigio.h"
diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
index 9674ed1..eedc2d8 100644
--- a/arch/um/os-Linux/mem.c
+++ b/arch/um/os-Linux/mem.c
@@ -9,7 +9,6 @@
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/statfs.h>
-#include "kern_util.h"
 #include "user.h"
 #include "mem_user.h"
 #include "init.h"
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 37302e8..7ff8f57 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -9,6 +9,7 @@
 #include <errno.h>
 #include <signal.h>
 #include <strings.h>
+#include "kern_util.h"
 #include "os.h"
 #include "sysdep/barrier.h"
 #include "sysdep/sigcontext.h"
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index e8b7a97..765cfa6 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -15,6 +15,7 @@
 #include "as-layout.h"
 #include "chan_user.h"
 #include "kern_constants.h"
+#include "kern_util.h"
 #include "mem.h"
 #include "os.h"
 #include "process.h"
diff --git a/arch/um/os-Linux/skas/trap.c b/arch/um/os-Linux/skas/trap.c
index 3b1b924..a19a74f 100644
--- a/arch/um/os-Linux/skas/trap.c
+++ b/arch/um/os-Linux/skas/trap.c
@@ -3,22 +3,26 @@
  * Licensed under the GPL
  */
 
-#if 0
-#include "kern_util.h"
-#include "skas.h"
-#include "ptrace_user.h"
-#include "sysdep/ptrace_user.h"
-#endif
-
 #include <errno.h>
 #include <signal.h>
 #include "sysdep/ptrace.h"
 #include "kern_constants.h"
 #include "as-layout.h"
+#include "kern_util.h"
 #include "os.h"
 #include "sigcontext.h"
 #include "task.h"
 
+void (*sig_info[NSIG])(int, struct uml_pt_regs *) = {
+	[SIGTRAP]	= relay_signal,
+	[SIGFPE]	= relay_signal,
+	[SIGILL]	= relay_signal,
+	[SIGWINCH]	= winch,
+	[SIGBUS]	= bus_handler,
+	[SIGSEGV]	= segv_handler,
+	[SIGIO]		= sigio_handler,
+	[SIGVTALRM]	= timer_handler };
+
 static struct uml_pt_regs ksig_regs[UM_NR_CPUS];
 
 void sig_handler_common_skas(int sig, void *sc_ptr)
diff --git a/arch/um/os-Linux/trap.c b/arch/um/os-Linux/trap.c
deleted file mode 100644
index 2a1c984..0000000
--- a/arch/um/os-Linux/trap.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <signal.h>
-#include "os.h"
-#include "sysdep/ptrace.h"
-
-/* Initialized from linux_main() */
-void (*sig_info[NSIG])(int, struct uml_pt_regs *);
-
-void os_fill_handlinfo(struct kern_handlers h)
-{
-	sig_info[SIGTRAP] = h.relay_signal;
-	sig_info[SIGFPE] = h.relay_signal;
-	sig_info[SIGILL] = h.relay_signal;
-	sig_info[SIGWINCH] = h.winch;
-	sig_info[SIGBUS] = h.bus_handler;
-	sig_info[SIGSEGV] = h.page_fault;
-	sig_info[SIGIO] = h.sigio_handler;
-	sig_info[SIGVTALRM] = h.timer_handler;
-}
diff --git a/arch/um/os-Linux/tty.c b/arch/um/os-Linux/tty.c
index 4cfdd18..b09ff66 100644
--- a/arch/um/os-Linux/tty.c
+++ b/arch/um/os-Linux/tty.c
@@ -1,13 +1,16 @@
-/* 
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
 #include <stdlib.h>
+#include <unistd.h>
 #include <errno.h>
+#include <fcntl.h>
+#include "kern_constants.h"
+#include "kern_util.h"
 #include "os.h"
 #include "user.h"
-#include "kern_util.h"
 
 struct grantpt_info {
 	int fd;
@@ -26,36 +29,34 @@ static void grantpt_cb(void *arg)
 int get_pty(void)
 {
 	struct grantpt_info info;
-	int fd;
-
-	fd = os_open_file("/dev/ptmx", of_rdwr(OPENFLAGS()), 0);
-	if(fd < 0){
-		printk("get_pty : Couldn't open /dev/ptmx - err = %d\n", -fd);
-		return(fd);
+	int fd, err;
+
+	fd = open("/dev/ptmx", O_RDWR);
+	if (fd < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "get_pty : Couldn't open /dev/ptmx - "
+		       "err = %d\n", errno);
+		return err;
 	}
 
 	info.fd = fd;
 	initial_thread_cb(grantpt_cb, &info);
 
-	if(info.res < 0){
-		printk("get_pty : Couldn't grant pty - errno = %d\n", 
-		       -info.err);
-		return(-1);
+	if (info.res < 0) {
+		err = -info.err;
+		printk(UM_KERN_ERR "get_pty : Couldn't grant pty - "
+		       "errno = %d\n", -info.err);
+		goto out;
 	}
-	if(unlockpt(fd) < 0){
-		printk("get_pty : Couldn't unlock pty - errno = %d\n", errno);
-		return(-1);
+
+	if (unlockpt(fd) < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "get_pty : Couldn't unlock pty - "
+		       "errno = %d\n", errno);
+		goto out;
 	}
-	return(fd);
+	return fd;
+out:
+	close(fd);
+	return err;
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/os-Linux/tty_log.c b/arch/um/os-Linux/tty_log.c
index d11a55b..cc648e6 100644
--- a/arch/um/os-Linux/tty_log.c
+++ b/arch/um/os-Linux/tty_log.c
@@ -12,7 +12,6 @@
 #include <sys/time.h>
 #include "init.h"
 #include "user.h"
-#include "kern_util.h"
 #include "os.h"
 
 #define TTY_LOG_DIR "./"
diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index b0cb052..a74442d 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -5,6 +5,7 @@
 
 #include <signal.h>
 #include "kern_constants.h"
+#include "kern_util.h"
 #include "longjmp.h"
 #include "task.h"
 #include "user.h"
-- 
cgit v1.1


From d83ecf083a2163705f5ebcede4637a955eb7b964 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:47 -0800
Subject: uml: tidy pgtable.h

Large pieces of include/asm/pgtable.h were unused cruft.

This uncovered arch/um/kernel/trap.c needing skas.h in order to get
ptrace_faultinfo.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/trap.c    |  1 +
 include/asm-um/pgtable.h | 86 ++++--------------------------------------------
 2 files changed, 8 insertions(+), 79 deletions(-)

diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ff405a4..8fd1a79 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -13,6 +13,7 @@
 #include "as-layout.h"
 #include "kern_util.h"
 #include "os.h"
+#include "skas.h"
 #include "sysdep/sigcontext.h"
 
 /*
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 830fc6e..cb0d204 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -1,5 +1,5 @@
 /* 
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Copyright 2003 PathScale, Inc.
  * Derived from include/asm-i386/pgtable.h
  * Licensed under the GPL
@@ -9,10 +9,6 @@
 #define __UM_PGTABLE_H
 
 #include "linux/sched.h"
-#include "linux/linkage.h"
-#include "asm/processor.h"
-#include "asm/page.h"
-#include "asm/fixmap.h"
 
 #define _PAGE_PRESENT	0x001
 #define _PAGE_NEWPAGE	0x002
@@ -42,14 +38,6 @@ extern unsigned long *empty_zero_page;
 
 #define pgtable_cache_init() do ; while (0)
 
-/*
- * pgd entries used up by user/kernel:
- */
-
-#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT)
-#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
-
-#ifndef __ASSEMBLY__
 /* Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 8MB value just means that there will be a 8MB "hole" after the
  * physical memory until the kernel virtual memory starts.  That means that
@@ -62,16 +50,12 @@ extern unsigned long end_iomem;
 
 #define VMALLOC_OFFSET	(__va_space)
 #define VMALLOC_START ((end_iomem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
-
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
 #else
 # define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
 #endif
 
-#define REGION_SHIFT	(sizeof(pte_t) * 8 - 4)
-#define REGION_MASK	(((unsigned long) 0xf) << REGION_SHIFT)
-
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -81,11 +65,12 @@ extern unsigned long end_iomem;
 #define PAGE_COPY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
 #define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
 #define PAGE_KERNEL	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define PAGE_KERNEL_RO	__pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED)
 
 /*
- * The i386 can't do page protection for execute, and considers that the same are read.
- * Also, write permissions imply read permissions. This is the closest we can get..
+ * The i386 can't do page protection for execute, and considers that the same
+ * are read.
+ * Also, write permissions imply read permissions. This is the closest we can
+ * get..
  */
 #define __P000	PAGE_NONE
 #define __P001	PAGE_READONLY
@@ -106,40 +91,16 @@ extern unsigned long end_iomem;
 #define __S111	PAGE_SHARED
 
 /*
- * Define this if things work differently on an i386 and an i486:
- * it will (on an i486) warn about kernel memory accesses that are
- * done without a 'access_ok(VERIFY_WRITE,..)'
- */
-#undef TEST_VERIFY_AREA
-
-/* page table for 0-4MB for everybody */
-extern unsigned long pg0[1024];
-
-/*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-
 #define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page)
 
-/* number of bits that fit into a memory pointer */
-#define BITS_PER_PTR			(8*sizeof(unsigned long))
-
-/* to align the pointer to a pointer address */
-#define PTR_MASK			(~(sizeof(void*)-1))
-
-/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
-/* 64-bit machines, beware!  SRB. */
-#define SIZEOF_PTR_LOG2			3
-
-/* to find an entry in a page-table */
-#define PAGE_PTR(address) \
-((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK)
-
 #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
 
 #define pmd_none(x)	(!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
 
@@ -149,14 +110,9 @@ extern unsigned long pg0[1024];
 #define pud_newpage(x)  (pud_val(x) & _PAGE_NEWPAGE)
 #define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEWPAGE)
 
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-
 #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK)
 
 #define pte_page(x) pfn_to_page(pte_pfn(x))
-#define pte_address(x) (__va(pte_val(x) & PAGE_MASK))
-#define mk_phys(a, r) ((a) + (((unsigned long) r) << REGION_SHIFT))
-#define phys_addr(p) ((p) & ~REGION_MASK)
 
 #define pte_present(x)	pte_get_bits(x, (_PAGE_PRESENT | _PAGE_PROTNONE))
 
@@ -310,6 +266,7 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval)
 #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
 #define __virt_to_page(virt) phys_to_page(__pa(virt))
 #define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
+#define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
 
 #define mk_pte(page, pgprot) \
 	({ pte_t pte;					\
@@ -325,8 +282,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 	return pte; 
 }
 
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
-
 /*
  * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
  *
@@ -335,8 +290,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  */
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 
-#define pgd_index_k(addr) pgd_index(addr)
-
 /*
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
@@ -388,29 +341,4 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #include <asm-generic/pgtable.h>
 
-#include <asm-generic/pgtable-nopud.h>
-
-#ifdef CONFIG_HIGHMEM
-/* Clear a kernel PTE and flush it from the TLB */
-#define kpte_clear_flush(ptep, vaddr)					\
-do {									\
-	pte_clear(&init_mm, vaddr, ptep);				\
-	__flush_tlb_one(vaddr);						\
-} while (0)
 #endif
-
-#endif
-#endif
-
-#define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
-- 
cgit v1.1


From 300ecf59c0ca2c09fb7fde7dff986a7306e95361 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:47 -0800
Subject: UML - Fix build in 2.6.24-rc2-mm1

The earlier pgtable.h tidying patch made things a bit too tidy.  Add
back a header which is needed in VMALLOC_START and friend.  Also add
back a definition of pmd_page_vaddr, which is needed on x86_64.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-um/pgtable.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index cb0d204..1e7fc00 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -9,6 +9,7 @@
 #define __UM_PGTABLE_H
 
 #include "linux/sched.h"
+#include <asm/fixmap.h>
 
 #define _PAGE_PRESENT	0x001
 #define _PAGE_NEWPAGE	0x002
@@ -308,6 +309,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  * this macro returns the index of the entry in the pmd page which would
  * control the given virtual address
  */
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 
 /*
-- 
cgit v1.1


From 8299ca5ce10c9532848bd2e5526ba975f8d80d1d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:48 -0800
Subject: uml: reconst a parameter

The previous const-ing patch consted a string which shouldn't have
been, and I didn't notice the gcc warning.

ubd_setup can't take a const char * because its address is assigned to
something which expects a char *arg.  Many setups modify the string
they are given, they can't be const.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/ubd_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 4fe4d6b..b498f27 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -444,7 +444,7 @@ __uml_help(ubd_setup,
 "    cluster filesystem and inappropriate at almost all other times.\n\n"
 );
 
-static int udb_setup(const char *str)
+static int udb_setup(char *str)
 {
 	printk("udb%s specified on command line is almost certainly a ubd -> "
 	       "udb TYPO\n", str);
-- 
cgit v1.1


From ab8cda4347e2ffed658e522c9398932509c278bd Mon Sep 17 00:00:00 2001
From: Lucas Woods <woodzy@gmail.com>
Date: Mon, 4 Feb 2008 22:30:49 -0800
Subject: arch/um: remove duplicate includes

Signed-off-by: Lucas Woods <woodzy@gmail.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/sigio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 7243f57..6443809 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -20,7 +20,6 @@
 #include "sigio.h"
 #include "os.h"
 #include "um_malloc.h"
-#include "init.h"
 
 /* Protected by sigio_lock(), also used by sigio_cleanup, which is an
  * exitcall.
-- 
cgit v1.1


From 291248fd6e371bcbfb8a77689c5d741a1527488f Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:49 -0800
Subject: uml: remove unused variables in the context switcher

This patch removes a variable which was not used in two functions.  Yet
another code cleanup, nothing really significant.

Please note that I could not test this on x86_64. I don't have the
hardware for it.

[ jdike - Bits of tidying around the affected code.  Also, it's fine on
x86_64 ]

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/process.c      | 19 ++++++++-----------
 arch/um/sys-i386/ptrace.c     |  6 +++---
 arch/um/sys-i386/tls.c        |  2 +-
 arch/um/sys-x86_64/syscalls.c |  2 +-
 4 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 91bd68e..62a4e0e 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -81,12 +81,12 @@ static inline void set_current(struct task_struct *task)
 		{ external_pid(task), task });
 }
 
-extern void arch_switch_to(struct task_struct *from, struct task_struct *to);
+extern void arch_switch_to(struct task_struct *to);
 
 void *_switch_to(void *prev, void *next, void *last)
 {
 	struct task_struct *from = prev;
-	struct task_struct *to= next;
+	struct task_struct *to = next;
 
 	to->thread.prev_sched = from;
 	set_current(to);
@@ -94,16 +94,15 @@ void *_switch_to(void *prev, void *next, void *last)
 	do {
 		current->thread.saved_task = NULL;
 
-		switch_threads(&from->thread.switch_buf,
-			       &to->thread.switch_buf);
+		switch_threads(&from->thread.switch_buf, &to->thread.switch_buf);
 
-		arch_switch_to(current->thread.prev_sched, current);
+		arch_switch_to(current);
 
 		if (current->thread.saved_task)
 			show_regs(&(current->thread.regs));
-		next= current->thread.saved_task;
-		prev= current;
-	} while(current->thread.saved_task);
+		next = current->thread.saved_task;
+		prev = current;
+	} while (current->thread.saved_task);
 
 	return current->thread.prev_sched;
 
@@ -161,8 +160,6 @@ void new_thread_handler(void)
 void fork_handler(void)
 {
 	force_flush_all();
-	if (current->thread.prev_sched == NULL)
-		panic("blech");
 
 	schedule_tail(current->thread.prev_sched);
 
@@ -171,7 +168,7 @@ void fork_handler(void)
 	 * arch_switch_to isn't needed. We could want to apply this to
 	 * improve performance. -bb
 	 */
-	arch_switch_to(current->thread.prev_sched, current);
+	arch_switch_to(current);
 
 	current->thread.prev_sched = NULL;
 
diff --git a/arch/um/sys-i386/ptrace.c b/arch/um/sys-i386/ptrace.c
index bd3da8a..6b44999 100644
--- a/arch/um/sys-i386/ptrace.c
+++ b/arch/um/sys-i386/ptrace.c
@@ -8,11 +8,11 @@
 #include "asm/uaccess.h"
 #include "skas.h"
 
-extern int arch_switch_tls(struct task_struct *from, struct task_struct *to);
+extern int arch_switch_tls(struct task_struct *to);
 
-void arch_switch_to(struct task_struct *from, struct task_struct *to)
+void arch_switch_to(struct task_struct *to)
 {
-	int err = arch_switch_tls(from, to);
+	int err = arch_switch_tls(to);
 	if (!err)
 		return;
 
diff --git a/arch/um/sys-i386/tls.c b/arch/um/sys-i386/tls.c
index 027e86a..f29b8a8 100644
--- a/arch/um/sys-i386/tls.c
+++ b/arch/um/sys-i386/tls.c
@@ -172,7 +172,7 @@ void clear_flushed_tls(struct task_struct *task)
  * SKAS patch.
  */
 
-int arch_switch_tls(struct task_struct *from, struct task_struct *to)
+int arch_switch_tls(struct task_struct *to)
 {
 	if (!host_supports_tls)
 		return 0;
diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c
index 86f6b18..e437ee2 100644
--- a/arch/um/sys-x86_64/syscalls.c
+++ b/arch/um/sys-x86_64/syscalls.c
@@ -105,7 +105,7 @@ long sys_clone(unsigned long clone_flags, unsigned long newsp,
 	return ret;
 }
 
-void arch_switch_to(struct task_struct *from, struct task_struct *to)
+void arch_switch_to(struct task_struct *to)
 {
 	if ((to->thread.arch.fs == 0) || (to->mm == NULL))
 		return;
-- 
cgit v1.1


From 6b7e967484f4197d799e14b844b78118e93192c6 Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:30:50 -0800
Subject: uml: convert functions to void

This patch changes a few functions into returning void.  The return values
were not used anyway, so I think it should not be a problem.  Also removed a
little leftover bit from TT mode.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h        | 6 +-----
 arch/um/kernel/um_arch.c    | 5 ++---
 arch/um/os-Linux/start_up.c | 4 +---
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 55ca073..69c0d4a 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -167,14 +167,10 @@ extern int os_fchange_dir(int fd);
 
 /* start_up.c */
 extern void os_early_checks(void);
-extern int can_do_skas(void);
+extern void can_do_skas(void);
 extern void os_check_bugs(void);
 extern void check_host_supports_tls(int *supports_tls, int *tls_min);
 
-/* Make sure they are clear when running in TT mode. Required by
- * SEGV_MAYBE_FIXABLE */
-#define clear_can_do_skas() do { ptrace_faultinfo = proc_mm = 0; } while (0)
-
 /* mem.c */
 extern int create_mem_file(unsigned long long len);
 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 9b5d2cd..1139e07 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -198,7 +198,7 @@ __uml_setup("--help", Usage,
 "    Prints this message.\n\n"
 );
 
-static int __init uml_checksetup(char *line, int *add)
+static void __init uml_checksetup(char *line, int *add)
 {
 	struct uml_param *p;
 
@@ -208,10 +208,9 @@ static int __init uml_checksetup(char *line, int *add)
 
 		n = strlen(p->str);
 		if (!strncmp(line, p->str, n) && p->setup_func(line + n, add))
-			return 1;
+			return;
 		p++;
 	}
-	return 0;
 }
 
 static void __init uml_postsetup(void)
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index c6cf648..b07887c 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -466,7 +466,7 @@ static inline void check_skas3_proc_mm(void)
 	else non_fatal("found\n");
 }
 
-int can_do_skas(void)
+void can_do_skas(void)
 {
 	non_fatal("Checking for the skas3 patch in the host:\n");
 
@@ -476,8 +476,6 @@ int can_do_skas(void)
 
 	if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt)
 		skas_needs_stub = 1;
-
-	return 1;
 }
 
 int __init parse_iomem(char *str, int *add)
-- 
cgit v1.1


From b8bec829c90d45a2d115a52f3a928ce841afc3d4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:51 -0800
Subject: uml: host TLS diagnostics

Add some diagnostics when TLS operations on the host fail.  Also spit out more
information about the TLS environment on the host at boot time.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/sys-i386/tls.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/um/sys-i386/tls.c b/arch/um/sys-i386/tls.c
index f29b8a8..c6c7131 100644
--- a/arch/um/sys-i386/tls.c
+++ b/arch/um/sys-i386/tls.c
@@ -26,6 +26,11 @@ int do_set_thread_area(struct user_desc *info)
 	cpu = get_cpu();
 	ret = os_set_thread_area(info, userspace_pid[cpu]);
 	put_cpu();
+
+	if (ret)
+		printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, "
+		       "index = %d\n", ret, info->entry_number);
+
 	return ret;
 }
 
@@ -37,6 +42,11 @@ int do_get_thread_area(struct user_desc *info)
 	cpu = get_cpu();
 	ret = os_get_thread_area(info, userspace_pid[cpu]);
 	put_cpu();
+
+	if (ret)
+		printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, "
+		       "index = %d\n", ret, info->entry_number);
+
 	return ret;
 }
 
@@ -356,10 +366,9 @@ out:
 	return ret;
 }
 
-
 /*
- * XXX: This part is probably common to i386 and x86-64. Don't create a common
- * file for now, do that when implementing x86-64 support.
+ * This code is really i386-only, but it detects and logs x86_64 GDT indexes
+ * if a 32-bit UML is running on a 64-bit host.
  */
 static int __init __setup_host_supports_tls(void)
 {
@@ -368,13 +377,16 @@ static int __init __setup_host_supports_tls(void)
 		printk(KERN_INFO "Host TLS support detected\n");
 		printk(KERN_INFO "Detected host type: ");
 		switch (host_gdt_entry_tls_min) {
-			case GDT_ENTRY_TLS_MIN_I386:
-				printk("i386\n");
-				break;
-			case GDT_ENTRY_TLS_MIN_X86_64:
-				printk("x86_64\n");
-				break;
+		case GDT_ENTRY_TLS_MIN_I386:
+			printk(KERN_CONT "i386");
+			break;
+		case GDT_ENTRY_TLS_MIN_X86_64:
+			printk(KERN_CONT "x86_64");
+			break;
 		}
+		printk(KERN_CONT " (GDT indexes %d to %d)\n",
+		       host_gdt_entry_tls_min,
+		       host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES);
 	} else
 		printk(KERN_ERR "  Host TLS support NOT detected! "
 				"TLS support inside UML will not work\n");
-- 
cgit v1.1


From 9157f90f08f7db3188cd06971f41cb2ba5646e57 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:52 -0800
Subject: uml: move um_virt_to_phys

This patchset makes UML build and run with three-level page tables on
32-bit hosts.  This is an uncommon use case, but the code here needed
fixing and cleaning up, so 32-bit three-level pages tables were tested
to make sure the changes are good.

Patch 1 - code movement
Patch 2 - header untangling
Patch 3 - style fixups in files affected so far
Patch 4 - clean up use of current.h
Patch 5 - fix sizes of types that are different between 2 and 3-level
	page tables - three-level page table support should build at
	this point
Patch 6 - tidy (i.e. eliminate much of) the code that figures out how
	big the address space is
Patch 7 - change um_virt_to_phys into virt_to_pte, clean its
	interface, and clean its (so far) one caller
Patch 8 - the stub pages are covered with a VMA, allowing some nasty
	code to be thrown out - three-level page tables now work

This patch:

um_virt_to_phys only has one user, so it can be moved to the same file
and made static.  Its declarations in pgtable.h and ksyms.c are also
gone.

current_cmd was another apparent user, but it itself isn't used, so it
is deleted.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/ksyms.c        |  1 -
 arch/um/kernel/process.c      | 43 -------------------------------------------
 arch/um/kernel/skas/uaccess.c | 34 ++++++++++++++++++++++++++++++++--
 include/asm-um/pgtable.h      |  3 ---
 4 files changed, 32 insertions(+), 49 deletions(-)

diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index 7c7142b..f0ced98 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -26,7 +26,6 @@ EXPORT_SYMBOL(get_kmem_end);
 
 EXPORT_SYMBOL(high_physmem);
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(um_virt_to_phys);
 EXPORT_SYMBOL(handle_page_fault);
 EXPORT_SYMBOL(find_iomem);
 
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 62a4e0e..541a2bf 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -255,49 +255,6 @@ void cpu_idle(void)
 	default_idle();
 }
 
-void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
-		      pte_t *pte_out)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t ptent;
-
-	if (task->mm == NULL)
-		return ERR_PTR(-EINVAL);
-	pgd = pgd_offset(task->mm, addr);
-	if (!pgd_present(*pgd))
-		return ERR_PTR(-EINVAL);
-
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		return ERR_PTR(-EINVAL);
-
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		return ERR_PTR(-EINVAL);
-
-	pte = pte_offset_kernel(pmd, addr);
-	ptent = *pte;
-	if (!pte_present(ptent))
-		return ERR_PTR(-EINVAL);
-
-	if (pte_out != NULL)
-		*pte_out = ptent;
-	return (void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK);
-}
-
-char *current_cmd(void)
-{
-#if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM)
-	return "(Unknown)";
-#else
-	void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL);
-	return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr);
-#endif
-}
-
 void dump_thread(struct pt_regs *regs, struct user *u)
 {
 }
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 1d8b119..302425b 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -12,8 +12,38 @@
 #include "kern_util.h"
 #include "os.h"
 
-extern void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
-			     pte_t *pte_out);
+static void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
+			     pte_t *pte_out)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t ptent;
+
+	if (task->mm == NULL)
+		return ERR_PTR(-EINVAL);
+	pgd = pgd_offset(task->mm, addr);
+	if (!pgd_present(*pgd))
+		return ERR_PTR(-EINVAL);
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return ERR_PTR(-EINVAL);
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return ERR_PTR(-EINVAL);
+
+	pte = pte_offset_kernel(pmd, addr);
+	ptent = *pte;
+	if (!pte_present(ptent))
+		return ERR_PTR(-EINVAL);
+
+	if (pte_out != NULL)
+		*pte_out = ptent;
+	return (void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK);
+}
 
 static unsigned long maybe_map(unsigned long virt, int is_write)
 {
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 1e7fc00..e268506 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -31,9 +31,6 @@
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 
-extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt,
-			     pte_t *pte_out);
-
 /* zero page used for uninitialized stuff */
 extern unsigned long *empty_zero_page;
 
-- 
cgit v1.1


From 8192ab42bf60e1e9b7efa046990e9cc5e4a95cf4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:53 -0800
Subject: uml: header untangling

Untangle UML headers somewhat and add some includes where they were
needed explicitly, but gotten accidentally via some other header.

arch/um/include/um_uaccess.h loses asm/fixmap.h because it uses no
fixmap stuff and gains elf.h, because it needs FIXADDR_USER_*, and
archsetjmp.h, because it needs jmp_buf.

pmd_alloc_one is uninlined because it needs mm_struct, and that's
inconvenient to provide in asm-um/pgtable-3level.h.

elf_core_copy_fpregs is also uninlined from elf-i386.h and
elf-x86_64.h, which duplicated the code anyway, to
arch/um/kernel/process.c, so that the reference to current_thread
doesn't pull sched.h or anything related into asm/elf.h.

arch/um/sys-i386/ldt.c, arch/um/kernel/tlb.c and
arch/um/kernel/skas/uaccess.c got sched.h because they dereference
task_structs.  Its includes of linux and asm headers got turned from
"" to <>.

arch/um/sys-i386/bug.c gets asm/errno.h because it needs errno
constants.

asm/elf-i386 gets asm/user.h because it needs user_regs_struct.

asm/fixmap.h gets page.h because it needs PAGE_SIZE and PAGE_MASK and
system.h for BUG_ON.

asm/pgtable doesn't need sched.h.

asm/processor-generic.h defined mm_segment_t, but didn't use it.  So,
that definition is moved to uaccess.h, which defines a bunch of
mm_segment_t-related stuff.  thread_info.h uses mm_segment_t, and
includes uaccess.h, which causes a recursion.  So, the definition is
placed above the include of thread_info. in uaccess.h.  thread_info.h
also gets page.h because it needs PAGE_SIZE.

ObCheckpatchViolationJustification - I'm not adding a typedef; I'm
moving mm_segment_t from one place to another.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/random.c           |  1 +
 arch/um/include/um_uaccess.h       |  4 +++-
 arch/um/kernel/mem.c               | 21 ++++++++++++++++-----
 arch/um/kernel/process.c           |  8 ++++++++
 arch/um/kernel/skas/uaccess.c      | 13 +++++++------
 arch/um/kernel/tlb.c               |  7 ++++---
 arch/um/sys-i386/bug.c             |  1 +
 arch/um/sys-i386/ldt.c             |  5 +++--
 include/asm-um/elf-i386.h          |  9 ++-------
 include/asm-um/elf-x86_64.h        |  8 +-------
 include/asm-um/fixmap.h            |  3 ++-
 include/asm-um/pgtable-3level.h    | 11 ++---------
 include/asm-um/pgtable.h           |  1 -
 include/asm-um/processor-generic.h |  5 +----
 include/asm-um/thread_info.h       |  3 ++-
 include/asm-um/uaccess.h           | 10 +++++++++-
 16 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index e942e83..71f0959 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -5,6 +5,7 @@
  * This software may be used and distributed according to the terms
  * of the GNU General Public License, incorporated herein by reference.
  */
+#include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
diff --git a/arch/um/include/um_uaccess.h b/arch/um/include/um_uaccess.h
index fdfc06b..2b6fc8e 100644
--- a/arch/um/include/um_uaccess.h
+++ b/arch/um/include/um_uaccess.h
@@ -6,7 +6,9 @@
 #ifndef __ARCH_UM_UACCESS_H
 #define __ARCH_UM_UACCESS_H
 
-#include "asm/fixmap.h"
+#include <asm/elf.h>
+#include <asm/fixmap.h>
+#include "sysdep/archsetjmp.h"
 
 #define __under_task_size(addr, size) \
 	(((unsigned long) (addr) < TASK_SIZE) && \
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 96072e2..1f8f0c1 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -152,7 +152,7 @@ pgprot_t kmap_prot;
 
 #define kmap_get_fixmap_pte(vaddr)					\
 	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
- 			  (vaddr)), (vaddr))
+				     (vaddr)), (vaddr))
 
 static void __init kmap_init(void)
 {
@@ -278,7 +278,8 @@ struct page *arch_validate(struct page *page, gfp_t mask, int order)
 	goto again;
 }
 
-/* This can't do anything because nothing in the kernel image can be freed
+/*
+ * This can't do anything because nothing in the kernel image can be freed
  * since it's not in kernel physical memory.
  */
 
@@ -331,9 +332,7 @@ void show_mem(void)
 	printk("%d pages swap cached\n", cached);
 }
 
-/*
- * Allocate and free page tables.
- */
+/* Allocate and free page tables. */
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
@@ -368,3 +367,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
 	return pte;
 }
+
+#ifdef CONFIG_3_LEVEL_PGTABLES
+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
+
+	if (pmd)
+		memset(pmd, 0, PAGE_SIZE);
+
+	return pmd;
+}
+#endif
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 541a2bf..5704712 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -446,3 +446,11 @@ unsigned long get_wchan(struct task_struct *p)
 
 	return 0;
 }
+
+int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
+{
+	int cpu = current_thread_info()->cpu;
+
+	return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu);
+}
+
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 302425b..d145565 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -3,12 +3,13 @@
  * Licensed under the GPL
  */
 
-#include "linux/err.h"
-#include "linux/highmem.h"
-#include "linux/mm.h"
-#include "asm/current.h"
-#include "asm/page.h"
-#include "asm/pgtable.h"
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <asm/current.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
 #include "kern_util.h"
 #include "os.h"
 
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index f4a0e40..96e0978 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -3,9 +3,10 @@
  * Licensed under the GPL
  */
 
-#include "linux/mm.h"
-#include "asm/pgtable.h"
-#include "asm/tlbflush.h"
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 #include "as-layout.h"
 #include "mem_user.h"
 #include "os.h"
diff --git a/arch/um/sys-i386/bug.c b/arch/um/sys-i386/bug.c
index a4360b5..8d4f273 100644
--- a/arch/um/sys-i386/bug.c
+++ b/arch/um/sys-i386/bug.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/uaccess.h>
+#include <asm/errno.h>
 
 /* Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because
  * that's not relevant in skas mode.
diff --git a/arch/um/sys-i386/ldt.c b/arch/um/sys-i386/ldt.c
index 67c0958..505ed5c 100644
--- a/arch/um/sys-i386/ldt.c
+++ b/arch/um/sys-i386/ldt.c
@@ -3,8 +3,9 @@
  * Licensed under the GPL
  */
 
-#include "linux/mm.h"
-#include "asm/unistd.h"
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <asm/unistd.h>
 #include "os.h"
 #include "proc_mm.h"
 #include "skas.h"
diff --git a/include/asm-um/elf-i386.h b/include/asm-um/elf-i386.h
index ca94a13..22bbb75 100644
--- a/include/asm-um/elf-i386.h
+++ b/include/asm-um/elf-i386.h
@@ -5,7 +5,7 @@
 #ifndef __UM_ELF_I386_H
 #define __UM_ELF_I386_H
 
-#include <linux/sched.h>
+#include <asm/user.h>
 #include "skas.h"
 
 #define R_386_NONE	0
@@ -76,12 +76,7 @@ typedef struct user_i387_struct elf_fpregset_t;
 	pr_reg[16] = PT_REGS_SS(regs);		\
 } while(0);
 
-static inline int elf_core_copy_fpregs(struct task_struct *t,
-				       elf_fpregset_t *fpu)
-{
-	int cpu = ((struct thread_info *) t->stack)->cpu;
-	return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu);
-}
+extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 
 #define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu)
 
diff --git a/include/asm-um/elf-x86_64.h b/include/asm-um/elf-x86_64.h
index 3c9d543..3b2d522 100644
--- a/include/asm-um/elf-x86_64.h
+++ b/include/asm-um/elf-x86_64.h
@@ -7,7 +7,6 @@
 #ifndef __UM_ELF_X86_64_H
 #define __UM_ELF_X86_64_H
 
-#include <linux/sched.h>
 #include <asm/user.h>
 #include "skas.h"
 
@@ -96,12 +95,7 @@ typedef struct user_i387_struct elf_fpregset_t;
 	(pr_reg)[25] = 0;					\
 	(pr_reg)[26] = 0;
 
-static inline int elf_core_copy_fpregs(struct task_struct *t,
-				       elf_fpregset_t *fpu)
-{
-	int cpu = current_thread->cpu;
-	return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu);
-}
+extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 
 #define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu)
 
diff --git a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h
index d352a35..3d3e85d 100644
--- a/include/asm-um/fixmap.h
+++ b/include/asm-um/fixmap.h
@@ -1,9 +1,10 @@
 #ifndef __UM_FIXMAP_H
 #define __UM_FIXMAP_H
 
+#include <asm/system.h>
 #include <asm/kmap_types.h>
 #include <asm/archparam.h>
-#include <asm/elf.h>
+#include <asm/page.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
diff --git a/include/asm-um/pgtable-3level.h b/include/asm-um/pgtable-3level.h
index 3ebafba..e0b6c16 100644
--- a/include/asm-um/pgtable-3level.h
+++ b/include/asm-um/pgtable-3level.h
@@ -59,15 +59,8 @@ static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; }
 
 #define set_pmd(pmdptr, pmdval) set_64bit((phys_t *) (pmdptr), pmd_val(pmdval))
 
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-        pmd_t *pmd = (pmd_t *) __get_free_page(GFP_KERNEL);
-
-        if(pmd)
-                memset(pmd, 0, PAGE_SIZE);
-
-        return pmd;
-}
+struct mm_struct;
+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address);
 
 static inline void pud_clear (pud_t *pud)
 {
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index e268506..bec4840 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -8,7 +8,6 @@
 #ifndef __UM_PGTABLE_H
 #define __UM_PGTABLE_H
 
-#include "linux/sched.h"
 #include <asm/fixmap.h>
 
 #define _PAGE_PRESENT	0x001
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index 057a76d..35ab6a2 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -11,6 +11,7 @@ struct pt_regs;
 struct task_struct;
 
 #include "asm/ptrace.h"
+#include "asm/pgtable.h"
 #include "registers.h"
 #include "sysdep/archsetjmp.h"
 
@@ -68,10 +69,6 @@ struct thread_struct {
 	.request		= { 0 } \
 }
 
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
 extern struct task_struct *alloc_task_struct(void);
 
 static inline void release_thread(struct task_struct *task)
diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
index 6e5fd5c..cdfc91c 100644
--- a/include/asm-um/thread_info.h
+++ b/include/asm-um/thread_info.h
@@ -8,8 +8,9 @@
 
 #ifndef __ASSEMBLY__
 
-#include <asm/processor.h>
 #include <asm/types.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
diff --git a/include/asm-um/uaccess.h b/include/asm-um/uaccess.h
index 077032d..b9a895d 100644
--- a/include/asm-um/uaccess.h
+++ b/include/asm-um/uaccess.h
@@ -6,7 +6,15 @@
 #ifndef __UM_UACCESS_H
 #define __UM_UACCESS_H
 
-#include "linux/sched.h"
+#include <asm/errno.h>
+#include <asm/processor.h>
+
+/* thread_info has a mm_segment_t in it, so put the definition up here */
+typedef struct {
+	unsigned long seg;
+} mm_segment_t;
+
+#include "linux/thread_info.h"
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
-- 
cgit v1.1


From 009ec2a915ba52f6b647c4076f4a2e259cba85aa Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:53 -0800
Subject: uml: style cleanup

Style fixes in elf-i386.h and arch/um/kernel/mem.c.
      update the copyright
      get rid of an emacs formatting comment
      some formatting fixes
      inclusion trimming
      whitespace fixes

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/mem.c      | 86 ++++++++++++++++++++++-------------------------
 include/asm-um/elf-i386.h | 19 +++--------
 2 files changed, 45 insertions(+), 60 deletions(-)

diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 1f8f0c1..3eddc20 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -1,28 +1,22 @@
 /*
- * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include "linux/stddef.h"
-#include "linux/kernel.h"
-#include "linux/mm.h"
-#include "linux/bootmem.h"
-#include "linux/swap.h"
-#include "linux/highmem.h"
-#include "linux/gfp.h"
-#include "asm/page.h"
-#include "asm/fixmap.h"
-#include "asm/pgalloc.h"
-#include "kern_util.h"
+#include <linux/stddef.h>
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <asm/fixmap.h>
+#include <asm/page.h>
 #include "as-layout.h"
+#include "init.h"
 #include "kern.h"
+#include "kern_util.h"
 #include "mem_user.h"
-#include "um_uaccess.h"
 #include "os.h"
-#include "linux/types.h"
-#include "linux/string.h"
-#include "init.h"
-#include "kern_constants.h"
 
 /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */
 unsigned long *empty_zero_page = NULL;
@@ -53,7 +47,7 @@ static void setup_highmem(unsigned long highmem_start,
 	int i;
 
 	highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT;
-	for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){
+	for (i = 0; i < highmem_len >> PAGE_SHIFT; i++) {
 		page = &mem_map[highmem_pfn + i];
 		ClearPageReserved(page);
 		init_page_count(page);
@@ -85,7 +79,7 @@ void __init mem_init(void)
 #endif
 	num_physpages = totalram_pages;
 	max_pfn = totalram_pages;
-	printk(KERN_INFO "Memory: %luk available\n", 
+	printk(KERN_INFO "Memory: %luk available\n",
 	       (unsigned long) nr_free_pages() << (PAGE_SHIFT-10));
 	kmalloc_ok = 1;
 
@@ -119,7 +113,7 @@ static void __init one_md_table_init(pud_t *pud)
 #endif
 }
 
-static void __init fixrange_init(unsigned long start, unsigned long end, 
+static void __init fixrange_init(unsigned long start, unsigned long end,
 				 pgd_t *pgd_base)
 {
 	pgd_t *pgd;
@@ -206,7 +200,8 @@ static void __init fixaddr_user_init( void)
 	paddr = (unsigned long)alloc_bootmem_low_pages( size);
 	memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size);
 	paddr = __pa(paddr);
-	for ( ; size > 0; size-=PAGE_SIZE, vaddr+=PAGE_SIZE, paddr+=PAGE_SIZE){
+	for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE,
+		      paddr += PAGE_SIZE) {
 		pgd = swapper_pg_dir + pgd_index(vaddr);
 		pud = pud_offset(pgd, vaddr);
 		pmd = pmd_offset(pud, vaddr);
@@ -223,7 +218,7 @@ void __init paging_init(void)
 
 	empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
 	empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
-	for(i = 0; i < ARRAY_SIZE(zones_size); i++)
+	for (i = 0; i < ARRAY_SIZE(zones_size); i++)
 		zones_size[i] = 0;
 
 	zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) -
@@ -253,26 +248,26 @@ struct page *arch_validate(struct page *page, gfp_t mask, int order)
 	int i;
 
  again:
-	if(page == NULL)
+	if (page == NULL)
 		return page;
-	if(PageHighMem(page))
+	if (PageHighMem(page))
 		return page;
 
 	addr = (unsigned long) page_address(page);
-	for(i = 0; i < (1 << order); i++){
+	for (i = 0; i < (1 << order); i++) {
 		current->thread.fault_addr = (void *) addr;
-		if(__do_copy_to_user((void __user *) addr, &zero,
+		if (__do_copy_to_user((void __user *) addr, &zero,
 				     sizeof(zero),
 				     &current->thread.fault_addr,
-				     &current->thread.fault_catcher)){
-			if(!(mask & __GFP_WAIT))
+				     &current->thread.fault_catcher)) {
+			if (!(mask & __GFP_WAIT))
 				return NULL;
 			else break;
 		}
 		addr += PAGE_SIZE;
 	}
 
-	if(i == (1 << order))
+	if (i == (1 << order))
 		return page;
 	page = alloc_pages(mask, order);
 	goto again;
@@ -291,8 +286,8 @@ void free_initmem(void)
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (start < end)
-		printk ("Freeing initrd memory: %ldk freed\n", 
-			(end - start) >> 10);
+		printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
+		       (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
 		init_page_count(virt_to_page(start));
@@ -309,27 +304,28 @@ void show_mem(void)
 	int highmem = 0;
 	struct page *page;
 
-	printk("Mem-info:\n");
+	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
-	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+	printk(KERN_INFO "Free swap:       %6ldkB\n",
+	       nr_swap_pages<<(PAGE_SHIFT-10));
 	pfn = max_mapnr;
-	while(pfn-- > 0) {
+	while (pfn-- > 0) {
 		page = pfn_to_page(pfn);
 		total++;
-		if(PageHighMem(page))
+		if (PageHighMem(page))
 			highmem++;
-		if(PageReserved(page))
+		if (PageReserved(page))
 			reserved++;
-		else if(PageSwapCache(page))
+		else if (PageSwapCache(page))
 			cached++;
-		else if(page_count(page))
+		else if (page_count(page))
 			shared += page_count(page) - 1;
 	}
-	printk("%d pages of RAM\n", total);
-	printk("%d pages of HIGHMEM\n", highmem);
-	printk("%d reserved pages\n", reserved);
-	printk("%d pages shared\n", shared);
-	printk("%d pages swap cached\n", cached);
+	printk(KERN_INFO "%d pages of RAM\n", total);
+	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+	printk(KERN_INFO "%d reserved pages\n", reserved);
+	printk(KERN_INFO "%d pages shared\n", shared);
+	printk(KERN_INFO "%d pages swap cached\n", cached);
 }
 
 /* Allocate and free page tables. */
@@ -340,8 +336,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	if (pgd) {
 		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-		memcpy(pgd + USER_PTRS_PER_PGD, 
-		       swapper_pg_dir + USER_PTRS_PER_PGD, 
+		memcpy(pgd + USER_PTRS_PER_PGD,
+		       swapper_pg_dir + USER_PTRS_PER_PGD,
 		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 	}
 	return pgd;
diff --git a/include/asm-um/elf-i386.h b/include/asm-um/elf-i386.h
index 22bbb75..23d6893 100644
--- a/include/asm-um/elf-i386.h
+++ b/include/asm-um/elf-i386.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 #ifndef __UM_ELF_I386_H
@@ -46,7 +46,7 @@ typedef struct user_i387_struct elf_fpregset_t;
 	PT_REGS_EDI(regs) = 0; \
 	PT_REGS_EBP(regs) = 0; \
 	PT_REGS_EAX(regs) = 0; \
-} while(0)
+} while (0)
 
 #define USE_ELF_CORE_DUMP
 #define ELF_EXEC_PAGESIZE 4096
@@ -74,7 +74,7 @@ typedef struct user_i387_struct elf_fpregset_t;
 	pr_reg[14] = PT_REGS_EFLAGS(regs);	\
 	pr_reg[15] = PT_REGS_SP(regs);		\
 	pr_reg[16] = PT_REGS_SS(regs);		\
-} while(0);
+} while (0);
 
 extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 
@@ -86,7 +86,7 @@ extern long elf_aux_hwcap;
 extern char * elf_aux_platform;
 #define ELF_PLATFORM (elf_aux_platform)
 
-#define SET_PERSONALITY(ex, ibcs2) do ; while(0)
+#define SET_PERSONALITY(ex, ibcs2) do { } while (0)
 
 extern unsigned long vsyscall_ehdr;
 extern unsigned long vsyscall_end;
@@ -161,14 +161,3 @@ if ( vsyscall_ehdr ) {							      \
 }
 
 #endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
-- 
cgit v1.1


From a5a678c80beac4d163babda243a27eeb9c89bd89 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:54 -0800
Subject: uml: current.h cleanup

Tidy current-related stuff.  There was a comment in current.h saying
that current_thread was obsolete, so this patch turns all instances of
current_thread into current_thread_info().  There's some simplifying
of the result in arch/um/sys-i386/signal.c.

current.h and thread_info also get style cleanups.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/process.c     |  8 ++++----
 arch/um/sys-i386/signal.c    | 18 ++++++++----------
 arch/um/sys-x86_64/signal.c  |  4 ++--
 include/asm-um/current.h     | 23 ++---------------------
 include/asm-um/thread_info.h |  8 ++++----
 5 files changed, 20 insertions(+), 41 deletions(-)

diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 5704712..ae1942e 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -251,7 +251,7 @@ void default_idle(void)
 
 void cpu_idle(void)
 {
-	cpu_tasks[current_thread->cpu].pid = os_getpid();
+	cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
 	default_idle();
 }
 
@@ -269,7 +269,7 @@ int user_context(unsigned long sp)
 	unsigned long stack;
 
 	stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER);
-	return stack != (unsigned long) current_thread;
+	return stack != (unsigned long) current_thread_info();
 }
 
 extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end;
@@ -311,7 +311,7 @@ int strlen_user_proc(char __user *str)
 int smp_sigio_handler(void)
 {
 #ifdef CONFIG_SMP
-	int cpu = current_thread->cpu;
+	int cpu = current_thread_info()->cpu;
 	IPI_handler(cpu);
 	if (cpu != 0)
 		return 1;
@@ -321,7 +321,7 @@ int smp_sigio_handler(void)
 
 int cpu(void)
 {
-	return current_thread->cpu;
+	return current_thread_info()->cpu;
 }
 
 static atomic_t using_sysemu = ATOMIC_INIT(0);
diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c
index 19053d4..fd0c25a 100644
--- a/arch/um/sys-i386/signal.c
+++ b/arch/um/sys-i386/signal.c
@@ -168,12 +168,13 @@ static int copy_sc_from_user(struct pt_regs *regs,
 			     struct sigcontext __user *from)
 {
 	struct sigcontext sc;
-	int err;
+	int err, pid;
 
 	err = copy_from_user(&sc, from, sizeof(sc));
 	if (err)
 		return err;
 
+	pid = userspace_pid[current_thread_info()->cpu];
 	copy_sc(&regs->regs, &sc);
 	if (have_fpx_regs) {
 		struct user_fxsr_struct fpx;
@@ -187,8 +188,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
 		if (err)
 			return 1;
 
-		err = restore_fpx_registers(userspace_pid[current_thread->cpu],
-					    (unsigned long *) &fpx);
+		err = restore_fpx_registers(pid, (unsigned long *) &fpx);
 		if (err < 0) {
 			printk(KERN_ERR "copy_sc_from_user - "
 			       "restore_fpx_registers failed, errno = %d\n",
@@ -204,8 +204,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
 		if (err)
 			return 1;
 
-		err = restore_fp_registers(userspace_pid[current_thread->cpu],
-					   (unsigned long *) &fp);
+		err = restore_fp_registers(pid, (unsigned long *) &fp);
 		if (err < 0) {
 			printk(KERN_ERR "copy_sc_from_user - "
 			       "restore_fp_registers failed, errno = %d\n",
@@ -223,7 +222,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
 {
 	struct sigcontext sc;
 	struct faultinfo * fi = &current->thread.arch.faultinfo;
-	int err;
+	int err, pid;
 
 	sc.gs = REGS_GS(regs->regs.gp);
 	sc.fs = REGS_FS(regs->regs.gp);
@@ -249,11 +248,11 @@ static int copy_sc_to_user(struct sigcontext __user *to,
 	to_fp = (to_fp ? to_fp : (struct _fpstate __user *) (to + 1));
 	sc.fpstate = to_fp;
 
+	pid = userspace_pid[current_thread_info()->cpu];
 	if (have_fpx_regs) {
 		struct user_fxsr_struct fpx;
 
-		err = save_fpx_registers(userspace_pid[current_thread->cpu],
-					 (unsigned long *) &fpx);
+		err = save_fpx_registers(pid, (unsigned long *) &fpx);
 		if (err < 0){
 			printk(KERN_ERR "copy_sc_to_user - save_fpx_registers "
 			       "failed, errno = %d\n", err);
@@ -276,8 +275,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
 	else {
 		struct user_i387_struct fp;
 
-		err = save_fp_registers(userspace_pid[current_thread->cpu],
-					(unsigned long *) &fp);
+		err = save_fp_registers(pid, (unsigned long *) &fp);
 		if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct)))
 			return 1;
 	}
diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c
index 1407018..1a899a7 100644
--- a/arch/um/sys-x86_64/signal.c
+++ b/arch/um/sys-x86_64/signal.c
@@ -81,7 +81,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
 	if (err)
 		return 1;
 
-	err = restore_fp_registers(userspace_pid[current_thread->cpu],
+	err = restore_fp_registers(userspace_pid[current_thread_info()->cpu],
 				   (unsigned long *) &fp);
 	if (err < 0) {
 		printk(KERN_ERR "copy_sc_from_user - "
@@ -143,7 +143,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
 	if (err)
 		return 1;
 
-	err = save_fp_registers(userspace_pid[current_thread->cpu],
+	err = save_fp_registers(userspace_pid[current_thread_info()->cpu],
 				(unsigned long *) &fp);
 	if (err < 0) {
 		printk(KERN_ERR "copy_sc_from_user - restore_fp_registers "
diff --git a/include/asm-um/current.h b/include/asm-um/current.h
index 8fd72f6..c2191d9 100644
--- a/include/asm-um/current.h
+++ b/include/asm-um/current.h
@@ -1,32 +1,13 @@
-/* 
- * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
 #ifndef __UM_CURRENT_H
 #define __UM_CURRENT_H
 
-#ifndef __ASSEMBLY__
-
-#include "asm/page.h"
 #include "linux/thread_info.h"
 
 #define current (current_thread_info()->task)
 
-/*Backward compatibility - it's used inside arch/um.*/
-#define current_thread current_thread_info()
-
-#endif /* __ASSEMBLY__ */
-
 #endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
index cdfc91c..356b83e 100644
--- a/include/asm-um/thread_info.h
+++ b/include/asm-um/thread_info.h
@@ -1,5 +1,5 @@
-/* 
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -76,8 +76,8 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
 #define TIF_SIGPENDING		1	/* signal pending */
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_POLLING_NRFLAG      3       /* true if poll_idle() is polling 
-					 * TIF_NEED_RESCHED 
+#define TIF_POLLING_NRFLAG      3       /* true if poll_idle() is polling
+					 * TIF_NEED_RESCHED
 					 */
 #define TIF_RESTART_BLOCK 	4
 #define TIF_MEMDIE	 	5
-- 
cgit v1.1


From 655e4ed0c521dcfdbf1c5a79da971560e6733527 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:55 -0800
Subject: uml: fix page table data sizes

Get the sizes of various pieces of data right when using three-level
page tables.  pgd and pmd entries remain at 32 bits in a 32-bit
compilation because page tables will remain in low memory.  So,
PGDIR_SHIFT, the PTRS_PER_* values, set_pud, set_pmd are conditional
on 64BIT.

More use of phys_t is made when there are physical memory addresses
floating around.

ObCheckpatchViolationJustification - the new typedef is an alternate
definition of pmd_t, which I can't really live without.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/mem.c            | 17 +++++++++--------
 include/asm-um/page.h           |  6 +++---
 include/asm-um/pgtable-3level.h | 21 ++++++++++++++++++++-
 include/asm-um/pgtable.h        |  2 +-
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 3eddc20..663011c 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -132,7 +132,7 @@ static void __init fixrange_init(unsigned long start, unsigned long end,
 		if (pud_none(*pud))
 			one_md_table_init(pud);
 		pmd = pmd_offset(pud, vaddr);
-		for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) {
+		for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) {
 			one_page_table_init(pmd);
 			vaddr += PMD_SIZE;
 		}
@@ -191,22 +191,23 @@ static void __init fixaddr_user_init( void)
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned long paddr, vaddr = FIXADDR_USER_START;
+	phys_t p;
+	unsigned long v, vaddr = FIXADDR_USER_START;
 
-	if (  ! size )
+	if (!size)
 		return;
 
 	fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir);
-	paddr = (unsigned long)alloc_bootmem_low_pages( size);
-	memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size);
-	paddr = __pa(paddr);
+	v = (unsigned long) alloc_bootmem_low_pages(size);
+	memcpy((void *) v , (void *) FIXADDR_USER_START, size);
+	p = __pa(v);
 	for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE,
-		      paddr += PAGE_SIZE) {
+		      p += PAGE_SIZE) {
 		pgd = swapper_pg_dir + pgd_index(vaddr);
 		pud = pud_offset(pgd, vaddr);
 		pmd = pmd_offset(pud, vaddr);
 		pte = pte_offset_kernel(pmd, vaddr);
-		pte_set_val( (*pte), paddr, PAGE_READONLY);
+		pte_set_val(*pte, p, PAGE_READONLY);
 	}
 #endif
 }
diff --git a/include/asm-um/page.h b/include/asm-um/page.h
index 4b424c7..fe2374d 100644
--- a/include/asm-um/page.h
+++ b/include/asm-um/page.h
@@ -30,7 +30,7 @@ struct page;
 #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
 
 typedef struct { unsigned long pte_low, pte_high; } pte_t;
-typedef struct { unsigned long long pmd; } pmd_t;
+typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
 #define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
 
@@ -106,8 +106,8 @@ extern unsigned long uml_physmem;
 #define __pa(virt) to_phys((void *) (unsigned long) (virt))
 #define __va(phys) to_virt((unsigned long) (phys))
 
-#define phys_to_pfn(p) ((p) >> PAGE_SHIFT)
-#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
+#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT))
+#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT))
 
 #define pfn_valid(pfn) ((pfn) < max_mapnr)
 #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v)))
diff --git a/include/asm-um/pgtable-3level.h b/include/asm-um/pgtable-3level.h
index e0b6c16..48f8f5d 100644
--- a/include/asm-um/pgtable-3level.h
+++ b/include/asm-um/pgtable-3level.h
@@ -11,7 +11,11 @@
 
 /* PGDIR_SHIFT determines what a third-level page table entry can map */
 
+#ifdef CONFIG_64BIT
 #define PGDIR_SHIFT	30
+#else
+#define PGDIR_SHIFT	31
+#endif
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
@@ -28,9 +32,15 @@
  */
 
 #define PTRS_PER_PTE 512
+#ifdef CONFIG_64BIT
 #define PTRS_PER_PMD 512
-#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
 #define PTRS_PER_PGD 512
+#else
+#define PTRS_PER_PMD 1024
+#define PTRS_PER_PGD 1024
+#endif
+
+#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
 #define FIRST_USER_ADDRESS	0
 
 #define pte_ERROR(e) \
@@ -49,7 +59,12 @@
 #define pud_populate(mm, pud, pmd) \
 	set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd)))
 
+#ifdef CONFIG_64BIT
 #define set_pud(pudptr, pudval) set_64bit((phys_t *) (pudptr), pud_val(pudval))
+#else
+#define set_pud(pudptr, pudval) (*(pudptr) = (pudval))
+#endif
+
 static inline int pgd_newpage(pgd_t pgd)
 {
 	return(pgd_val(pgd) & _PAGE_NEWPAGE);
@@ -57,7 +72,11 @@ static inline int pgd_newpage(pgd_t pgd)
 
 static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; }
 
+#ifdef CONFIG_64BIT
 #define set_pmd(pmdptr, pmdval) set_64bit((phys_t *) (pmdptr), pmd_val(pmdval))
+#else
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+#endif
 
 struct mm_struct;
 extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address);
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index bec4840..62ab94a 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -262,7 +262,7 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval)
 
 #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
 #define __virt_to_page(virt) phys_to_page(__pa(virt))
-#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
+#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page))
 #define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
 
 #define mk_pte(page, pgprot) \
-- 
cgit v1.1


From ca77b555c0aafa3070fbb67592abaaa1b8d31913 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:55 -0800
Subject: uml: add virt_to_pte

Turn um_virt_to_phys into virt_to_pte, cleaning up a horrid interface.

It's also made non-static and declared in pgtable.h because it'll be
needed when the stubs get a vma.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/skas/uaccess.c | 56 ++++++++++++++++++-------------------------
 include/asm-um/pgtable.h      |  3 +++
 2 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index d145565..05b41dbc 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -13,70 +13,60 @@
 #include "kern_util.h"
 #include "os.h"
 
-static void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
-			     pte_t *pte_out)
+pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
-	pte_t ptent;
 
-	if (task->mm == NULL)
-		return ERR_PTR(-EINVAL);
-	pgd = pgd_offset(task->mm, addr);
+	if (mm == NULL)
+		return NULL;
+
+	pgd = pgd_offset(mm, addr);
 	if (!pgd_present(*pgd))
-		return ERR_PTR(-EINVAL);
+		return NULL;
 
 	pud = pud_offset(pgd, addr);
 	if (!pud_present(*pud))
-		return ERR_PTR(-EINVAL);
+		return NULL;
 
 	pmd = pmd_offset(pud, addr);
 	if (!pmd_present(*pmd))
-		return ERR_PTR(-EINVAL);
-
-	pte = pte_offset_kernel(pmd, addr);
-	ptent = *pte;
-	if (!pte_present(ptent))
-		return ERR_PTR(-EINVAL);
+		return NULL;
 
-	if (pte_out != NULL)
-		*pte_out = ptent;
-	return (void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK);
+	return pte_offset_kernel(pmd, addr);
 }
 
-static unsigned long maybe_map(unsigned long virt, int is_write)
+static pte_t *maybe_map(unsigned long virt, int is_write)
 {
-	pte_t pte;
-	int err;
+	pte_t *pte = virt_to_pte(current->mm, virt);
+	int err, dummy_code;
 
-	void *phys = um_virt_to_phys(current, virt, &pte);
-	int dummy_code;
-
-	if (IS_ERR(phys) || (is_write && !pte_write(pte))) {
+	if ((pte == NULL) || !pte_present(*pte) ||
+	    (is_write && !pte_write(*pte))) {
 		err = handle_page_fault(virt, 0, is_write, 1, &dummy_code);
 		if (err)
-			return -1UL;
-		phys = um_virt_to_phys(current, virt, NULL);
+			return NULL;
+		pte = virt_to_pte(current->mm, virt);
 	}
-	if (IS_ERR(phys))
-		phys = (void *) -1;
+	if (!pte_present(*pte))
+		pte = NULL;
 
-	return (unsigned long) phys;
+	return pte;
 }
 
 static int do_op_one_page(unsigned long addr, int len, int is_write,
 		 int (*op)(unsigned long addr, int len, void *arg), void *arg)
 {
 	struct page *page;
+	pte_t *pte;
 	int n;
 
-	addr = maybe_map(addr, is_write);
-	if (addr == -1UL)
+	pte = maybe_map(addr, is_write);
+	if (pte == NULL)
 		return -1;
 
-	page = phys_to_page(addr);
+	page = pte_page(*pte);
 	addr = (unsigned long) kmap_atomic(page, KM_UML_USERCOPY) +
 		(addr & ~PAGE_MASK);
 
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 62ab94a..fb47777 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -323,6 +323,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pte_unmap(pte) do { } while (0)
 #define pte_unmap_nested(pte) do { } while (0)
 
+struct mm_struct;
+extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
+
 #define update_mmu_cache(vma,address,pte) do ; while (0)
 
 /* Encode and de-code a swap entry */
-- 
cgit v1.1


From ee3d9bd4de1ed93d2a7ee41c331ed30a1c7b8acd Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:56 -0800
Subject: uml: simplify SIGSEGV handling

Simplify the page fault stub by not masking signals while it is running.  This
allows it to signal that it is done by executing an instruction which will
generate a SIGTRAP (int3 on x86) rather than running sigreturn by hand after
queueing a blocked SIGUSR1.

userspace_tramp now no longer puts anything in the SIGSEGV sa_mask, but it
does add SA_NODEFER to sa_flags so that SIGSEGV is still enabled after the
signal handler fails to run sigreturn.

SIGWINCH is just blocked so that we don't have to deal with it and the signal
masks used by wait_stub_done are updated to reflect the smaller number of
signals that it has to worry about.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/skas/process.c | 11 ++++-------
 arch/um/sys-i386/stub_segv.c    | 19 ++-----------------
 arch/um/sys-x86_64/stub_segv.c  | 39 +++++----------------------------------
 3 files changed, 11 insertions(+), 58 deletions(-)

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 765cfa6..2cc2071 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -55,10 +55,10 @@ static int ptrace_dump_regs(int pid)
  * Signals that are OK to receive in the stub - we'll just continue it.
  * SIGWINCH will happen when UML is inside a detached screen.
  */
-#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH))
+#define STUB_SIG_MASK (1 << SIGVTALRM)
 
 /* Signals that the stub will finish with - anything else is an error */
-#define STUB_DONE_MASK ((1 << SIGUSR1) | (1 << SIGTRAP))
+#define STUB_DONE_MASK (1 << SIGTRAP)
 
 void wait_stub_done(int pid)
 {
@@ -179,6 +179,7 @@ static int userspace_tramp(void *stack)
 	ptrace(PTRACE_TRACEME, 0, 0, 0);
 
 	signal(SIGTERM, SIG_DFL);
+	signal(SIGWINCH, SIG_IGN);
 	err = set_interval();
 	if (err)
 		panic("userspace_tramp - setting timer failed, errno = %d\n",
@@ -222,11 +223,7 @@ static int userspace_tramp(void *stack)
 
 		set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE);
 		sigemptyset(&sa.sa_mask);
-		sigaddset(&sa.sa_mask, SIGIO);
-		sigaddset(&sa.sa_mask, SIGWINCH);
-		sigaddset(&sa.sa_mask, SIGVTALRM);
-		sigaddset(&sa.sa_mask, SIGUSR1);
-		sa.sa_flags = SA_ONSTACK;
+		sa.sa_flags = SA_ONSTACK | SA_NODEFER;
 		sa.sa_handler = (void *) v;
 		sa.sa_restorer = NULL;
 		if (sigaction(SIGSEGV, &sa, NULL) < 0)
diff --git a/arch/um/sys-i386/stub_segv.c b/arch/um/sys-i386/stub_segv.c
index b3999cb..28ccf73 100644
--- a/arch/um/sys-i386/stub_segv.c
+++ b/arch/um/sys-i386/stub_segv.c
@@ -1,32 +1,17 @@
 /*
- * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
+ * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include <signal.h>
-#include <sys/select.h> /* The only way I can see to get sigset_t */
-#include <asm/unistd.h>
-#include "as-layout.h"
-#include "uml-config.h"
 #include "sysdep/stub.h"
 #include "sysdep/sigcontext.h"
-#include "sysdep/faultinfo.h"
 
 void __attribute__ ((__section__ (".__syscall_stub")))
 stub_segv_handler(int sig)
 {
 	struct sigcontext *sc = (struct sigcontext *) (&sig + 1);
-	int pid;
 
 	GET_FAULTINFO_FROM_SC(*((struct faultinfo *) STUB_DATA), sc);
 
-	pid = stub_syscall0(__NR_getpid);
-	stub_syscall2(__NR_kill, pid, SIGUSR1);
-
-	/* Load pointer to sigcontext into esp, since we need to leave
-	 * the stack in its original form when we do the sigreturn here, by
-	 * hand.
-	 */
-	__asm__ __volatile__("mov %0,%%esp ; movl %1, %%eax ; "
-			     "int $0x80" : : "a" (sc), "g" (__NR_sigreturn));
+	trap_myself();
 }
diff --git a/arch/um/sys-x86_64/stub_segv.c b/arch/um/sys-x86_64/stub_segv.c
index 3afb590..ced051a 100644
--- a/arch/um/sys-x86_64/stub_segv.c
+++ b/arch/um/sys-x86_64/stub_segv.c
@@ -1,51 +1,22 @@
 /*
- * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
+ * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include <stddef.h>
 #include <signal.h>
-#include <asm/unistd.h>
 #include "as-layout.h"
-#include "uml-config.h"
-#include "sysdep/sigcontext.h"
-#include "sysdep/faultinfo.h"
 #include "sysdep/stub.h"
-
-/* Copied from sys-x86_64/signal.c - Can't find an equivalent definition
- * in the libc headers anywhere.
- */
-struct rt_sigframe
-{
-	char *pretcode;
-	struct ucontext uc;
-	struct siginfo info;
-};
-
-/* Copied here from <linux/kernel.h> - we're userspace. */
-#define container_of(ptr, type, member) ({                   \
-	const typeof( ((type *)0)->member ) *__mptr = (ptr); \
-	(type *)( (char *)__mptr - offsetof(type,member) );})
+#include "sysdep/faultinfo.h"
+#include "sysdep/sigcontext.h"
 
 void __attribute__ ((__section__ (".__syscall_stub")))
 stub_segv_handler(int sig)
 {
 	struct ucontext *uc;
-        int pid;
 
 	__asm__ __volatile__("movq %%rdx, %0" : "=g" (uc) :);
 	GET_FAULTINFO_FROM_SC(*((struct faultinfo *) STUB_DATA),
 			      &uc->uc_mcontext);
-
-	pid = stub_syscall0(__NR_getpid);
-	stub_syscall2(__NR_kill, pid, SIGUSR1);
-
-	/* sys_sigreturn expects that the stack pointer will be 8 bytes into
-	 * the signal frame.  So, we use the ucontext pointer, which we know
-	 * already, to get the signal frame pointer, and add 8 to that.
-	 */
-	__asm__ __volatile__("movq %0, %%rsp; movq %1, %%rax ; syscall": :
-                             "g" ((unsigned long)
-                                  container_of(uc, struct rt_sigframe, uc) + 8),
-                             "g" (__NR_rt_sigreturn));
+	trap_myself();
 }
+
-- 
cgit v1.1


From d25f2e1235aab716c9fd6ba36c42503627a3a0e3 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:57 -0800
Subject: uml: use ptrace directly in libc code

Some register accessor cleanups -
	userspace() was calling restore_registers and save_registers for no
reason, since userspace() is on the libc side of the house, and these
add no value over calling ptrace directly
	init_thread_registers and get_safe_registers were the same thing,
so init_thread_registers is gone

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/registers.h     |  1 -
 arch/um/kernel/process.c        |  2 +-
 arch/um/os-Linux/registers.c    | 13 ++++---------
 arch/um/os-Linux/skas/process.c | 11 ++++++++---
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/arch/um/include/registers.h b/arch/um/include/registers.h
index 0e27406..6df3748 100644
--- a/arch/um/include/registers.h
+++ b/arch/um/include/registers.h
@@ -9,7 +9,6 @@
 #include "sysdep/ptrace.h"
 #include "sysdep/archsetjmp.h"
 
-extern void init_thread_registers(struct uml_pt_regs *to);
 extern int save_fp_registers(int pid, unsigned long *fp_regs);
 extern int restore_fp_registers(int pid, unsigned long *fp_regs);
 extern int save_fpx_registers(int pid, unsigned long *fp_regs);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index ae1942e..7a29123 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -199,7 +199,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		arch_copy_thread(&current->thread.arch, &p->thread.arch);
 	}
 	else {
-		init_thread_registers(&p->thread.regs.regs);
+		get_safe_registers(p->thread.regs.regs.gp);
 		p->thread.request.u.thread = current->thread.request.u.thread;
 		handler = new_thread_handler;
 	}
diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c
index a32ba6a..c78fae3 100644
--- a/arch/um/os-Linux/registers.c
+++ b/arch/um/os-Linux/registers.c
@@ -10,15 +10,6 @@
 #include "sysdep/ptrace.h"
 #include "user.h"
 
-/* This is set once at boot time and not changed thereafter */
-
-static unsigned long exec_regs[MAX_REG_NR];
-
-void init_thread_registers(struct uml_pt_regs *to)
-{
-	memcpy(to->gp, exec_regs, sizeof(to->gp));
-}
-
 void save_registers(int pid, struct uml_pt_regs *regs)
 {
 	int err;
@@ -39,6 +30,10 @@ void restore_registers(int pid, struct uml_pt_regs *regs)
 		      "errno = %d\n", errno);
 }
 
+/* This is set once at boot time and not changed thereafter */
+
+static unsigned long exec_regs[MAX_REG_NR];
+
 void init_registers(int pid)
 {
 	int err;
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 2cc2071..7dc24e3 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -300,7 +300,9 @@ void userspace(struct uml_pt_regs *regs)
 	nsecs += os_nsecs();
 
 	while (1) {
-		restore_registers(pid, regs);
+		if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp))
+			panic("userspace - PTRACE_SETREGS failed, "
+			      "errno = %d\n", errno);
 
 		/* Now we set local_using_sysemu to be used for one loop */
 		local_using_sysemu = get_using_sysemu();
@@ -320,7 +322,10 @@ void userspace(struct uml_pt_regs *regs)
 			      errno);
 
 		regs->is_user = 1;
-		save_registers(pid, regs);
+		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp))
+			panic("userspace - saving registers failed, "
+			      "errno = %d\n", errno);
+
 		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
 
 		if (WIFSTOPPED(status)) {
@@ -343,7 +348,7 @@ void userspace(struct uml_pt_regs *regs)
 				break;
 			case SIGVTALRM:
 				now = os_nsecs();
-				if(now < nsecs)
+				if (now < nsecs)
 					break;
 				block_signals();
 				(*sig_info[sig])(sig, regs);
-- 
cgit v1.1


From 3e6f2ac480ce398ade2fd6b5e02d00d1265f1e0f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:30:58 -0800
Subject: uml: kill processes instead of panicing kernel

UML was panicing in the case of failures of libc calls which shouldn't happen.
 This is an overreaction since a failure from libc doesn't normally mean that
kernel data structures are in an unknown state.  Instead, the current process
should just be killed if there is no way to recover.

The case that prompted this was a failure of PTRACE_SETREGS restoring the same
state that was read by PTRACE_GETREGS.  It appears that when a process tries
to load a bogus value into a segment register, it segfaults (as expected) and
the value is actually loaded and is seen by PTRACE_GETREGS (not expected).

This case is fixed by forcing a fatal SIGSEGV on the process so that it
immediately dies.  fatal_sigsegv was added for this purpose.  It was declared
as noreturn, so in order to pursuade gcc that it actually does not return, I
added a call to os_dump_core (and declared it noreturn) so that I get a core
file if somehow the process survives.

All other calls in arch/um/os-Linux/skas/process.c got the same treatment,
with failures causing the process to die instead of a kernel panic, with some
exceptions.

userspace_tramp exits with status 1 if anything goes wrong there.  That will
cause start_userspace to return an error.  copy_context_skas0 and
map_stub_pages also now return errors instead of panicing.  Callers of thes
functions were changed to check for errors and do something appropriate.
Usually that's to return an error to their callers.
check_skas3_ptrace_faultinfo just exits since that's too early to do anything
else.

save_registers, restore_registers, and init_registers now return status
instead of panicing on failure, with their callers doing something
appropriate.

There were also duplicate declarations of save_registers and restore_registers
in os.h - these are gone.

I noticed and fixed up some whitespace damage.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/kern_util.h     |   1 +
 arch/um/include/os.h            |   8 +-
 arch/um/include/registers.h     |   6 +-
 arch/um/kernel/skas/mmu.c       |   5 +
 arch/um/kernel/skas/process.c   |  20 ++-
 arch/um/kernel/trap.c           |  12 ++
 arch/um/os-Linux/registers.c    |  21 ++--
 arch/um/os-Linux/skas/process.c | 265 ++++++++++++++++++++++++++--------------
 arch/um/os-Linux/start_up.c     |   4 +-
 arch/um/sys-x86_64/syscalls.c   |   8 +-
 10 files changed, 233 insertions(+), 117 deletions(-)

diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index 8fadf89..625ca29 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -62,6 +62,7 @@ extern int singlestepping(void *t);
 extern void segv_handler(int sig, struct uml_pt_regs *regs);
 extern void bus_handler(int sig, struct uml_pt_regs *regs);
 extern void winch(int sig, struct uml_pt_regs *regs);
+extern void fatal_sigsegv(void) __attribute__ ((noreturn));
 
 
 #endif
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 69c0d4a..9428d34 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -238,7 +238,7 @@ extern int raw(int fd);
 extern void setup_machinename(char *machine_out);
 extern void setup_hostinfo(char *buf, int len);
 extern int setjmp_wrapper(void (*proc)(void *, void *), ...);
-extern void os_dump_core(void);
+extern void os_dump_core(void) __attribute__ ((noreturn));
 
 /* time.c */
 extern void idle_sleep(unsigned long long nsecs);
@@ -267,11 +267,9 @@ extern int protect(struct mm_id * mm_idp, unsigned long addr,
 extern int is_skas_winch(int pid, int fd, void *data);
 extern int start_userspace(unsigned long stub_stack);
 extern int copy_context_skas0(unsigned long stack, int pid);
-extern void save_registers(int pid, struct uml_pt_regs *regs);
-extern void restore_registers(int pid, struct uml_pt_regs *regs);
 extern void userspace(struct uml_pt_regs *regs);
-extern void map_stub_pages(int fd, unsigned long code,
-			   unsigned long data, unsigned long stack);
+extern int map_stub_pages(int fd, unsigned long code, unsigned long data,
+			  unsigned long stack);
 extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void));
 extern void switch_threads(jmp_buf *me, jmp_buf *you);
 extern int start_idle_thread(void *stack, jmp_buf *switch_buf);
diff --git a/arch/um/include/registers.h b/arch/um/include/registers.h
index 6df3748..9ea1ae3 100644
--- a/arch/um/include/registers.h
+++ b/arch/um/include/registers.h
@@ -13,9 +13,9 @@ extern int save_fp_registers(int pid, unsigned long *fp_regs);
 extern int restore_fp_registers(int pid, unsigned long *fp_regs);
 extern int save_fpx_registers(int pid, unsigned long *fp_regs);
 extern int restore_fpx_registers(int pid, unsigned long *fp_regs);
-extern void save_registers(int pid, struct uml_pt_regs *regs);
-extern void restore_registers(int pid, struct uml_pt_regs *regs);
-extern void init_registers(int pid);
+extern int save_registers(int pid, struct uml_pt_regs *regs);
+extern int restore_registers(int pid, struct uml_pt_regs *regs);
+extern int init_registers(int pid);
 extern void get_safe_registers(unsigned long *regs);
 extern unsigned long get_thread_reg(int reg, jmp_buf *buf);
 
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index b56fe8b..6da9ab4 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -114,6 +114,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 			to_mm->id.u.pid = copy_context_skas0(stack,
 							     from_mm->id.u.pid);
 		else to_mm->id.u.pid = start_userspace(stack);
+
+		if (to_mm->id.u.pid < 0) {
+			ret = to_mm->id.u.pid;
+			goto out_free;
+		}
 	}
 
 	ret = init_new_ldt(to_mm, from_mm);
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index fce389c..2e9852c 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -6,19 +6,25 @@
 #include "linux/init.h"
 #include "linux/sched.h"
 #include "as-layout.h"
+#include "kern.h"
 #include "os.h"
 #include "skas.h"
 
 int new_mm(unsigned long stack)
 {
-	int fd;
+	int fd, err;
 
 	fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0);
 	if (fd < 0)
 		return fd;
 
-	if (skas_needs_stub)
-		map_stub_pages(fd, STUB_CODE, STUB_DATA, stack);
+	if (skas_needs_stub) {
+		err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack);
+		if (err) {
+			os_close_file(fd);
+			return err;
+		}
+	}
 
 	return fd;
 }
@@ -49,8 +55,14 @@ int __init start_uml(void)
 {
 	stack_protections((unsigned long) &cpu0_irqstack);
 	set_sigstack(cpu0_irqstack, THREAD_SIZE);
-	if (proc_mm)
+	if (proc_mm) {
 		userspace_pid[0] = start_userspace(0);
+		if (userspace_pid[0] < 0) {
+			printf("start_uml - start_userspace returned %d\n",
+			       userspace_pid[0]);
+			exit(1);
+		}
+	}
 
 	init_new_thread_signals();
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8fd1a79..44e4904 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -129,6 +129,18 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
 	force_sig_info(SIGSEGV, &si, current);
 }
 
+void fatal_sigsegv(void)
+{
+	force_sigsegv(SIGSEGV, current);
+	do_signal();
+	/*
+	 * This is to tell gcc that we're not returning - do_signal
+	 * can, in general, return, but in this case, it's not, since
+	 * we just got a fatal SIGSEGV queued.
+	 */
+	os_dump_core();
+}
+
 void segv_handler(int sig, struct uml_pt_regs *regs)
 {
 	struct faultinfo * fi = UPT_FAULTINFO(regs);
diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c
index c78fae3..830fe6a 100644
--- a/arch/um/os-Linux/registers.c
+++ b/arch/um/os-Linux/registers.c
@@ -8,42 +8,41 @@
 #include <string.h>
 #include <sys/ptrace.h>
 #include "sysdep/ptrace.h"
-#include "user.h"
 
-void save_registers(int pid, struct uml_pt_regs *regs)
+int save_registers(int pid, struct uml_pt_regs *regs)
 {
 	int err;
 
 	err = ptrace(PTRACE_GETREGS, pid, 0, regs->gp);
 	if (err < 0)
-		panic("save_registers - saving registers failed, errno = %d\n",
-		      errno);
+		return -errno;
+	return 0;
 }
 
-void restore_registers(int pid, struct uml_pt_regs *regs)
+int restore_registers(int pid, struct uml_pt_regs *regs)
 {
 	int err;
 
 	err = ptrace(PTRACE_SETREGS, pid, 0, regs->gp);
 	if (err < 0)
-		panic("restore_registers - saving registers failed, "
-		      "errno = %d\n", errno);
+		return -errno;
+	return 0;
 }
 
 /* This is set once at boot time and not changed thereafter */
 
 static unsigned long exec_regs[MAX_REG_NR];
 
-void init_registers(int pid)
+int init_registers(int pid)
 {
 	int err;
 
 	err = ptrace(PTRACE_GETREGS, pid, 0, exec_regs);
-	if (err)
-		panic("check_ptrace : PTRACE_GETREGS failed, errno = %d",
-		      errno);
+	if (err < 0)
+		return -errno;
 
 	arch_init_registers(pid);
+	return 0;
 }
 
 void get_safe_registers(unsigned long *regs)
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 7dc24e3..862fea0 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -38,17 +38,17 @@ int is_skas_winch(int pid, int fd, void *data)
 
 static int ptrace_dump_regs(int pid)
 {
-        unsigned long regs[MAX_REG_NR];
-        int i;
+	unsigned long regs[MAX_REG_NR];
+	int i;
 
-        if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
-                return -errno;
+	if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
+		return -errno;
 
 	printk(UM_KERN_ERR "Stub registers -\n");
 	for (i = 0; i < ARRAY_SIZE(regs); i++)
 		printk(UM_KERN_ERR "\t%d - %lx\n", i, regs[i]);
 
-        return 0;
+	return 0;
 }
 
 /*
@@ -73,9 +73,11 @@ void wait_stub_done(int pid)
 			break;
 
 		err = ptrace(PTRACE_CONT, pid, 0, 0);
-		if (err)
-			panic("wait_stub_done : continue failed, errno = %d\n",
-			      errno);
+		if (err) {
+			printk(UM_KERN_ERR "wait_stub_done : continue failed, "
+			       "errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 	}
 
 	if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0)
@@ -86,8 +88,10 @@ bad_wait:
 	if (err)
 		printk(UM_KERN_ERR "Failed to get registers from stub, "
 		       "errno = %d\n", -err);
-	panic("wait_stub_done : failed to wait for SIGUSR1/SIGTRAP, pid = %d, "
-	      "n = %d, errno = %d, status = 0x%x\n", pid, n, errno, status);
+	printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, "
+	       "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno,
+	       status);
+	fatal_sigsegv();
 }
 
 extern unsigned long current_stub_stack(void);
@@ -98,9 +102,11 @@ void get_skas_faultinfo(int pid, struct faultinfo * fi)
 
 	if (ptrace_faultinfo) {
 		err = ptrace(PTRACE_FAULTINFO, pid, 0, fi);
-		if (err)
-			panic("get_skas_faultinfo - PTRACE_FAULTINFO failed, "
-			      "errno = %d\n", errno);
+		if (err) {
+			printk(UM_KERN_ERR "get_skas_faultinfo - "
+			       "PTRACE_FAULTINFO failed, errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 
 		/* Special handling for i386, which has different structs */
 		if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo))
@@ -110,9 +116,11 @@ void get_skas_faultinfo(int pid, struct faultinfo * fi)
 	}
 	else {
 		err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
-		if (err)
-			panic("Failed to continue stub, pid = %d, errno = %d\n",
-			      pid, errno);
+		if (err) {
+			printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
+			       "errno = %d\n", pid, errno);
+			fatal_sigsegv();
+		}
 		wait_stub_done(pid);
 
 		/*
@@ -145,25 +153,31 @@ static void handle_trap(int pid, struct uml_pt_regs *regs,
 	{
 		err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
 			     __NR_getpid);
-		if (err < 0)
-			panic("handle_trap - nullifying syscall failed, "
-			      "errno = %d\n", errno);
+		if (err < 0) {
+			printk(UM_KERN_ERR "handle_trap - nullifying syscall "
+			       "failed, errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 
 		err = ptrace(PTRACE_SYSCALL, pid, 0, 0);
-		if (err < 0)
-			panic("handle_trap - continuing to end of syscall "
-			      "failed, errno = %d\n", errno);
+		if (err < 0) {
+			printk(UM_KERN_ERR "handle_trap - continuing to end of "
+			       "syscall failed, errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 
 		CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
 		if ((err < 0) || !WIFSTOPPED(status) ||
-		   (WSTOPSIG(status) != SIGTRAP + 0x80)) {
-                        err = ptrace_dump_regs(pid);
-                        if (err)
-                                printk(UM_KERN_ERR "Failed to get registers "
+		    (WSTOPSIG(status) != SIGTRAP + 0x80)) {
+			err = ptrace_dump_regs(pid);
+			if (err)
+				printk(UM_KERN_ERR "Failed to get registers "
 				       "from process, errno = %d\n", -err);
-			panic("handle_trap - failed to wait at end of syscall, "
-			      "errno = %d, status = %d\n", errno, status);
-                }
+			printk(UM_KERN_ERR "handle_trap - failed to wait at "
+			       "end of syscall, errno = %d, status = %d\n",
+			       errno, status);
+			fatal_sigsegv();
+		}
 	}
 
 	handle_syscall(regs);
@@ -181,9 +195,11 @@ static int userspace_tramp(void *stack)
 	signal(SIGTERM, SIG_DFL);
 	signal(SIGWINCH, SIG_IGN);
 	err = set_interval();
-	if (err)
-		panic("userspace_tramp - setting timer failed, errno = %d\n",
-		      err);
+	if (err) {
+		printk(UM_KERN_ERR "userspace_tramp - setting timer failed, "
+		       "errno = %d\n", err);
+		exit(1);
+	}
 
 	if (!proc_mm) {
 		/*
@@ -226,9 +242,11 @@ static int userspace_tramp(void *stack)
 		sa.sa_flags = SA_ONSTACK | SA_NODEFER;
 		sa.sa_handler = (void *) v;
 		sa.sa_restorer = NULL;
-		if (sigaction(SIGSEGV, &sa, NULL) < 0)
-			panic("userspace_tramp - setting SIGSEGV handler "
-			      "failed - errno = %d\n", errno);
+		if (sigaction(SIGSEGV, &sa, NULL) < 0) {
+			printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV "
+			       "handler failed - errno = %d\n", errno);
+			exit(1);
+		}
 	}
 
 	kill(os_getpid(), SIGSTOP);
@@ -244,13 +262,18 @@ int start_userspace(unsigned long stub_stack)
 {
 	void *stack;
 	unsigned long sp;
-	int pid, status, n, flags;
+	int pid, status, n, flags, err;
 
 	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
 		     PROT_READ | PROT_WRITE | PROT_EXEC,
 		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (stack == MAP_FAILED)
-		panic("start_userspace : mmap failed, errno = %d", errno);
+	if (stack == MAP_FAILED) {
+		err = -errno;
+		printk(UM_KERN_ERR "start_userspace : mmap failed, "
+		       "errno = %d", errno);
+		return err;
+	}
+
 	sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *);
 
 	flags = CLONE_FILES;
@@ -260,29 +283,50 @@ int start_userspace(unsigned long stub_stack)
 		flags |= SIGCHLD;
 
 	pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack);
-	if (pid < 0)
-		panic("start_userspace : clone failed, errno = %d", errno);
+	if (pid < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "start_userspace : clone failed, "
+		       "errno = %d", errno);
+		return err;
+	}
 
 	do {
 		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
-		if (n < 0)
-			panic("start_userspace : wait failed, errno = %d",
-			      errno);
+		if (n < 0) {
+			err = -errno;
+			printk(UM_KERN_ERR "start_userspace : wait failed, "
+			       "errno = %d", errno);
+			goto out_kill;
+		}
 	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM));
 
-	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
-		panic("start_userspace : expected SIGSTOP, got status = %d",
-		      status);
+	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
+		err = -EINVAL;
+		printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got "
+		       "status = %d", status);
+		goto out_kill;
+	}
 
 	if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
-		   (void *) PTRACE_O_TRACESYSGOOD) < 0)
-		panic("start_userspace : PTRACE_OLDSETOPTIONS failed, "
-		      "errno = %d\n", errno);
+		   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS "
+		       "failed, errno = %d\n", errno);
+		goto out_kill;
+	}
 
-	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0)
-		panic("start_userspace : munmap failed, errno = %d\n", errno);
+	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "start_userspace : munmap failed, "
+		       "errno = %d\n", errno);
+		goto out_kill;
+	}
 
 	return pid;
+
+ out_kill:
+	os_kill_ptraced_process(pid, 1);
+	return err;
 }
 
 void userspace(struct uml_pt_regs *regs)
@@ -300,9 +344,16 @@ void userspace(struct uml_pt_regs *regs)
 	nsecs += os_nsecs();
 
 	while (1) {
+		/*
+		 * This can legitimately fail if the process loads a
+		 * bogus value into a segment register.  It will
+		 * segfault and PTRACE_GETREGS will read that value
+		 * out of the process.  However, PTRACE_SETREGS will
+		 * fail.  In this case, there is nothing to do but
+		 * just kill the process.
+		 */
 		if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp))
-			panic("userspace - PTRACE_SETREGS failed, "
-			      "errno = %d\n", errno);
+			fatal_sigsegv();
 
 		/* Now we set local_using_sysemu to be used for one loop */
 		local_using_sysemu = get_using_sysemu();
@@ -310,21 +361,25 @@ void userspace(struct uml_pt_regs *regs)
 		op = SELECT_PTRACE_OPERATION(local_using_sysemu,
 					     singlestepping(NULL));
 
-		err = ptrace(op, pid, 0, 0);
-		if (err)
-			panic("userspace - could not resume userspace process, "
-			      "pid=%d, ptrace operation = %d, errno = %d\n",
-			      pid, op, errno);
+		if (ptrace(op, pid, 0, 0)) {
+			printk(UM_KERN_ERR "userspace - ptrace continue "
+			       "failed, op = %d, errno = %d\n", op, errno);
+			fatal_sigsegv();
+		}
 
 		CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
-		if (err < 0)
-			panic("userspace - waitpid failed, errno = %d\n",
-			      errno);
+		if (err < 0) {
+			printk(UM_KERN_ERR "userspace - wait failed, "
+			       "errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 
 		regs->is_user = 1;
-		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp))
-			panic("userspace - saving registers failed, "
-			      "errno = %d\n", errno);
+		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
+			printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, "
+			       "errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 
 		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
 
@@ -371,6 +426,7 @@ void userspace(struct uml_pt_regs *regs)
 			default:
 			        printk(UM_KERN_ERR "userspace - child stopped "
 				       "with signal %d\n", sig);
+				fatal_sigsegv();
 			}
 			pid = userspace_pid[0];
 			interrupt_end();
@@ -422,9 +478,12 @@ int copy_context_skas0(unsigned long new_stack, int pid)
 						     .it_interval = tv }) });
 
 	err = ptrace_setregs(pid, thread_regs);
-	if (err < 0)
-		panic("copy_context_skas0 : PTRACE_SETREGS failed, "
-		      "pid = %d, errno = %d\n", pid, -err);
+	if (err < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS "
+		       "failed, pid = %d, errno = %d\n", pid, -err);
+		return err;
+	}
 
 	/* set a well known return code for detection of child write failure */
 	child_data->err = 12345678;
@@ -434,31 +493,47 @@ int copy_context_skas0(unsigned long new_stack, int pid)
 	 * parent's stack, and check, if bad result.
 	 */
 	err = ptrace(PTRACE_CONT, pid, 0, 0);
-	if (err)
-		panic("Failed to continue new process, pid = %d, "
-		      "errno = %d\n", pid, errno);
+	if (err) {
+		err = -errno;
+		printk(UM_KERN_ERR "Failed to continue new process, pid = %d, "
+		       "errno = %d\n", pid, errno);
+		return err;
+	}
+
 	wait_stub_done(pid);
 
 	pid = data->err;
-	if (pid < 0)
-		panic("copy_context_skas0 - stub-parent reports error %d\n",
-		      -pid);
+	if (pid < 0) {
+		printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports "
+		       "error %d\n", -pid);
+		return pid;
+	}
 
 	/*
 	 * Wait, until child has finished too: read child's result from
 	 * child's stack and check it.
 	 */
 	wait_stub_done(pid);
-	if (child_data->err != STUB_DATA)
-		panic("copy_context_skas0 - stub-child reports error %ld\n",
-		      child_data->err);
+	if (child_data->err != STUB_DATA) {
+		printk(UM_KERN_ERR "copy_context_skas0 - stub-child reports "
+		       "error %ld\n", child_data->err);
+		err = child_data->err;
+		goto out_kill;
+	}
 
 	if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
-		   (void *)PTRACE_O_TRACESYSGOOD) < 0)
-		panic("copy_context_skas0 : PTRACE_OLDSETOPTIONS failed, "
-		      "errno = %d\n", errno);
+		   (void *)PTRACE_O_TRACESYSGOOD) < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS "
+		       "failed, errno = %d\n", errno);
+		goto out_kill;
+	}
 
 	return pid;
+
+ out_kill:
+	os_kill_ptraced_process(pid, 1);
+	return err;
 }
 
 /*
@@ -466,8 +541,8 @@ int copy_context_skas0(unsigned long new_stack, int pid)
  * available. Opening /proc/mm creates a new mm_context, which lacks
  * the stub-pages. Thus, we map them using /proc/mm-fd
  */
-void map_stub_pages(int fd, unsigned long code,
-		    unsigned long data, unsigned long stack)
+int map_stub_pages(int fd, unsigned long code, unsigned long data,
+		   unsigned long stack)
 {
 	struct proc_mm_op mmop;
 	int n;
@@ -491,8 +566,9 @@ void map_stub_pages(int fd, unsigned long code,
 		printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, "
 		       "offset = %llx\n", code, code_fd,
 		       (unsigned long long) code_offset);
-		panic("map_stub_pages : /proc/mm map for code failed, "
-		      "err = %d\n", n);
+		printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code "
+		       "failed, err = %d\n", n);
+		return -n;
 	}
 
 	if (stack) {
@@ -510,10 +586,15 @@ void map_stub_pages(int fd, unsigned long code,
 				      .offset  = map_offset
 		} } });
 		CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop)));
-		if (n != sizeof(mmop))
-			panic("map_stub_pages : /proc/mm map for data failed, "
-			      "err = %d\n", errno);
+		if (n != sizeof(mmop)) {
+			n = errno;
+			printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for "
+			       "data failed, err = %d\n", n);
+			return -n;
+		}
 	}
+
+	return 0;
 }
 
 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
@@ -574,7 +655,9 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf)
 		kmalloc_ok = 0;
 		return 1;
 	default:
-		panic("Bad sigsetjmp return in start_idle_thread - %d\n", n);
+		printk(UM_KERN_ERR "Bad sigsetjmp return in "
+		       "start_idle_thread - %d\n", n);
+		fatal_sigsegv();
 	}
 	longjmp(*switch_buf, 1);
 }
@@ -617,9 +700,11 @@ void __switch_mm(struct mm_id *mm_idp)
 	if (proc_mm) {
 		err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0,
 			     mm_idp->u.mm_fd);
-		if (err)
-			panic("__switch_mm - PTRACE_SWITCH_MM failed, "
-			      "errno = %d\n", errno);
+		if (err) {
+			printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM "
+			       "failed, errno = %d\n", errno);
+			fatal_sigsegv();
+		}
 	}
 	else userspace_pid[0] = mm_idp->u.pid;
 }
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index b07887c..6d56d15 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -411,7 +411,9 @@ static inline void check_skas3_ptrace_faultinfo(void)
 			non_fatal("found\n");
 	}
 
-	init_registers(pid);
+	if (init_registers(pid))
+		fatal("Failed to initialize default registers");
+
 	stop_ptraced_child(pid, 1, 1);
 }
 
diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c
index e437ee2..f1199fd 100644
--- a/arch/um/sys-x86_64/syscalls.c
+++ b/arch/um/sys-x86_64/syscalls.c
@@ -48,7 +48,9 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 	switch (code) {
 	case ARCH_SET_FS:
 	case ARCH_SET_GS:
-		restore_registers(pid, &current->thread.regs.regs);
+		ret = restore_registers(pid, &current->thread.regs.regs);
+		if (ret)
+			return ret;
 		break;
 	case ARCH_GET_FS:
 	case ARCH_GET_GS:
@@ -70,10 +72,10 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 	switch (code) {
 	case ARCH_SET_FS:
 		current->thread.arch.fs = (unsigned long) ptr;
-		save_registers(pid, &current->thread.regs.regs);
+		ret = save_registers(pid, &current->thread.regs.regs);
 		break;
 	case ARCH_SET_GS:
-		save_registers(pid, &current->thread.regs.regs);
+		ret = save_registers(pid, &current->thread.regs.regs);
 		break;
 	case ARCH_GET_FS:
 		ret = put_user(tmp, addr);
-- 
cgit v1.1


From b7c000cbc4f1fa7b82efa95b34f00c2adbeaa3fe Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 4 Feb 2008 22:30:59 -0800
Subject: uml: add missing space

Add missing space between merged string constants.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/vde_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c
index d9941fe..56533db 100644
--- a/arch/um/drivers/vde_user.c
+++ b/arch/um/drivers/vde_user.c
@@ -80,7 +80,7 @@ void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init)
 
 	vpri->args = kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL);
 	if (vpri->args == NULL) {
-		printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args"
+		printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args "
 		       "allocation failed");
 		return;
 	}
-- 
cgit v1.1


From 42a2b54ce8c7b9d4f418995a7950e7e2e15e52ce Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:00 -0800
Subject: uml: clean up TASK_SIZE usage

Clean up the calculation and use of the usable address space size on the host.

task_size is gone, replaced with TASK_SIZE, which is calculated from
CONFIG_TOP_ADDR.  get_kmem_end and set_task_sizes_skas are also gone.

host_task_size, which refers to the entire address space usable by the UML
kernel and which may be larger than the address space usable by a UML process,
since that has to end on a pgdir boundary, is replaced by CONFIG_TOP_ADDR.

STACK_TOP is now TASK_SIZE minus the two stub pages.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/as-layout.h        | 15 +++++++++++++--
 arch/um/include/common-offsets.h   |  3 +++
 arch/um/include/mem_user.h         |  4 ----
 arch/um/kernel/exec.c              |  2 +-
 arch/um/kernel/ksyms.c             |  3 ---
 arch/um/kernel/physmem.c           | 10 ----------
 arch/um/kernel/tlb.c               |  2 +-
 arch/um/kernel/um_arch.c           | 22 ++--------------------
 include/asm-um/a.out.h             |  4 +---
 include/asm-um/fixmap.h            |  3 +--
 include/asm-um/processor-generic.h |  4 +---
 11 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/arch/um/include/as-layout.h b/arch/um/include/as-layout.h
index 2b859e0..a2008f5 100644
--- a/arch/um/include/as-layout.h
+++ b/arch/um/include/as-layout.h
@@ -29,9 +29,20 @@
 #define _AC(X, Y)	__AC(X, Y)
 #endif
 
+/*
+ * The "- 1"'s are to avoid gcc complaining about integer overflows
+ * and unrepresentable decimal constants.  With 3-level page tables,
+ * TASK_SIZE is 0x80000000, which gets turned into its signed decimal
+ * equivalent in asm-offsets.s.  gcc then complains about that being
+ * unsigned only in C90.  To avoid that, UM_TASK_SIZE is defined as
+ * TASK_SIZE - 1.  To compensate, we need to add the 1 back here.
+ * However, adding it back to UM_TASK_SIZE produces more gcc
+ * complaints.  So, I adjust the thing being subtracted from
+ * UM_TASK_SIZE instead.  Bah.
+ */
 #define STUB_CODE _AC((unsigned long), \
-		      UML_CONFIG_TOP_ADDR - 2 * UM_KERN_PAGE_SIZE)
-#define STUB_DATA _AC((unsigned long), UML_CONFIG_TOP_ADDR - UM_KERN_PAGE_SIZE)
+		      UM_TASK_SIZE - (2 * UM_KERN_PAGE_SIZE - 1))
+#define STUB_DATA _AC((unsigned long), UM_TASK_SIZE - (UM_KERN_PAGE_SIZE - 1))
 #define STUB_START _AC(, STUB_CODE)
 
 #ifndef __ASSEMBLY__
diff --git a/arch/um/include/common-offsets.h b/arch/um/include/common-offsets.h
index b54bd35..5b67d7c 100644
--- a/arch/um/include/common-offsets.h
+++ b/arch/um/include/common-offsets.h
@@ -39,3 +39,6 @@ DEFINE(UM_HZ, HZ);
 DEFINE(UM_USEC_PER_SEC, USEC_PER_SEC);
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
+
+/* See as-layout.h for an explanation of the "- 1".  Bah. */
+DEFINE(UM_TASK_SIZE, TASK_SIZE - 1);
diff --git a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h
index a54514d..4e6707b 100644
--- a/arch/um/include/mem_user.h
+++ b/arch/um/include/mem_user.h
@@ -46,9 +46,6 @@ extern int iomem_size;
 
 #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1))
 
-extern unsigned long host_task_size;
-extern unsigned long task_size;
-
 extern int init_mem_user(void);
 extern void setup_memory(void *entry);
 extern unsigned long find_iomem(char *driver, unsigned long *len_out);
@@ -62,6 +59,5 @@ extern unsigned long phys_offset(unsigned long phys);
 extern void unmap_physmem(void);
 extern void map_memory(unsigned long virt, unsigned long phys,
 		       unsigned long len, int r, int w, int x);
-extern unsigned long get_kmem_end(void);
 
 #endif
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 8196450..bf66b5b 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -19,7 +19,7 @@
 void flush_thread(void)
 {
 	void *data = NULL;
-	unsigned long end = proc_mm ? task_size : STUB_START;
+	unsigned long end = proc_mm ? TASK_SIZE : STUB_START;
 	int ret;
 
 	arch_flush_thread(&current->thread.arch);
diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index f0ced98..187a756 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -18,11 +18,8 @@ EXPORT_SYMBOL(set_signals);
 EXPORT_SYMBOL(get_signals);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(sys_waitpid);
-EXPORT_SYMBOL(task_size);
 EXPORT_SYMBOL(flush_tlb_range);
-EXPORT_SYMBOL(host_task_size);
 EXPORT_SYMBOL(arch_validate);
-EXPORT_SYMBOL(get_kmem_end);
 
 EXPORT_SYMBOL(high_physmem);
 EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index e66432f..9c92900 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -55,16 +55,6 @@ int __init init_maps(unsigned long physmem, unsigned long iomem,
 	return 0;
 }
 
-/* Changed during early boot */
-static unsigned long kmem_top = 0;
-
-unsigned long get_kmem_end(void)
-{
-	if (kmem_top == 0)
-		kmem_top = host_task_size - 1024 * 1024;
-	return kmem_top;
-}
-
 void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
 		int r, int w, int x)
 {
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 96e0978..429fed2 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -511,7 +511,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 	if (atomic_read(&mm->mm_users) == 0)
 		return;
 
-	end = proc_mm ? task_size : STUB_START;
+	end = proc_mm ? TASK_SIZE : STUB_START;
 	fix_range(mm, 0, end, 0);
 }
 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 1139e07..9ba39ab 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -101,8 +101,6 @@ const struct seq_operations cpuinfo_op = {
 };
 
 /* Set in linux_main */
-unsigned long host_task_size;
-unsigned long task_size;
 unsigned long uml_physmem;
 unsigned long uml_reserved; /* Also modified in mem_init */
 unsigned long start_vm;
@@ -234,20 +232,6 @@ EXPORT_SYMBOL(end_iomem);
 
 extern char __binary_start;
 
-static unsigned long set_task_sizes_skas(unsigned long *task_size_out)
-{
-	/* Round up to the nearest 4M */
-	unsigned long host_task_size = ROUND_4M((unsigned long)
-						&host_task_size);
-
-	if (!skas_needs_stub)
-		*task_size_out = host_task_size;
-	else
-		*task_size_out = STUB_START & PGDIR_MASK;
-
-	return host_task_size;
-}
-
 int __init linux_main(int argc, char **argv)
 {
 	unsigned long avail, diff;
@@ -278,8 +262,6 @@ int __init linux_main(int argc, char **argv)
 
 	printf("UML running in %s mode\n", mode);
 
-	host_task_size = set_task_sizes_skas(&task_size);
-
 	brk_start = (unsigned long) sbrk(0);
 
 	/*
@@ -304,7 +286,7 @@ int __init linux_main(int argc, char **argv)
 
 	highmem = 0;
 	iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK;
-	max_physmem = get_kmem_end() - uml_physmem - iomem_size - MIN_VMALLOC;
+	max_physmem = CONFIG_TOP_ADDR - uml_physmem - iomem_size - MIN_VMALLOC;
 
 	/*
 	 * Zones have to begin on a 1 << MAX_ORDER page boundary,
@@ -336,7 +318,7 @@ int __init linux_main(int argc, char **argv)
 	}
 
 	virtmem_size = physmem_size;
-	avail = get_kmem_end() - start_vm;
+	avail = CONFIG_TOP_ADDR - start_vm;
 	if (physmem_size > avail)
 		virtmem_size = avail;
 	end_vm = start_vm + virtmem_size;
diff --git a/include/asm-um/a.out.h b/include/asm-um/a.out.h
index 9281dd8..f42ff145 100644
--- a/include/asm-um/a.out.h
+++ b/include/asm-um/a.out.h
@@ -13,11 +13,9 @@
 
 extern unsigned long stacksizelim;
 
-extern unsigned long host_task_size;
-
 #define STACK_ROOM (stacksizelim)
 
-#define STACK_TOP task_size
+#define STACK_TOP (TASK_SIZE - 2 * PAGE_SIZE)
 
 #define STACK_TOP_MAX STACK_TOP
 
diff --git a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h
index 3d3e85d..89a87c1 100644
--- a/include/asm-um/fixmap.h
+++ b/include/asm-um/fixmap.h
@@ -56,9 +56,8 @@ extern void __set_fixmap (enum fixed_addresses idx,
  * the start of the fixmap, and leave one page empty
  * at the top of mem..
  */
-extern unsigned long get_kmem_end(void);
 
-#define FIXADDR_TOP	(get_kmem_end() - 0x2000)
+#define FIXADDR_TOP	(CONFIG_TOP_ADDR - 2 * PAGE_SIZE)
 #define FIXADDR_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)
 
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index 35ab6a2..ecf6706 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -94,9 +94,7 @@ static inline void mm_copy_segments(struct mm_struct *from_mm,
 /*
  * User space process size: 3GB (default).
  */
-extern unsigned long task_size;
-
-#define TASK_SIZE	(task_size)
+#define TASK_SIZE (CONFIG_TOP_ADDR & PGDIR_MASK)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
-- 
cgit v1.1


From 3963333fe6767f15141ab2dc3b933721c636c212 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:01 -0800
Subject: uml: cover stubs with a VMA

Give the stubs a VMA.  This allows the removal of a truly nasty kludge to make
sure that mm->nr_ptes was correct in exit_mmap.  The underlying problem was
always that the stubs, which have ptes, and thus allocated a page table,
weren't covered by a VMA.

This patch fixes that by using install_special_mapping in arch_dup_mmap and
activate_context to create the VMA.  The stubs have to be moved, since
shift_arg_pages seems to assume that the stack is the only VMA present at that
point during exec, and uses vma_adjust to fiddle its VMA.  However, that
extends the stub VMA by the amount removed from the stack VMA.

To avoid this problem, the stubs were moved to a different fixed location at
the start of the address space.

The init_stub_pte calls were moved from init_new_context to arch_dup_mmap
because I was occasionally seeing arch_dup_mmap not being called, causing
exit_mmap to die.  Rather than figure out what was really happening, I decided
it was cleaner to just move the calls so that there's no doubt that both the
pte and VMA creation happen, no matter what.  arch_exit_mmap is used to clear
the stub ptes at exit time.

The STUB_* constants in as-layout.h no longer depend on UM_TASK_SIZE, that
that definition is removed, along with the comments complaining about gcc.

Because the stubs are no longer at the top of the address space, some care is
needed while flushing TLBs.  update_pte_range checks for addresses in the stub
range and skips them.  flush_thread now issues two unmaps, one for the range
before STUB_START and one for the range after STUB_END.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/as-layout.h      |  19 ++-----
 arch/um/include/common-offsets.h |   3 --
 arch/um/kernel/exec.c            |   5 +-
 arch/um/kernel/skas/mmu.c        | 104 ++++++++++++++++++++++-----------------
 arch/um/kernel/tlb.c             |  11 ++---
 include/asm-um/mmu_context.h     |   7 ++-
 6 files changed, 75 insertions(+), 74 deletions(-)

diff --git a/arch/um/include/as-layout.h b/arch/um/include/as-layout.h
index a2008f5..606bb5c 100644
--- a/arch/um/include/as-layout.h
+++ b/arch/um/include/as-layout.h
@@ -29,21 +29,10 @@
 #define _AC(X, Y)	__AC(X, Y)
 #endif
 
-/*
- * The "- 1"'s are to avoid gcc complaining about integer overflows
- * and unrepresentable decimal constants.  With 3-level page tables,
- * TASK_SIZE is 0x80000000, which gets turned into its signed decimal
- * equivalent in asm-offsets.s.  gcc then complains about that being
- * unsigned only in C90.  To avoid that, UM_TASK_SIZE is defined as
- * TASK_SIZE - 1.  To compensate, we need to add the 1 back here.
- * However, adding it back to UM_TASK_SIZE produces more gcc
- * complaints.  So, I adjust the thing being subtracted from
- * UM_TASK_SIZE instead.  Bah.
- */
-#define STUB_CODE _AC((unsigned long), \
-		      UM_TASK_SIZE - (2 * UM_KERN_PAGE_SIZE - 1))
-#define STUB_DATA _AC((unsigned long), UM_TASK_SIZE - (UM_KERN_PAGE_SIZE - 1))
-#define STUB_START _AC(, STUB_CODE)
+#define STUB_START _AC(, 0x100000)
+#define STUB_CODE _AC((unsigned long), STUB_START)
+#define STUB_DATA _AC((unsigned long), STUB_CODE + UM_KERN_PAGE_SIZE)
+#define STUB_END _AC((unsigned long), STUB_DATA + UM_KERN_PAGE_SIZE)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/um/include/common-offsets.h b/arch/um/include/common-offsets.h
index 5b67d7c..b54bd35 100644
--- a/arch/um/include/common-offsets.h
+++ b/arch/um/include/common-offsets.h
@@ -39,6 +39,3 @@ DEFINE(UM_HZ, HZ);
 DEFINE(UM_USEC_PER_SEC, USEC_PER_SEC);
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
-
-/* See as-layout.h for an explanation of the "- 1".  Bah. */
-DEFINE(UM_TASK_SIZE, TASK_SIZE - 1);
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index bf66b5b..76a62c0 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -19,12 +19,13 @@
 void flush_thread(void)
 {
 	void *data = NULL;
-	unsigned long end = proc_mm ? TASK_SIZE : STUB_START;
 	int ret;
 
 	arch_flush_thread(&current->thread.arch);
 
-	ret = unmap(&current->mm->context.id, 0, end, 1, &data);
+	ret = unmap(&current->mm->context.id, 0, STUB_START, 0, &data);
+	ret = ret || unmap(&current->mm->context.id, STUB_END,
+			   TASK_SIZE - STUB_END, 1, &data);
 	if (ret) {
 		printk(KERN_ERR "flush_thread - clearing address space failed, "
 		       "err = %d\n", ret);
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 6da9ab4..e8dc854 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -34,25 +34,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	if (!pte)
 		goto out_pte;
 
-	/*
-	 * There's an interaction between the skas0 stub pages, stack
-	 * randomization, and the BUG at the end of exit_mmap.  exit_mmap
-	 * checks that the number of page tables freed is the same as had
-	 * been allocated.  If the stack is on the last page table page,
-	 * then the stack pte page will be freed, and if not, it won't.  To
-	 * avoid having to know where the stack is, or if the process mapped
-	 * something at the top of its address space for some other reason,
-	 * we set TASK_SIZE to end at the start of the last page table.
-	 * This keeps exit_mmap off the last page, but introduces a leak
-	 * of that page.  So, we hang onto it here and free it in
-	 * destroy_context_skas.
-	 */
-
-	mm->context.last_page_table = pmd_page_vaddr(*pmd);
-#ifdef CONFIG_3_LEVEL_PGTABLES
-	mm->context.last_pmd = (unsigned long) __va(pud_val(*pud));
-#endif
-
 	*pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
 	*pte = pte_mkread(*pte);
 	return 0;
@@ -76,24 +57,6 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 		stack = get_zeroed_page(GFP_KERNEL);
 		if (stack == 0)
 			goto out;
-
-		/*
-		 * This zeros the entry that pgd_alloc didn't, needed since
-		 * we are about to reinitialize it, and want mm.nr_ptes to
-		 * be accurate.
-		 */
-		mm->pgd[USER_PTRS_PER_PGD] = __pgd(0);
-
-		ret = init_stub_pte(mm, STUB_CODE,
-				    (unsigned long) &__syscall_stub_start);
-		if (ret)
-			goto out_free;
-
-		ret = init_stub_pte(mm, STUB_DATA, stack);
-		if (ret)
-			goto out_free;
-
-		mm->nr_ptes--;
 	}
 
 	to_mm->id.stack = stack;
@@ -137,6 +100,64 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 	return ret;
 }
 
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	struct page **pages;
+	int err, ret;
+
+	if (!skas_needs_stub)
+		return;
+
+	ret = init_stub_pte(mm, STUB_CODE,
+			    (unsigned long) &__syscall_stub_start);
+	if (ret)
+		goto out;
+
+	ret = init_stub_pte(mm, STUB_DATA, mm->context.id.stack);
+	if (ret)
+		goto out;
+
+	pages = kmalloc(2 * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		printk(KERN_ERR "arch_dup_mmap failed to allocate 2 page "
+		       "pointers\n");
+		goto out;
+	}
+
+	pages[0] = virt_to_page(&__syscall_stub_start);
+	pages[1] = virt_to_page(mm->context.id.stack);
+
+	/* dup_mmap already holds mmap_sem */
+	err = install_special_mapping(mm, STUB_START, STUB_END - STUB_START,
+				      VM_READ | VM_MAYREAD | VM_EXEC |
+				      VM_MAYEXEC | VM_DONTCOPY, pages);
+	if (err) {
+		printk(KERN_ERR "install_special_mapping returned %d\n", err);
+		goto out_free;
+	}
+	return;
+
+out_free:
+	kfree(pages);
+out:
+	force_sigsegv(SIGSEGV, current);
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+	pte_t *pte;
+
+	pte = virt_to_pte(mm, STUB_CODE);
+	if (pte != NULL)
+		pte_clear(mm, STUB_CODE, pte);
+
+	pte = virt_to_pte(mm, STUB_DATA);
+	if (pte == NULL)
+		return;
+
+	pte_clear(mm, STUB_DATA, pte);
+}
+
 void destroy_context(struct mm_struct *mm)
 {
 	struct mm_context *mmu = &mm->context;
@@ -146,15 +167,8 @@ void destroy_context(struct mm_struct *mm)
 	else
 		os_kill_ptraced_process(mmu->id.u.pid, 1);
 
-	if (!proc_mm || !ptrace_faultinfo) {
+	if (skas_needs_stub)
 		free_page(mmu->id.stack);
-		pte_lock_deinit(virt_to_page(mmu->last_page_table));
-		pte_free_kernel(mm, (pte_t *) mmu->last_page_table);
-		dec_zone_page_state(virt_to_page(mmu->last_page_table), NR_PAGETABLE);
-#ifdef CONFIG_3_LEVEL_PGTABLES
-		pmd_free(mm, (pmd_t *) mmu->last_pmd);
-#endif
-	}
 
 	free_ldt(mmu);
 }
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 429fed2..ef5a2a2 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -184,6 +184,9 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
 
 	pte = pte_offset_kernel(pmd, addr);
 	do {
+		if ((addr >= STUB_START) && (addr < STUB_END))
+			continue;
+
 		r = pte_read(*pte);
 		w = pte_write(*pte);
 		x = pte_exec(*pte);
@@ -486,9 +489,6 @@ void __flush_tlb_one(unsigned long addr)
 static void fix_range(struct mm_struct *mm, unsigned long start_addr,
 		      unsigned long end_addr, int force)
 {
-	if (!proc_mm && (end_addr > STUB_START))
-		end_addr = STUB_START;
-
 	fix_range_common(mm, start_addr, end_addr, force);
 }
 
@@ -502,8 +502,6 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 
 void flush_tlb_mm(struct mm_struct *mm)
 {
-	unsigned long end;
-
 	/*
 	 * Don't bother flushing if this address space is about to be
 	 * destroyed.
@@ -511,8 +509,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 	if (atomic_read(&mm->mm_users) == 0)
 		return;
 
-	end = proc_mm ? TASK_SIZE : STUB_START;
-	fix_range(mm, 0, end, 0);
+	fix_range(mm, 0, TASK_SIZE, 0);
 }
 
 void force_flush_all(void)
diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h
index 5f3b863..6686fc5 100644
--- a/include/asm-um/mmu_context.h
+++ b/include/asm-um/mmu_context.h
@@ -6,11 +6,12 @@
 #ifndef __UM_MMU_CONTEXT_H
 #define __UM_MMU_CONTEXT_H
 
-#include <asm-generic/mm_hooks.h>
-
 #include "linux/sched.h"
 #include "um_mmu.h"
 
+extern void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+extern void arch_exit_mmap(struct mm_struct *mm);
+
 #define get_mmu_context(task) do ; while(0)
 #define activate_context(tsk) do ; while(0)
 
@@ -30,6 +31,8 @@ static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
 	 */
 	if (old != new && (current->flags & PF_BORROWED_MM))
 		__switch_mm(&new->context.id);
+
+	arch_dup_mmap(old, new);
 }
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
-- 
cgit v1.1


From 260c0cb886c304b760835b6033bf876a7701eb91 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:02 -0800
Subject: uml: fx command-line CFLAGS and LDFLAGS support

UML still needed some work in order to allow CFLAGS to be passed in from the
command line.

USER_CFLAGS is produced from KBUILD_CFLAGS in part by removing all the -I
switches.  This is so that kernel headers don't accidentally get pulled into
libc files.  However, a common use of command-line CFLAGS would be to add -I
switches to the build.  This patch specifically adds any command-line -I flags
back to USER_CFLAGS.

I also corrected the spelling of LFLAGS to LDFLAGS.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/um/Makefile b/arch/um/Makefile
index e44fd6a..fb8854a 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -49,7 +49,7 @@ SYS_DIR		:= $(ARCH_DIR)/include/sysdep-$(SUBARCH)
 #
 # These apply to USER_CFLAGS to.
 
-KBUILD_CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\"	\
+KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \
 	$(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap	\
 	-Din6addr_loopback=kernel_in6addr_loopback \
 	-Din6addr_any=kernel_in6addr_any
@@ -58,7 +58,7 @@ KBUILD_AFLAGS += $(ARCH_INCLUDE)
 
 USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\
 	$(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \
-	-D_FILE_OFFSET_BITS=64
+	$(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64
 
 include $(srctree)/$(ARCH_DIR)/Makefile-$(SUBARCH)
 
@@ -130,7 +130,7 @@ CPPFLAGS_vmlinux.lds = -U$(SUBARCH) -DSTART=$(START) -DELF_ARCH=$(ELF_ARCH) \
 # The wrappers will select whether using "malloc" or the kernel allocator.
 LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
 
-LD_FLAGS_CMDLINE = $(foreach opt,$(LFLAGS),-Wl,$(opt))
+LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt))
 
 CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE)
 define cmd_vmlinux__
-- 
cgit v1.1


From fee64d3c153f1d5c28f91214b4d0db54d3f1fe0a Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:02 -0800
Subject: uml: syle fixes in arch/um/os-Linux

Style fixes in arch/um/os-Linux/irq.c and arch/um/os-Linux/sigio.c:
	Updated copyrights
	trimmed includes
	added severity indicators to printks
	CodingStyle fixes
	turned an bunch of panics into printks
	call some libc functions directly instead of going through the
os_* wrappers

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/irq.c   |  27 +++---
 arch/um/os-Linux/sigio.c | 241 +++++++++++++++++++++++++++--------------------
 2 files changed, 150 insertions(+), 118 deletions(-)

diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index a26e066..430866c 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,22 +1,19 @@
 /*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
 #include <stdlib.h>
-#include <unistd.h>
 #include <errno.h>
+#include <poll.h>
 #include <signal.h>
 #include <string.h>
-#include <sys/poll.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include "user.h"
-#include "process.h"
-#include "sigio.h"
 #include "irq_user.h"
+#include "kern_constants.h"
 #include "os.h"
+#include "process.h"
 #include "um_malloc.h"
+#include "user.h"
 
 /*
  * Locked by irq_lock in arch/um/kernel/irq.c.  Changed by os_create_pollfd
@@ -35,7 +32,7 @@ int os_waiting_for_events(struct irq_fd *active_fds)
 	if (n < 0) {
 		err = -errno;
 		if (errno != EINTR)
-			printk("sigio_handler: os_waiting_for_events:"
+			printk(UM_KERN_ERR "os_waiting_for_events:"
 			       " poll returned %d, errno = %d\n", n, errno);
 		return err;
 	}
@@ -94,24 +91,26 @@ void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
 			struct irq_fd *old_fd = *prev;
 			if ((pollfds[i].fd != -1) &&
 			    (pollfds[i].fd != (*prev)->fd)) {
-				printk("os_free_irq_by_cb - mismatch between "
-				       "active_fds and pollfds, fd %d vs %d\n",
+				printk(UM_KERN_ERR "os_free_irq_by_cb - "
+				       "mismatch between active_fds and "
+				       "pollfds, fd %d vs %d\n",
 				       (*prev)->fd, pollfds[i].fd);
 				goto out;
 			}
 
 			pollfds_num--;
 
-			/* This moves the *whole* array after pollfds[i]
+			/*
+			 * This moves the *whole* array after pollfds[i]
 			 * (though it doesn't spot as such)!
 			 */
 			memmove(&pollfds[i], &pollfds[i + 1],
 			       (pollfds_num - i) * sizeof(pollfds[0]));
-			if(*last_irq_ptr2 == &old_fd->next)
+			if (*last_irq_ptr2 == &old_fd->next)
 				*last_irq_ptr2 = prev;
 
 			*prev = (*prev)->next;
-			if(old_fd->type == IRQ_WRITE)
+			if (old_fd->type == IRQ_WRITE)
 				ignore_sigio_fd(old_fd->fd);
 			kfree(old_fd);
 			continue;
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 6443809..abf47a7c 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -1,33 +1,33 @@
 /*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
 #include <unistd.h>
-#include <stdlib.h>
-#include <termios.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
 #include <pty.h>
+#include <sched.h>
 #include <signal.h>
-#include <fcntl.h>
-#include <errno.h>
 #include <string.h>
-#include <sched.h>
-#include <sys/socket.h>
-#include <sys/poll.h>
-#include "init.h"
-#include "user.h"
+#include "kern_constants.h"
 #include "kern_util.h"
-#include "sigio.h"
+#include "init.h"
 #include "os.h"
+#include "sigio.h"
 #include "um_malloc.h"
+#include "user.h"
 
-/* Protected by sigio_lock(), also used by sigio_cleanup, which is an
+/*
+ * Protected by sigio_lock(), also used by sigio_cleanup, which is an
  * exitcall.
  */
 static int write_sigio_pid = -1;
 static unsigned long write_sigio_stack;
 
-/* These arrays are initialized before the sigio thread is started, and
+/*
+ * These arrays are initialized before the sigio thread is started, and
  * the descriptors closed after it is killed.  So, it can't see them change.
  * On the UML side, they are changed under the sigio_lock.
  */
@@ -42,7 +42,8 @@ struct pollfds {
 	int used;
 };
 
-/* Protected by sigio_lock().  Used by the sigio thread, but the UML thread
+/*
+ * Protected by sigio_lock().  Used by the sigio thread, but the UML thread
  * synchronizes with it.
  */
 static struct pollfds current_poll;
@@ -56,23 +57,26 @@ static int write_sigio_thread(void *unused)
 	int i, n, respond_fd;
 	char c;
 
-        signal(SIGWINCH, SIG_IGN);
+	signal(SIGWINCH, SIG_IGN);
 	fds = &current_poll;
-	while(1){
+	while (1) {
 		n = poll(fds->poll, fds->used, -1);
-		if(n < 0){
-			if(errno == EINTR) continue;
-			printk("write_sigio_thread : poll returned %d, "
-			       "errno = %d\n", n, errno);
+		if (n < 0) {
+			if (errno == EINTR)
+				continue;
+			printk(UM_KERN_ERR "write_sigio_thread : poll returned "
+			       "%d, errno = %d\n", n, errno);
 		}
-		for(i = 0; i < fds->used; i++){
+		for (i = 0; i < fds->used; i++) {
 			p = &fds->poll[i];
-			if(p->revents == 0) continue;
-			if(p->fd == sigio_private[1]){
+			if (p->revents == 0)
+				continue;
+			if (p->fd == sigio_private[1]) {
 				CATCH_EINTR(n = read(sigio_private[1], &c,
 						     sizeof(c)));
-				if(n != sizeof(c))
-					printk("write_sigio_thread : "
+				if (n != sizeof(c))
+					printk(UM_KERN_ERR
+					       "write_sigio_thread : "
 					       "read on socket failed, "
 					       "err = %d\n", errno);
 				tmp = current_poll;
@@ -88,9 +92,10 @@ static int write_sigio_thread(void *unused)
 			}
 
 			CATCH_EINTR(n = write(respond_fd, &c, sizeof(c)));
-			if(n != sizeof(c))
-				printk("write_sigio_thread : write on socket "
-				       "failed, err = %d\n", errno);
+			if (n != sizeof(c))
+				printk(UM_KERN_ERR "write_sigio_thread : "
+				       "write on socket failed, err = %d\n",
+				       errno);
 		}
 	}
 
@@ -101,12 +106,13 @@ static int need_poll(struct pollfds *polls, int n)
 {
 	struct pollfd *new;
 
-	if(n <= polls->size)
+	if (n <= polls->size)
 		return 0;
 
 	new = kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC);
-	if(new == NULL){
-		printk("need_poll : failed to allocate new pollfds\n");
+	if (new == NULL) {
+		printk(UM_KERN_ERR "need_poll : failed to allocate new "
+		       "pollfds\n");
 		return -ENOMEM;
 	}
 
@@ -118,7 +124,8 @@ static int need_poll(struct pollfds *polls, int n)
 	return 0;
 }
 
-/* Must be called with sigio_lock held, because it's needed by the marked
+/*
+ * Must be called with sigio_lock held, because it's needed by the marked
  * critical section.
  */
 static void update_thread(void)
@@ -128,15 +135,17 @@ static void update_thread(void)
 	char c;
 
 	flags = set_signals(0);
-	n = write(sigio_private[0], &c, sizeof(c));
-	if(n != sizeof(c)){
-		printk("update_thread : write failed, err = %d\n", errno);
+	CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c)));
+	if (n != sizeof(c)) {
+		printk(UM_KERN_ERR "update_thread : write failed, err = %d\n",
+		       errno);
 		goto fail;
 	}
 
 	CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c)));
-	if(n != sizeof(c)){
-		printk("update_thread : read failed, err = %d\n", errno);
+	if (n != sizeof(c)) {
+		printk(UM_KERN_ERR "update_thread : read failed, err = %d\n",
+		       errno);
 		goto fail;
 	}
 
@@ -163,23 +172,23 @@ int add_sigio_fd(int fd)
 	int err = 0, i, n;
 
 	sigio_lock();
-	for(i = 0; i < all_sigio_fds.used; i++){
-		if(all_sigio_fds.poll[i].fd == fd)
+	for (i = 0; i < all_sigio_fds.used; i++) {
+		if (all_sigio_fds.poll[i].fd == fd)
 			break;
 	}
-	if(i == all_sigio_fds.used)
+	if (i == all_sigio_fds.used)
 		goto out;
 
 	p = &all_sigio_fds.poll[i];
 
-	for(i = 0; i < current_poll.used; i++){
-		if(current_poll.poll[i].fd == fd)
+	for (i = 0; i < current_poll.used; i++) {
+		if (current_poll.poll[i].fd == fd)
 			goto out;
 	}
 
 	n = current_poll.used;
 	err = need_poll(&next_poll, n + 1);
-	if(err)
+	if (err)
 		goto out;
 
 	memcpy(next_poll.poll, current_poll.poll,
@@ -197,27 +206,29 @@ int ignore_sigio_fd(int fd)
 	struct pollfd *p;
 	int err = 0, i, n = 0;
 
-	/* This is called from exitcalls elsewhere in UML - if
+	/*
+	 * This is called from exitcalls elsewhere in UML - if
 	 * sigio_cleanup has already run, then update_thread will hang
 	 * or fail because the thread is no longer running.
 	 */
-	if(write_sigio_pid == -1)
+	if (write_sigio_pid == -1)
 		return -EIO;
 
 	sigio_lock();
-	for(i = 0; i < current_poll.used; i++){
-		if(current_poll.poll[i].fd == fd) break;
+	for (i = 0; i < current_poll.used; i++) {
+		if (current_poll.poll[i].fd == fd)
+			break;
 	}
-	if(i == current_poll.used)
+	if (i == current_poll.used)
 		goto out;
 
 	err = need_poll(&next_poll, current_poll.used - 1);
-	if(err)
+	if (err)
 		goto out;
 
-	for(i = 0; i < current_poll.used; i++){
+	for (i = 0; i < current_poll.used; i++) {
 		p = &current_poll.poll[i];
-		if(p->fd != fd)
+		if (p->fd != fd)
 			next_poll.poll[n++] = *p;
 	}
 	next_poll.used = current_poll.used - 1;
@@ -234,7 +245,8 @@ static struct pollfd *setup_initial_poll(int fd)
 
 	p = kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL);
 	if (p == NULL) {
-		printk("setup_initial_poll : failed to allocate poll\n");
+		printk(UM_KERN_ERR "setup_initial_poll : failed to allocate "
+		       "poll\n");
 		return NULL;
 	}
 	*p = ((struct pollfd) { .fd		= fd,
@@ -260,27 +272,29 @@ static void write_sigio_workaround(void)
 		return;
 
 	err = os_pipe(l_write_sigio_fds, 1, 1);
-	if(err < 0){
-		printk("write_sigio_workaround - os_pipe 1 failed, "
+	if (err < 0) {
+		printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, "
 		       "err = %d\n", -err);
 		return;
 	}
 	err = os_pipe(l_sigio_private, 1, 1);
-	if(err < 0){
-		printk("write_sigio_workaround - os_pipe 2 failed, "
+	if (err < 0) {
+		printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, "
 		       "err = %d\n", -err);
 		goto out_close1;
 	}
 
 	p = setup_initial_poll(l_sigio_private[1]);
-	if(!p)
+	if (!p)
 		goto out_close2;
 
 	sigio_lock();
 
-	/* Did we race? Don't try to optimize this, please, it's not so likely
-	 * to happen, and no more than once at the boot. */
-	if(write_sigio_pid != -1)
+	/*
+	 * Did we race? Don't try to optimize this, please, it's not so likely
+	 * to happen, and no more than once at the boot.
+	 */
+	if (write_sigio_pid != -1)
 		goto out_free;
 
 	current_poll = ((struct pollfds) { .poll 	= p,
@@ -332,19 +346,19 @@ void maybe_sigio_broken(int fd, int read)
 {
 	int err;
 
-	if(!isatty(fd))
+	if (!isatty(fd))
 		return;
 
-	if((read || pty_output_sigio) && (!read || pty_close_sigio))
+	if ((read || pty_output_sigio) && (!read || pty_close_sigio))
 		return;
 
 	write_sigio_workaround();
 
 	sigio_lock();
 	err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1);
-	if(err){
-		printk("maybe_sigio_broken - failed to add pollfd for "
-		       "descriptor %d\n", fd);
+	if (err) {
+		printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd "
+		       "for descriptor %d\n", fd);
 		goto out;
 	}
 
@@ -387,7 +401,7 @@ static void openpty_cb(void *arg)
 	struct openpty_arg *info = arg;
 
 	info->err = 0;
-	if(openpty(&info->master, &info->slave, NULL, NULL, NULL))
+	if (openpty(&info->master, &info->slave, NULL, NULL, NULL))
 		info->err = -errno;
 }
 
@@ -396,14 +410,14 @@ static int async_pty(int master, int slave)
 	int flags;
 
 	flags = fcntl(master, F_GETFL);
-	if(flags < 0)
+	if (flags < 0)
 		return -errno;
 
-	if((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) ||
-	   (fcntl(master, F_SETOWN, os_getpid()) < 0))
+	if ((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) ||
+	    (fcntl(master, F_SETOWN, os_getpid()) < 0))
 		return -errno;
 
-	if((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0))
+	if ((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0))
 		return -errno;
 
 	return 0;
@@ -416,34 +430,49 @@ static void __init check_one_sigio(void (*proc)(int, int))
 	int master, slave, err;
 
 	initial_thread_cb(openpty_cb, &pty);
-	if(pty.err){
-		printk("openpty failed, errno = %d\n", -pty.err);
+	if (pty.err) {
+		printk(UM_KERN_ERR "check_one_sigio failed, errno = %d\n",
+		       -pty.err);
 		return;
 	}
 
 	master = pty.master;
 	slave = pty.slave;
 
-	if((master == -1) || (slave == -1)){
-		printk("openpty failed to allocate a pty\n");
+	if ((master == -1) || (slave == -1)) {
+		printk(UM_KERN_ERR "check_one_sigio failed to allocate a "
+		       "pty\n");
 		return;
 	}
 
 	/* Not now, but complain so we now where we failed. */
 	err = raw(master);
-	if (err < 0)
-		panic("check_sigio : __raw failed, errno = %d\n", -err);
+	if (err < 0) {
+		printk(UM_KERN_ERR "check_one_sigio : raw failed, errno = %d\n",
+		      -err);
+		return;
+	}
 
 	err = async_pty(master, slave);
-	if(err < 0)
-		panic("tty_fds : sigio_async failed, err = %d\n", -err);
+	if (err < 0) {
+		printk(UM_KERN_ERR "check_one_sigio : sigio_async failed, "
+		       "err = %d\n", -err);
+		return;
+	}
+
+	if (sigaction(SIGIO, NULL, &old) < 0) {
+		printk(UM_KERN_ERR "check_one_sigio : sigaction 1 failed, "
+		       "errno = %d\n", errno);
+		return;
+	}
 
-	if(sigaction(SIGIO, NULL, &old) < 0)
-		panic("check_sigio : sigaction 1 failed, errno = %d\n", errno);
 	new = old;
 	new.sa_handler = handler;
-	if(sigaction(SIGIO, &new, NULL) < 0)
-		panic("check_sigio : sigaction 2 failed, errno = %d\n", errno);
+	if (sigaction(SIGIO, &new, NULL) < 0) {
+		printk(UM_KERN_ERR "check_one_sigio : sigaction 2 failed, "
+		       "errno = %d\n", errno);
+		return;
+	}
 
 	got_sigio = 0;
 	(*proc)(master, slave);
@@ -451,8 +480,9 @@ static void __init check_one_sigio(void (*proc)(int, int))
 	close(master);
 	close(slave);
 
-	if(sigaction(SIGIO, &old, NULL) < 0)
-		panic("check_sigio : sigaction 3 failed, errno = %d\n", errno);
+	if (sigaction(SIGIO, &old, NULL) < 0)
+		printk(UM_KERN_ERR "check_one_sigio : sigaction 3 failed, "
+		       "errno = %d\n", errno);
 }
 
 static void tty_output(int master, int slave)
@@ -460,42 +490,45 @@ static void tty_output(int master, int slave)
 	int n;
 	char buf[512];
 
-	printk("Checking that host ptys support output SIGIO...");
+	printk(UM_KERN_INFO "Checking that host ptys support output SIGIO...");
 
 	memset(buf, 0, sizeof(buf));
 
-	while(write(master, buf, sizeof(buf)) > 0) ;
-	if(errno != EAGAIN)
-		panic("tty_output : write failed, errno = %d\n", errno);
-	while(((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio) ;
+	while (write(master, buf, sizeof(buf)) > 0) ;
+	if (errno != EAGAIN)
+		printk(UM_KERN_ERR "tty_output : write failed, errno = %d\n",
+		       errno);
+	while (((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio)
+		;
 
-	if(got_sigio){
-		printk("Yes\n");
+	if (got_sigio) {
+		printk(UM_KERN_CONT "Yes\n");
 		pty_output_sigio = 1;
-	}
-	else if(n == -EAGAIN)
-		printk("No, enabling workaround\n");
-	else panic("tty_output : read failed, err = %d\n", n);
+	} else if (n == -EAGAIN)
+		printk(UM_KERN_CONT "No, enabling workaround\n");
+	else
+		printk(UM_KERN_CONT "tty_output : read failed, err = %d\n", n);
 }
 
 static void tty_close(int master, int slave)
 {
-	printk("Checking that host ptys support SIGIO on close...");
+	printk(UM_KERN_INFO "Checking that host ptys support SIGIO on "
+	       "close...");
 
 	close(slave);
-	if(got_sigio){
-		printk("Yes\n");
+	if (got_sigio) {
+		printk(UM_KERN_CONT "Yes\n");
 		pty_close_sigio = 1;
-	}
-	else printk("No, enabling workaround\n");
+	} else
+		printk(UM_KERN_CONT "No, enabling workaround\n");
 }
 
 void __init check_sigio(void)
 {
-	if((os_access("/dev/ptmx", OS_ACC_R_OK) < 0) &&
-	   (os_access("/dev/ptyp0", OS_ACC_R_OK) < 0)){
-		printk("No pseudo-terminals available - skipping pty SIGIO "
-		       "check\n");
+	if ((access("/dev/ptmx", R_OK) < 0) &&
+	    (access("/dev/ptyp0", R_OK) < 0)) {
+		printk(UM_KERN_WARNING "No pseudo-terminals available - "
+		       "skipping pty SIGIO check\n");
 		return;
 	}
 	check_one_sigio(tty_output);
-- 
cgit v1.1


From 2dc5802a22d68d83ef4c3d616912949a6527bb65 Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:31:03 -0800
Subject: uml: remove duplicate config symbol and unused file and variables

Fix the repetition of the NET symbol.  It was once in UML specific options and
once in networking.  I removed the first occurrence, as it makes more sense to
me to keep it only in networking.

It also removes a mostly empty file which is not used anymore and some
unused variables.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Kconfig                 | 17 -----------------
 arch/um/Makefile-tt             |  5 -----
 arch/um/drivers/mconsole_kern.c |  3 ---
 arch/um/kernel/process.c        |  4 ++--
 4 files changed, 2 insertions(+), 27 deletions(-)
 delete mode 100644 arch/um/Makefile-tt

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 58f5a14..a967d95 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -95,23 +95,6 @@ config LD_SCRIPT_DYN
 	default y
 	depends on !LD_SCRIPT_STATIC
 
-config NET
-	bool "Networking support"
-	help
-	  Unless you really know what you are doing, you should say Y here.
-	  The reason is that some programs need kernel networking support even
-	  when running on a stand-alone machine that isn't connected to any
-	  other computer. If you are upgrading from an older kernel, you
-	  should consider updating your networking tools too because changes
-	  in the kernel and the tools often go hand in hand. The tools are
-	  contained in the package net-tools, the location and version number
-	  of which are given in <file:Documentation/Changes>.
-
-	  For a general introduction to Linux networking, it is highly
-	  recommended to read the NET-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>.
-
-
 source "fs/Kconfig.binfmt"
 
 config HOSTFS
diff --git a/arch/um/Makefile-tt b/arch/um/Makefile-tt
deleted file mode 100644
index 03f7b10..0000000
--- a/arch/um/Makefile-tt
+++ /dev/null
@@ -1,5 +0,0 @@
-# 
-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
-# Licensed under the GPL
-#
-
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 0f3c7d1..fabd75f 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -741,7 +741,6 @@ void mconsole_stack(struct mc_request *req)
 {
 	char *ptr = req->request.data;
 	int pid_requested= -1;
-	struct task_struct *from = NULL;
 	struct task_struct *to = NULL;
 
 	/*
@@ -763,8 +762,6 @@ void mconsole_stack(struct mc_request *req)
 		return;
 	}
 
-	from = current;
-
 	to = find_task_by_pid(pid_requested);
 	if ((to == NULL) || (pid_requested == 0)) {
 		mconsole_reply(req, "Couldn't find that pid", 1, 0);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 7a29123..e6d89ad 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -30,7 +30,7 @@
  */
 struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } };
 
-static inline int external_pid(struct task_struct *task)
+static inline int external_pid(void)
 {
 	/* FIXME: Need to look up userspace_pid by cpu */
 	return userspace_pid[0];
@@ -78,7 +78,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 static inline void set_current(struct task_struct *task)
 {
 	cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task)
-		{ external_pid(task), task });
+		{ external_pid(), task });
 }
 
 extern void arch_switch_to(struct task_struct *to);
-- 
cgit v1.1


From 7b5cc6ee6cf9001775c348bac09814a45f1276b7 Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:31:04 -0800
Subject: uml: fix mconsole stop

Bring back the functionality of stopping user mode linux with the help of
mconsole.

[jdike - the bug being fixed is that the mconsole file descriptor is already
set O_NONBLOCK or not, depending on whether we want no blocking (the normal
case) or we want blocking (when an mconsole stop is in effect), so the
MSG_DONTWAIT is redundant in the normal case, and wrong when we want to
block.]

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/mconsole_user.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
index 430c024..13af2f0 100644
--- a/arch/um/drivers/mconsole_user.c
+++ b/arch/um/drivers/mconsole_user.c
@@ -83,9 +83,8 @@ int mconsole_get_request(int fd, struct mc_request *req)
 	int len;
 
 	req->originlen = sizeof(req->origin);
-	req->len = recvfrom(fd, &req->request, sizeof(req->request),
-			    MSG_DONTWAIT, (struct sockaddr *) req->origin,
-			    &req->originlen);
+	req->len = recvfrom(fd, &req->request, sizeof(req->request), 0,
+			    (struct sockaddr *) req->origin, &req->originlen);
 	if (req->len < 0)
 		return 0;
 
-- 
cgit v1.1


From bf8fde785b872282e7e86d9ea8a9c4e543985bb3 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:04 -0800
Subject: uml: miscellaneous code cleanups

Code tidying -
	the pid field of struct irq_fd isn't used, so it is removed
     	os_set_fd_async needed to read flags before changing them, it
doesn't need a pid passed in because it can call getpid itself, and a
block of unused code needed deleting
	os_get_exec_close was unused, so it is removed
	ptrace_child called _exit for historical reasons which are no
longer valid, so just calls exit instead

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/irq_user.h  |  1 -
 arch/um/include/os.h        |  3 +--
 arch/um/kernel/irq.c        |  6 ++----
 arch/um/kernel/ksyms.c      |  1 -
 arch/um/kernel/smp.c        |  6 ++----
 arch/um/os-Linux/file.c     | 38 +++++++++++---------------------------
 arch/um/os-Linux/start_up.c |  3 ++-
 7 files changed, 18 insertions(+), 40 deletions(-)

diff --git a/arch/um/include/irq_user.h b/arch/um/include/irq_user.h
index 884a9c1..e60b318 100644
--- a/arch/um/include/irq_user.h
+++ b/arch/um/include/irq_user.h
@@ -14,7 +14,6 @@ struct irq_fd {
 	int fd;
 	int type;
 	int irq;
-	int pid;
 	int events;
 	int current_events;
 };
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 9428d34..fdac1a5 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -127,7 +127,6 @@ static inline struct openflags of_cloexec(struct openflags flags)
 extern int os_stat_file(const char *file_name, struct uml_stat *buf);
 extern int os_stat_fd(const int fd, struct uml_stat *buf);
 extern int os_access(const char *file, int mode);
-extern int os_get_exec_close(int fd, int *close_on_exec);
 extern int os_set_exec_close(int fd);
 extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg);
 extern int os_get_ifname(int fd, char *namebuf);
@@ -142,7 +141,7 @@ extern int os_write_file(int fd, const void *buf, int count);
 extern int os_file_size(const char *file, unsigned long long *size_out);
 extern int os_file_modtime(const char *file, unsigned long *modtime);
 extern int os_pipe(int *fd, int stream, int close_on_exec);
-extern int os_set_fd_async(int fd, int owner);
+extern int os_set_fd_async(int fd);
 extern int os_clear_fd_async(int fd);
 extern int os_set_fd_block(int fd, int blocking);
 extern int os_accept_connection(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index ba11ccd..91587f8 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -107,10 +107,9 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 	struct pollfd *tmp_pfd;
 	struct irq_fd *new_fd, *irq_fd;
 	unsigned long flags;
-	int pid, events, err, n;
+	int events, err, n;
 
-	pid = os_getpid();
-	err = os_set_fd_async(fd, pid);
+	err = os_set_fd_async(fd);
 	if (err < 0)
 		goto out;
 
@@ -127,7 +126,6 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 				     .fd 		= fd,
 				     .type 		= type,
 				     .irq 		= irq,
-				     .pid  		= pid,
 				     .events 		= events,
 				     .current_events 	= 0 } );
 
diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
index 187a756..5311ee9 100644
--- a/arch/um/kernel/ksyms.c
+++ b/arch/um/kernel/ksyms.c
@@ -36,7 +36,6 @@ EXPORT_SYMBOL(uml_strdup);
 EXPORT_SYMBOL(os_stat_fd);
 EXPORT_SYMBOL(os_stat_file);
 EXPORT_SYMBOL(os_access);
-EXPORT_SYMBOL(os_get_exec_close);
 EXPORT_SYMBOL(os_set_exec_close);
 EXPORT_SYMBOL(os_getpid);
 EXPORT_SYMBOL(os_open_file);
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index a12e5bd..869ac68 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -74,8 +74,7 @@ static int idle_proc(void *cpup)
 	if (err < 0)
 		panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err);
 
-	os_set_fd_async(cpu_data[cpu].ipi_pipe[0],
-		     current->thread.mode.tt.extern_pid);
+	os_set_fd_async(cpu_data[cpu].ipi_pipe[0]);
 
 	wmb();
 	if (cpu_test_and_set(cpu, cpu_callin_map)) {
@@ -128,8 +127,7 @@ void smp_prepare_cpus(unsigned int maxcpus)
 	if (err < 0)
 		panic("CPU#0 failed to create IPI pipe, errno = %d", -err);
 
-	os_set_fd_async(cpu_data[me].ipi_pipe[0],
-		     current->thread.mode.tt.extern_pid);
+	os_set_fd_async(cpu_data[me].ipi_pipe[0]);
 
 	for (cpu = 1; cpu < ncpus; cpu++) {
 		printk(KERN_INFO "Booting processor %d...\n", cpu);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 9387cb1..4f547d7 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -328,19 +328,6 @@ int os_file_modtime(const char *file, unsigned long *modtime)
 	return 0;
 }
 
-int os_get_exec_close(int fd, int *close_on_exec)
-{
-	int ret;
-
-	CATCH_EINTR(ret = fcntl(fd, F_GETFD));
-
-	if(ret < 0)
-		return -errno;
-
-	*close_on_exec = (ret & FD_CLOEXEC) ? 1 : 0;
-	return ret;
-}
-
 int os_set_exec_close(int fd)
 {
 	int err;
@@ -380,30 +367,27 @@ int os_pipe(int *fds, int stream, int close_on_exec)
 	return err;
 }
 
-int os_set_fd_async(int fd, int owner)
+int os_set_fd_async(int fd)
 {
-	int err;
+	int err, flags;
+
+	flags = fcntl(fd, F_GETFL);
+	if (flags < 0)
+		return -errno;
 
-	/* XXX This should do F_GETFL first */
-	if(fcntl(fd, F_SETFL, O_ASYNC | O_NONBLOCK) < 0){
+	flags |= O_ASYNC | O_NONBLOCK;
+	if (fcntl(fd, F_SETFL, flags) < 0) {
 		err = -errno;
 		printk("os_set_fd_async : failed to set O_ASYNC and "
 		       "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno);
 		return err;
 	}
-#ifdef notdef
-	if(fcntl(fd, F_SETFD, 1) < 0){
-		printk("os_set_fd_async : Setting FD_CLOEXEC failed, "
-		       "errno = %d\n", errno);
-	}
-#endif
 
-	if((fcntl(fd, F_SETSIG, SIGIO) < 0) ||
-	   (fcntl(fd, F_SETOWN, owner) < 0)){
+	if ((fcntl(fd, F_SETSIG, SIGIO) < 0) ||
+	    (fcntl(fd, F_SETOWN, os_getpid()) < 0)) {
 		err = -errno;
 		printk("os_set_fd_async : Failed to fcntl F_SETOWN "
-		       "(or F_SETSIG) fd %d to pid %d, errno = %d\n", fd,
-		       owner, errno);
+		       "(or F_SETSIG) fd %d, errno = %d\n", fd, errno);
 		return err;
 	}
 
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 6d56d15..bcf0c9b 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -60,7 +60,8 @@ static int ptrace_child(void)
 		 * the UML code itself.
 		 */
 		ret = 2;
-	_exit(ret);
+
+	exit(ret);
 }
 
 static void fatal_perror(const char *str)
-- 
cgit v1.1


From 1adfd6095e1c654dce5a692db5aa5a2b2a8d6b0d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:05 -0800
Subject: uml: style fixes in file.c

arch/um/os-Linux/file.c needed some style work -
	updated the copyright
	cleaned up the includes
	CodingStyle fixes
	added some missing CATCH_EINTRs
	os_set_owner was unused, so it is gone
	all printks now have severities
	fcntl(F_GETFL) was being called without checking the return
	removed an obsolete comment

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h    |   1 -
 arch/um/os-Linux/file.c | 232 ++++++++++++++++++++++++------------------------
 2 files changed, 118 insertions(+), 115 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index fdac1a5..daaafc8 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -131,7 +131,6 @@ extern int os_set_exec_close(int fd);
 extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg);
 extern int os_get_ifname(int fd, char *namebuf);
 extern int os_set_slip(int fd);
-extern int os_set_owner(int fd, int pid);
 extern int os_mode_fd(int fd, int mode);
 
 extern int os_seek_file(int fd, unsigned long long offset);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 4f547d7..d7404c6 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -8,13 +8,12 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
-#include <sys/uio.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include "kern_constants.h"
 #include "os.h"
 #include "user.h"
 
@@ -42,10 +41,10 @@ int os_stat_fd(const int fd, struct uml_stat *ubuf)
 	int err;
 
 	CATCH_EINTR(err = fstat64(fd, &sbuf));
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 
-	if(ubuf != NULL)
+	if (ubuf != NULL)
 		copy_stat(ubuf, &sbuf);
 	return err;
 }
@@ -55,27 +54,26 @@ int os_stat_file(const char *file_name, struct uml_stat *ubuf)
 	struct stat64 sbuf;
 	int err;
 
-	do {
-		err = stat64(file_name, &sbuf);
-	} while((err < 0) && (errno == EINTR)) ;
-
-	if(err < 0)
+	CATCH_EINTR(err = stat64(file_name, &sbuf));
+	if (err < 0)
 		return -errno;
 
-	if(ubuf != NULL)
+	if (ubuf != NULL)
 		copy_stat(ubuf, &sbuf);
 	return err;
 }
 
-int os_access(const char* file, int mode)
+int os_access(const char *file, int mode)
 {
 	int amode, err;
 
-	amode=(mode&OS_ACC_R_OK ? R_OK : 0) | (mode&OS_ACC_W_OK ? W_OK : 0) |
-	      (mode&OS_ACC_X_OK ? X_OK : 0) | (mode&OS_ACC_F_OK ? F_OK : 0) ;
+	amode = (mode & OS_ACC_R_OK ? R_OK : 0) |
+		(mode & OS_ACC_W_OK ? W_OK : 0) |
+		(mode & OS_ACC_X_OK ? X_OK : 0) |
+		(mode & OS_ACC_F_OK ? F_OK : 0);
 
 	err = access(file, amode);
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 
 	return 0;
@@ -87,7 +85,7 @@ int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg)
 	int err;
 
 	err = ioctl(fd, cmd, arg);
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 
 	return err;
@@ -96,7 +94,7 @@ int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg)
 /* FIXME: ensure namebuf in os_get_if_name is big enough */
 int os_get_ifname(int fd, char* namebuf)
 {
-	if(ioctl(fd, SIOCGIFNAME, namebuf) < 0)
+	if (ioctl(fd, SIOCGIFNAME, namebuf) < 0)
 		return -errno;
 
 	return 0;
@@ -107,37 +105,22 @@ int os_set_slip(int fd)
 	int disc, sencap;
 
 	disc = N_SLIP;
-	if(ioctl(fd, TIOCSETD, &disc) < 0)
+	if (ioctl(fd, TIOCSETD, &disc) < 0)
 		return -errno;
 
 	sencap = 0;
-	if(ioctl(fd, SIOCSIFENCAP, &sencap) < 0)
+	if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0)
 		return -errno;
 
 	return 0;
 }
 
-int os_set_owner(int fd, int pid)
-{
-	if(fcntl(fd, F_SETOWN, pid) < 0){
-		int save_errno = errno;
-
-		if(fcntl(fd, F_GETOWN, 0) != pid)
-			return -save_errno;
-	}
-
-	return 0;
-}
-
 int os_mode_fd(int fd, int mode)
 {
 	int err;
 
-	do {
-		err = fchmod(fd, mode);
-	} while((err < 0) && (errno==EINTR)) ;
-
-	if(err < 0)
+	CATCH_EINTR(err = fchmod(fd, mode));
+	if (err < 0)
 		return -errno;
 
 	return 0;
@@ -149,20 +132,20 @@ int os_file_type(char *file)
 	int err;
 
 	err = os_stat_file(file, &buf);
-	if(err < 0)
+	if (err < 0)
 		return err;
 
-	if(S_ISDIR(buf.ust_mode))
+	if (S_ISDIR(buf.ust_mode))
 		return OS_TYPE_DIR;
-	else if(S_ISLNK(buf.ust_mode))
+	else if (S_ISLNK(buf.ust_mode))
 		return OS_TYPE_SYMLINK;
-	else if(S_ISCHR(buf.ust_mode))
+	else if (S_ISCHR(buf.ust_mode))
 		return OS_TYPE_CHARDEV;
-	else if(S_ISBLK(buf.ust_mode))
+	else if (S_ISBLK(buf.ust_mode))
 		return OS_TYPE_BLOCKDEV;
-	else if(S_ISFIFO(buf.ust_mode))
+	else if (S_ISFIFO(buf.ust_mode))
 		return OS_TYPE_FIFO;
-	else if(S_ISSOCK(buf.ust_mode))
+	else if (S_ISSOCK(buf.ust_mode))
 		return OS_TYPE_SOCK;
 	else return OS_TYPE_FILE;
 }
@@ -174,15 +157,15 @@ int os_file_mode(const char *file, struct openflags *mode_out)
 	*mode_out = OPENFLAGS();
 
 	err = access(file, W_OK);
-	if(err && (errno != EACCES))
+	if (err && (errno != EACCES))
 		return -errno;
-	else if(!err)
+	else if (!err)
 		*mode_out = of_write(*mode_out);
 
 	err = access(file, R_OK);
-	if(err && (errno != EACCES))
+	if (err && (errno != EACCES))
 		return -errno;
-	else if(!err)
+	else if (!err)
 		*mode_out = of_read(*mode_out);
 
 	return err;
@@ -192,21 +175,28 @@ int os_open_file(const char *file, struct openflags flags, int mode)
 {
 	int fd, err, f = 0;
 
-	if(flags.r && flags.w) f = O_RDWR;
-	else if(flags.r) f = O_RDONLY;
-	else if(flags.w) f = O_WRONLY;
+	if (flags.r && flags.w)
+		f = O_RDWR;
+	else if (flags.r)
+		f = O_RDONLY;
+	else if (flags.w)
+		f = O_WRONLY;
 	else f = 0;
 
-	if(flags.s) f |= O_SYNC;
-	if(flags.c) f |= O_CREAT;
-	if(flags.t) f |= O_TRUNC;
-	if(flags.e) f |= O_EXCL;
+	if (flags.s)
+		f |= O_SYNC;
+	if (flags.c)
+		f |= O_CREAT;
+	if (flags.t)
+		f |= O_TRUNC;
+	if (flags.e)
+		f |= O_EXCL;
 
 	fd = open64(file, f, mode);
-	if(fd < 0)
+	if (fd < 0)
 		return -errno;
 
-	if(flags.cl && fcntl(fd, F_SETFD, 1)){
+	if (flags.cl && fcntl(fd, F_SETFD, 1)) {
 		err = -errno;
 		close(fd);
 		return err;
@@ -224,13 +214,13 @@ int os_connect_socket(const char *name)
 	snprintf(sock.sun_path, sizeof(sock.sun_path), "%s", name);
 
 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
-	if(fd < 0) {
+	if (fd < 0) {
 		err = -errno;
 		goto out;
 	}
 
 	err = connect(fd, (struct sockaddr *) &sock, sizeof(sock));
-	if(err) {
+	if (err) {
 		err = -errno;
 		goto out_close;
 	}
@@ -253,7 +243,7 @@ int os_seek_file(int fd, unsigned long long offset)
 	unsigned long long actual;
 
 	actual = lseek64(fd, offset, SEEK_SET);
-	if(actual != offset)
+	if (actual != offset)
 		return -errno;
 	return 0;
 }
@@ -262,7 +252,7 @@ int os_read_file(int fd, void *buf, int len)
 {
 	int n = read(fd, buf, len);
 
-	if(n < 0)
+	if (n < 0)
 		return -errno;
 	return n;
 }
@@ -271,7 +261,7 @@ int os_write_file(int fd, const void *buf, int len)
 {
 	int n = write(fd, (void *) buf, len);
 
-	if(n < 0)
+	if (n < 0)
 		return -errno;
 	return n;
 }
@@ -282,26 +272,27 @@ int os_file_size(const char *file, unsigned long long *size_out)
 	int err;
 
 	err = os_stat_file(file, &buf);
-	if(err < 0){
-		printk("Couldn't stat \"%s\" : err = %d\n", file, -err);
+	if (err < 0) {
+		printk(UM_KERN_ERR "Couldn't stat \"%s\" : err = %d\n", file,
+		       -err);
 		return err;
 	}
 
-	if(S_ISBLK(buf.ust_mode)){
+	if (S_ISBLK(buf.ust_mode)) {
 		int fd;
 		long blocks;
 
 		fd = open(file, O_RDONLY, 0);
-		if(fd < 0) {
+		if (fd < 0) {
 			err = -errno;
-			printk("Couldn't open \"%s\", errno = %d\n", file,
-			       errno);
+			printk(UM_KERN_ERR "Couldn't open \"%s\", "
+			       "errno = %d\n", file, errno);
 			return err;
 		}
-		if(ioctl(fd, BLKGETSIZE, &blocks) < 0){
+		if (ioctl(fd, BLKGETSIZE, &blocks) < 0) {
 			err = -errno;
-			printk("Couldn't get the block size of \"%s\", "
-			       "errno = %d\n", file, errno);
+			printk(UM_KERN_ERR "Couldn't get the block size of "
+			       "\"%s\", errno = %d\n", file, errno);
 			close(fd);
 			return err;
 		}
@@ -319,8 +310,9 @@ int os_file_modtime(const char *file, unsigned long *modtime)
 	int err;
 
 	err = os_stat_file(file, &buf);
-	if(err < 0){
-		printk("Couldn't stat \"%s\" : err = %d\n", file, -err);
+	if (err < 0) {
+		printk(UM_KERN_ERR "Couldn't stat \"%s\" : err = %d\n", file,
+		       -err);
 		return err;
 	}
 
@@ -334,7 +326,7 @@ int os_set_exec_close(int fd)
 
 	CATCH_EINTR(err = fcntl(fd, F_SETFD, FD_CLOEXEC));
 
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 	return err;
 }
@@ -344,24 +336,25 @@ int os_pipe(int *fds, int stream, int close_on_exec)
 	int err, type = stream ? SOCK_STREAM : SOCK_DGRAM;
 
 	err = socketpair(AF_UNIX, type, 0, fds);
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 
-	if(!close_on_exec)
+	if (!close_on_exec)
 		return 0;
 
 	err = os_set_exec_close(fds[0]);
-	if(err < 0)
+	if (err < 0)
 		goto error;
 
 	err = os_set_exec_close(fds[1]);
-	if(err < 0)
+	if (err < 0)
 		goto error;
 
 	return 0;
 
  error:
-	printk("os_pipe : Setting FD_CLOEXEC failed, err = %d\n", -err);
+	printk(UM_KERN_ERR "os_pipe : Setting FD_CLOEXEC failed, err = %d\n",
+	       -err);
 	close(fds[1]);
 	close(fds[0]);
 	return err;
@@ -378,15 +371,15 @@ int os_set_fd_async(int fd)
 	flags |= O_ASYNC | O_NONBLOCK;
 	if (fcntl(fd, F_SETFL, flags) < 0) {
 		err = -errno;
-		printk("os_set_fd_async : failed to set O_ASYNC and "
-		       "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno);
+		printk(UM_KERN_ERR "os_set_fd_async : failed to set O_ASYNC "
+		       "and O_NONBLOCK on fd # %d, errno = %d\n", fd, errno);
 		return err;
 	}
 
 	if ((fcntl(fd, F_SETSIG, SIGIO) < 0) ||
 	    (fcntl(fd, F_SETOWN, os_getpid()) < 0)) {
 		err = -errno;
-		printk("os_set_fd_async : Failed to fcntl F_SETOWN "
+		printk(UM_KERN_ERR "os_set_fd_async : Failed to fcntl F_SETOWN "
 		       "(or F_SETSIG) fd %d, errno = %d\n", fd, errno);
 		return err;
 	}
@@ -396,10 +389,14 @@ int os_set_fd_async(int fd)
 
 int os_clear_fd_async(int fd)
 {
-	int flags = fcntl(fd, F_GETFL);
+	int flags;
+
+	flags = fcntl(fd, F_GETFL);
+	if (flags < 0)
+		return -errno;
 
 	flags &= ~(O_ASYNC | O_NONBLOCK);
-	if(fcntl(fd, F_SETFL, flags) < 0)
+	if (fcntl(fd, F_SETFL, flags) < 0)
 		return -errno;
 	return 0;
 }
@@ -409,11 +406,15 @@ int os_set_fd_block(int fd, int blocking)
 	int flags;
 
 	flags = fcntl(fd, F_GETFL);
+	if (flags < 0)
+		return -errno;
 
-	if(blocking) flags &= ~O_NONBLOCK;
-	else flags |= O_NONBLOCK;
+	if (blocking)
+		flags &= ~O_NONBLOCK;
+	else
+		flags |= O_NONBLOCK;
 
-	if(fcntl(fd, F_SETFL, flags) < 0)
+	if (fcntl(fd, F_SETFL, flags) < 0)
 		return -errno;
 
 	return 0;
@@ -424,7 +425,7 @@ int os_accept_connection(int fd)
 	int new;
 
 	new = accept(fd, NULL, 0);
-	if(new < 0)
+	if (new < 0)
 		return -errno;
 	return new;
 }
@@ -445,15 +446,17 @@ int os_shutdown_socket(int fd, int r, int w)
 {
 	int what, err;
 
-	if(r && w) what = SHUT_RDWR;
-	else if(r) what = SHUT_RD;
-	else if(w) what = SHUT_WR;
-	else {
-		printk("os_shutdown_socket : neither r or w was set\n");
+	if (r && w)
+		what = SHUT_RDWR;
+	else if (r)
+		what = SHUT_RD;
+	else if (w)
+		what = SHUT_WR;
+	else
 		return -EINVAL;
-	}
+
 	err = shutdown(fd, what);
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 	return 0;
 }
@@ -477,19 +480,20 @@ int os_rcv_fd(int fd, int *helper_pid_out)
 	msg.msg_flags = 0;
 
 	n = recvmsg(fd, &msg, 0);
-	if(n < 0)
+	if (n < 0)
 		return -errno;
-	else if(n != iov.iov_len)
+	else if (n != iov.iov_len)
 		*helper_pid_out = -1;
 
 	cmsg = CMSG_FIRSTHDR(&msg);
-	if(cmsg == NULL){
-		printk("rcv_fd didn't receive anything, error = %d\n", errno);
+	if (cmsg == NULL) {
+		printk(UM_KERN_ERR "rcv_fd didn't receive anything, "
+		       "error = %d\n", errno);
 		return -1;
 	}
-	if((cmsg->cmsg_level != SOL_SOCKET) ||
-	   (cmsg->cmsg_type != SCM_RIGHTS)){
-		printk("rcv_fd didn't receive a descriptor\n");
+	if ((cmsg->cmsg_level != SOL_SOCKET) ||
+	    (cmsg->cmsg_type != SCM_RIGHTS)) {
+		printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n");
 		return -1;
 	}
 
@@ -503,23 +507,22 @@ int os_create_unix_socket(const char *file, int len, int close_on_exec)
 	int sock, err;
 
 	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
-	if(sock < 0)
+	if (sock < 0)
 		return -errno;
 
-	if(close_on_exec) {
+	if (close_on_exec) {
 		err = os_set_exec_close(sock);
-		if(err < 0)
-			printk("create_unix_socket : close_on_exec failed, "
-		       "err = %d", -err);
+		if (err < 0)
+			printk(UM_KERN_ERR "create_unix_socket : "
+			       "close_on_exec failed, err = %d", -err);
 	}
 
 	addr.sun_family = AF_UNIX;
 
-	/* XXX Be more careful about overflow */
 	snprintf(addr.sun_path, len, "%s", file);
 
 	err = bind(sock, (struct sockaddr *) &addr, sizeof(addr));
-	if(err < 0)
+	if (err < 0)
 		return -errno;
 
 	return sock;
@@ -540,17 +543,18 @@ int os_lock_file(int fd, int excl)
 	int err, save;
 
 	err = fcntl(fd, F_SETLK, &lock);
-	if(!err)
+	if (!err)
 		goto out;
 
 	save = -errno;
 	err = fcntl(fd, F_GETLK, &lock);
-	if(err){
+	if (err) {
 		err = -errno;
 		goto out;
 	}
 
-	printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid);
+	printk(UM_KERN_ERR "F_SETLK failed, file already locked by pid %d\n",
+	       lock.l_pid);
 	err = save;
  out:
 	return err;
-- 
cgit v1.1


From 909e90d3c410b684e564729145f7c20dad887757 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:06 -0800
Subject: uml: 64-bit tlb fixes

Some 64-bit tlb fixes -
	moved pmd_page_vaddr to pgtable.h since it's the same for both
2-level and 3-level page tables
	fixed a bogus cast on pud_page_vaddr
	made the address checking in update_*_range more careful

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/tlb.c            | 8 ++++----
 include/asm-um/pgtable-2level.h | 3 ---
 include/asm-um/pgtable-3level.h | 3 +--
 include/asm-um/pgtable.h        | 3 +++
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index ef5a2a2..8127ca8 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -207,7 +207,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
 		else if (pte_newprot(*pte))
 			ret = add_mprotect(addr, PAGE_SIZE, prot, hvc);
 		*pte = pte_mkuptodate(*pte);
-	} while (pte++, addr += PAGE_SIZE, ((addr != end) && !ret));
+	} while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret));
 	return ret;
 }
 
@@ -229,7 +229,7 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr,
 			}
 		}
 		else ret = update_pte_range(pmd, addr, next, hvc);
-	} while (pmd++, addr = next, ((addr != end) && !ret));
+	} while (pmd++, addr = next, ((addr < end) && !ret));
 	return ret;
 }
 
@@ -251,7 +251,7 @@ static inline int update_pud_range(pgd_t *pgd, unsigned long addr,
 			}
 		}
 		else ret = update_pmd_range(pud, addr, next, hvc);
-	} while (pud++, addr = next, ((addr != end) && !ret));
+	} while (pud++, addr = next, ((addr < end) && !ret));
 	return ret;
 }
 
@@ -274,7 +274,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 			}
 		}
 		else ret = update_pud_range(pgd, addr, next, &hvc);
-	} while (pgd++, addr = next, ((addr != end_addr) && !ret));
+	} while (pgd++, addr = next, ((addr < end_addr) && !ret));
 
 	if (!ret)
 		ret = do_ops(&hvc, hvc.index, 1);
diff --git a/include/asm-um/pgtable-2level.h b/include/asm-um/pgtable-2level.h
index 172a75f..f534b73 100644
--- a/include/asm-um/pgtable-2level.h
+++ b/include/asm-um/pgtable-2level.h
@@ -41,9 +41,6 @@ static inline void pgd_mkuptodate(pgd_t pgd)	{ }
 #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot))
 #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot))
 
-#define pmd_page_vaddr(pmd) \
-	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
-
 /*
  * Bits 0 through 4 are taken
  */
diff --git a/include/asm-um/pgtable-3level.h b/include/asm-um/pgtable-3level.h
index 48f8f5d..0446f45 100644
--- a/include/asm-um/pgtable-3level.h
+++ b/include/asm-um/pgtable-3level.h
@@ -87,8 +87,7 @@ static inline void pud_clear (pud_t *pud)
 }
 
 #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK)
-#define pud_page_vaddr(pud) \
-	((struct page *) __va(pud_val(pud) & PAGE_MASK))
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
 
 /* Find an entry in the second-level page table.. */
 #define pmd_offset(pud, address) ((pmd_t *) pud_page_vaddr(*(pud)) + \
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index fb47777..4102b44 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -308,6 +308,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 
+#define pmd_page_vaddr(pmd) \
+	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+
 /*
  * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
  *
-- 
cgit v1.1


From 0b4e273fb83bce5dd8e166a4defb16ebdd215abf Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:07 -0800
Subject: uml: customize tlb.h

Customize the hooks in tlb.h to optimize TLB flushing some more.

Add start and end fields to tlb_gather_mmu, which are used to limit
the address space range scanned when a region is unmapped.

The interfaces which just free page tables, without actually changing
mappings, don't need to cause a TLB flush.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/tlb.c |  25 ++++++++---
 include/asm-um/tlb.h | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 8127ca8..0b6a77d 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -193,18 +193,18 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_young(*pte)) {
 			r = 0;
 			w = 0;
-		} else if (!pte_dirty(*pte)) {
+		} else if (!pte_dirty(*pte))
 			w = 0;
-		}
+
 		prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
 			(x ? UM_PROT_EXEC : 0));
 		if (hvc->force || pte_newpage(*pte)) {
 			if (pte_present(*pte))
 				ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK,
 					       PAGE_SIZE, prot, hvc);
-			else ret = add_munmap(addr, PAGE_SIZE, hvc);
-		}
-		else if (pte_newprot(*pte))
+			else
+				ret = add_munmap(addr, PAGE_SIZE, hvc);
+		} else if (pte_newprot(*pte))
 			ret = add_mprotect(addr, PAGE_SIZE, prot, hvc);
 		*pte = pte_mkuptodate(*pte);
 	} while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret));
@@ -500,7 +500,8 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	else fix_range(vma->vm_mm, start, end, 0);
 }
 
-void flush_tlb_mm(struct mm_struct *mm)
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end)
 {
 	/*
 	 * Don't bother flushing if this address space is about to be
@@ -509,7 +510,17 @@ void flush_tlb_mm(struct mm_struct *mm)
 	if (atomic_read(&mm->mm_users) == 0)
 		return;
 
-	fix_range(mm, 0, TASK_SIZE, 0);
+	fix_range(mm, start, end, 0);
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma = mm->mmap;
+
+	while (vma != NULL) {
+		fix_range(mm, vma->vm_start, vma->vm_end, 0);
+		vma = vma->vm_next;
+	}
 }
 
 void force_flush_all(void)
diff --git a/include/asm-um/tlb.h b/include/asm-um/tlb.h
index c640033..39fc475 100644
--- a/include/asm-um/tlb.h
+++ b/include/asm-um/tlb.h
@@ -1,6 +1,126 @@
 #ifndef __UM_TLB_H
 #define __UM_TLB_H
 
-#include <asm/arch/tlb.h>
+#include <linux/swap.h>
+#include <asm/percpu.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+/* struct mmu_gather is an opaque type used by the mm code for passing around
+ * any data needed by arch specific code for tlb_remove_page.
+ */
+struct mmu_gather {
+	struct mm_struct	*mm;
+	unsigned int		need_flush; /* Really unmapped some ptes? */
+	unsigned long		start;
+	unsigned long		end;
+	unsigned int		fullmm; /* non-zero means full mm flush */
+};
+
+/* Users of the generic TLB shootdown code must declare this storage space. */
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
+					  unsigned long address)
+{
+	if (tlb->start > address)
+		tlb->start = address;
+	if (tlb->end < address + PAGE_SIZE)
+		tlb->end = address + PAGE_SIZE;
+}
+
+static inline void init_tlb_gather(struct mmu_gather *tlb)
+{
+	tlb->need_flush = 0;
+
+	tlb->start = TASK_SIZE;
+	tlb->end = 0;
+
+	if (tlb->fullmm) {
+		tlb->start = 0;
+		tlb->end = TASK_SIZE;
+	}
+}
+
+/* tlb_gather_mmu
+ *	Return a pointer to an initialized struct mmu_gather.
+ */
+static inline struct mmu_gather *
+tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+{
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+
+	tlb->mm = mm;
+	tlb->fullmm = full_mm_flush;
+
+	init_tlb_gather(tlb);
+
+	return tlb;
+}
+
+extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+			       unsigned long end);
+
+static inline void
+tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+	if (!tlb->need_flush)
+		return;
+
+	flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end);
+	init_tlb_gather(tlb);
+}
+
+/* tlb_finish_mmu
+ *	Called at the end of the shootdown operation to free up any resources
+ *	that were required.
+ */
+static inline void
+tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+	tlb_flush_mmu(tlb, start, end);
+
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
+}
+
+/* tlb_remove_page
+ *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)),
+ *	while handling the additional races in SMP caused by other CPUs
+ *	caching valid mappings in their TLBs.
+ */
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	tlb->need_flush = 1;
+	free_page_and_swap_cache(page);
+	return;
+}
+
+/**
+ * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
+ *
+ * Record the fact that pte's were really umapped in ->need_flush, so we can
+ * later optimise away the tlb invalidate.   This helps when userspace is
+ * unmapping already-unmapped pages, which happens quite a lot.
+ */
+#define tlb_remove_tlb_entry(tlb, ptep, address)		\
+	do {							\
+		tlb->need_flush = 1;				\
+		__tlb_remove_tlb_entry(tlb, ptep, address);	\
+	} while (0)
+
+#define pte_free_tlb(tlb, ptep) __pte_free_tlb(tlb, ptep)
+
+#define pud_free_tlb(tlb, pudp) __pud_free_tlb(tlb, pudp)
+
+#define pmd_free_tlb(tlb, pmdp) __pmd_free_tlb(tlb, pmdp)
+
+#define tlb_migrate_finish(mm) do {} while (0)
 
 #endif
-- 
cgit v1.1


From 8efa3c9d545ab6adc5c5e001cbd7aee60909b3da Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:07 -0800
Subject: uml: eliminate setjmp_wrapper

setjmp_wrapper existed to provide setjmp to kernel code when UML used libc's
setjmp and longjmp.  Now that UML has its own implementation, this isn't
needed and kernel code can invoke setjmp directly.

do_buffer_op is massively cleaned up since it is no longer a callback from
setjmp_wrapper and given a va_list from which it must extract its arguments.

The actual setjmp is moved from buffer_op to do_op_one_page because the copy
operation is inside an atomic section (kmap_atomic to kunmap_atomic) and it
shouldn't be longjmp-ed out of.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h          |  1 -
 arch/um/kernel/skas/uaccess.c | 73 ++++++++++++++++---------------------------
 arch/um/os-Linux/util.c       | 15 ---------
 3 files changed, 27 insertions(+), 62 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index daaafc8..c8684b6 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -235,7 +235,6 @@ extern void stack_protections(unsigned long address);
 extern int raw(int fd);
 extern void setup_machinename(char *machine_out);
 extern void setup_hostinfo(char *buf, int len);
-extern int setjmp_wrapper(void (*proc)(void *, void *), ...);
 extern void os_dump_core(void) __attribute__ ((noreturn));
 
 /* time.c */
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 05b41dbc..e22c969 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -58,9 +58,10 @@ static pte_t *maybe_map(unsigned long virt, int is_write)
 static int do_op_one_page(unsigned long addr, int len, int is_write,
 		 int (*op)(unsigned long addr, int len, void *arg), void *arg)
 {
+	jmp_buf buf;
 	struct page *page;
 	pte_t *pte;
-	int n;
+	int n, faulted;
 
 	pte = maybe_map(addr, is_write);
 	if (pte == NULL)
@@ -70,82 +71,62 @@ static int do_op_one_page(unsigned long addr, int len, int is_write,
 	addr = (unsigned long) kmap_atomic(page, KM_UML_USERCOPY) +
 		(addr & ~PAGE_MASK);
 
-	n = (*op)(addr, len, arg);
+	current->thread.fault_catcher = &buf;
+
+	faulted = UML_SETJMP(&buf);
+	if (faulted == 0)
+		n = (*op)(addr, len, arg);
+	else
+		n = -1;
+
+	current->thread.fault_catcher = NULL;
 
 	kunmap_atomic(page, KM_UML_USERCOPY);
 
 	return n;
 }
 
-static void do_buffer_op(void *jmpbuf, void *arg_ptr)
+static int buffer_op(unsigned long addr, int len, int is_write,
+		     int (*op)(unsigned long, int, void *), void *arg)
 {
-	va_list args;
-	unsigned long addr;
-	int len, is_write, size, remain, n;
-	int (*op)(unsigned long, int, void *);
-	void *arg;
-	int *res;
-
-	va_copy(args, *(va_list *)arg_ptr);
-	addr = va_arg(args, unsigned long);
-	len = va_arg(args, int);
-	is_write = va_arg(args, int);
-	op = va_arg(args, void *);
-	arg = va_arg(args, void *);
-	res = va_arg(args, int *);
-	va_end(args);
+	int size, remain, n;
+
 	size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
 	remain = len;
 
-	current->thread.fault_catcher = jmpbuf;
 	n = do_op_one_page(addr, size, is_write, op, arg);
 	if (n != 0) {
-		*res = (n < 0 ? remain : 0);
+		remain = (n < 0 ? remain : 0);
 		goto out;
 	}
 
 	addr += size;
 	remain -= size;
-	if (remain == 0) {
-		*res = 0;
+	if (remain == 0)
 		goto out;
-	}
 
-	while(addr < ((addr + remain) & PAGE_MASK)) {
+	while (addr < ((addr + remain) & PAGE_MASK)) {
 		n = do_op_one_page(addr, PAGE_SIZE, is_write, op, arg);
 		if (n != 0) {
-			*res = (n < 0 ? remain : 0);
+			remain = (n < 0 ? remain : 0);
 			goto out;
 		}
 
 		addr += PAGE_SIZE;
 		remain -= PAGE_SIZE;
 	}
-	if (remain == 0) {
-		*res = 0;
+	if (remain == 0)
 		goto out;
-	}
 
 	n = do_op_one_page(addr, remain, is_write, op, arg);
-	if (n != 0)
-		*res = (n < 0 ? remain : 0);
-	else *res = 0;
- out:
-	current->thread.fault_catcher = NULL;
-}
-
-static int buffer_op(unsigned long addr, int len, int is_write,
-		     int (*op)(unsigned long addr, int len, void *arg),
-		     void *arg)
-{
-	int faulted, res;
-
-	faulted = setjmp_wrapper(do_buffer_op, addr, len, is_write, op, arg,
-				 &res);
-	if (!faulted)
-		return res;
+	if (n != 0) {
+		remain = (n < 0 ? remain : 0);
+		goto out;
+	}
 
-	return addr + len - (unsigned long) current->thread.fault_addr;
+	return 0;
+ out:
+	return remain;
 }
 
 static int copy_chunk_from_user(unsigned long from, int len, void *arg)
diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index 3e058ce..a6f31d4 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -88,21 +88,6 @@ void setup_hostinfo(char *buf, int len)
 		 host.release, host.version, host.machine);
 }
 
-int setjmp_wrapper(void (*proc)(void *, void *), ...)
-{
-	va_list args;
-	jmp_buf buf;
-	int n;
-
-	n = UML_SETJMP(&buf);
-	if(n == 0){
-		va_start(args, proc);
-		(*proc)(&buf, &args);
-	}
-	va_end(args);
-	return n;
-}
-
 void os_dump_core(void)
 {
 	int pid;
-- 
cgit v1.1


From 0983a88b9f0ceffb2116ce92c7b273ce2aec7b93 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:08 -0800
Subject: uml: install panic notifier earlier

It turns out that if there's a panic early enough, UML will just sit there in
the LED-blinking loop because the panic notifier hadn't been installed yet.

This patch installs it earlier.

It also fixes the problem which exposed the hang, namely that if you give UML
a zero-sized initrd, it will ask alloc_bootmem for zero bytes, and that will
cause the panic.

While I was in initrd.c, I gave it a style makeover.

Prompted by checkpatch, I moved a couple extern declarations of uml_exitcode
to kern_util.h.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/kern_util.h |  2 ++
 arch/um/kernel/initrd.c     | 29 ++++++++++++++++++++---------
 arch/um/kernel/um_arch.c    | 41 ++++++++++++++++++++---------------------
 arch/um/os-Linux/main.c     |  2 --
 4 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index 625ca29..715ffb5 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -9,6 +9,8 @@
 #include "sysdep/ptrace.h"
 #include "sysdep/faultinfo.h"
 
+extern int uml_exitcode;
+
 extern int ncpus;
 extern int kmalloc_ok;
 extern int nsyscalls;
diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c
index ae31c62..fa01556 100644
--- a/arch/um/kernel/initrd.c
+++ b/arch/um/kernel/initrd.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -20,18 +20,27 @@ static int __init read_initrd(void)
 	long long size;
 	int err;
 
-	if(initrd == NULL)
+	if (initrd == NULL)
 		return 0;
 
 	err = os_file_size(initrd, &size);
-	if(err)
+	if (err)
 		return 0;
 
+	/*
+	 * This is necessary because alloc_bootmem craps out if you
+	 * ask for no memory.
+	 */
+	if (size == 0) {
+		printk(KERN_ERR "\"%\" is a zero-size initrd\n");
+		return 0;
+	}
+
 	area = alloc_bootmem(size);
-	if(area == NULL)
+	if (area == NULL)
 		return 0;
 
-	if(load_initrd(initrd, area, size) == -1)
+	if (load_initrd(initrd, area, size) == -1)
 		return 0;
 
 	initrd_start = (unsigned long) area;
@@ -58,13 +67,15 @@ int load_initrd(char *filename, void *buf, int size)
 	int fd, n;
 
 	fd = os_open_file(filename, of_read(OPENFLAGS()), 0);
-	if(fd < 0){
-		printk("Opening '%s' failed - err = %d\n", filename, -fd);
+	if (fd < 0) {
+		printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename,
+		       -fd);
 		return -1;
 	}
 	n = os_read_file(fd, buf, size);
-	if(n != size){
-		printk("Read of %d bytes from '%s' failed, err = %d\n", size,
+	if (n != size) {
+		printk(KERN_ERR "Read of %d bytes from '%s' failed, "
+		       "err = %d\n", size,
 		       filename, -n);
 		return -1;
 	}
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 9ba39ab..cb7eef8 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -223,6 +223,23 @@ static void __init uml_postsetup(void)
 	return;
 }
 
+static int panic_exit(struct notifier_block *self, unsigned long unused1,
+		      void *unused2)
+{
+	bust_spinlocks(1);
+	show_regs(&(current->thread.regs));
+	bust_spinlocks(0);
+	uml_exitcode = 1;
+	os_dump_core();
+	return 0;
+}
+
+static struct notifier_block panic_exit_notifier = {
+	.notifier_call 		= panic_exit,
+	.next 			= NULL,
+	.priority 		= 0
+};
+
 /* Set during early boot */
 unsigned long brk_start;
 unsigned long end_iomem;
@@ -327,6 +344,9 @@ int __init linux_main(int argc, char **argv)
 		printf("Kernel virtual memory size shrunk to %lu bytes\n",
 		       virtmem_size);
 
+	atomic_notifier_chain_register(&panic_notifier_list,
+				       &panic_exit_notifier);
+
 	uml_postsetup();
 
 	stack_protections((unsigned long) &init_thread_info);
@@ -335,29 +355,8 @@ int __init linux_main(int argc, char **argv)
 	return start_uml();
 }
 
-extern int uml_exitcode;
-
-static int panic_exit(struct notifier_block *self, unsigned long unused1,
-		      void *unused2)
-{
-	bust_spinlocks(1);
-	show_regs(&(current->thread.regs));
-	bust_spinlocks(0);
-	uml_exitcode = 1;
-	os_dump_core();
-	return 0;
-}
-
-static struct notifier_block panic_exit_notifier = {
-	.notifier_call 		= panic_exit,
-	.next 			= NULL,
-	.priority 		= 0
-};
-
 void __init setup_arch(char **cmdline_p)
 {
-	atomic_notifier_chain_register(&panic_notifier_list,
-			&panic_exit_notifier);
 	paging_init();
 	strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index de664e7..abb9b0f 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -111,8 +111,6 @@ static void setup_env_path(void)
 	}
 }
 
-extern int uml_exitcode;
-
 extern void scan_elf_aux( char **envp);
 
 int __init main(int argc, char **argv, char **envp)
-- 
cgit v1.1


From fce8c41c9f68b9af36f3076bae8f1d469a6e7aab Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:09 -0800
Subject: uml: use barrier() instead of mb()

signals_enabled and pending have requirements on the order in which they are
modified.  This used to be done by declaring them volatile and putting an mb()
where the ordering requirements were in effect.

After getting a better (I hope) understanding of how to do this correctly, the
volatile declarations are gone and the mb()'s replaced by barrier()'s.

One of the mb()'s was deleted because I see no problematic writes that could
be re-ordered past that point.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/signal.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 7ff8f57..62a66f3 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -15,6 +15,9 @@
 #include "sysdep/sigcontext.h"
 #include "user.h"
 
+/* Copied from linux/compiler-gcc.h since we can't include it directly */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
 /*
  * These are the asynchronous signals.  SIGPROF is excluded because we want to
  * be able to profile all of UML, not just the non-critical sections.  If
@@ -27,13 +30,8 @@
 #define SIGVTALRM_BIT 1
 #define SIGVTALRM_MASK (1 << SIGVTALRM_BIT)
 
-/*
- * These are used by both the signal handlers and
- * block/unblock_signals.  I don't want modifications cached in a
- * register - they must go straight to memory.
- */
-static volatile int signals_enabled = 1;
-static volatile int pending = 0;
+static int signals_enabled;
+static unsigned int pending;
 
 void sig_handler(int sig, struct sigcontext *sc)
 {
@@ -198,7 +196,7 @@ void block_signals(void)
 	 * This might matter if gcc figures out how to inline this and
 	 * decides to shuffle this code into the caller.
 	 */
-	mb();
+	barrier();
 }
 
 void unblock_signals(void)
@@ -224,21 +222,11 @@ void unblock_signals(void)
 		 * Setting signals_enabled and reading pending must
 		 * happen in this order.
 		 */
-		mb();
+		barrier();
 
 		save_pending = pending;
-		if (save_pending == 0) {
-			/*
-			 * This must return with signals enabled, so
-			 * this barrier ensures that writes are
-			 * flushed out before the return.  This might
-			 * matter if gcc figures out how to inline
-			 * this (unlikely, given its size) and decides
-			 * to shuffle this code into the caller.
-			 */
-			mb();
+		if (save_pending == 0)
 			return;
-		}
 
 		pending = 0;
 
-- 
cgit v1.1


From 1aa351a308d2c3ddb92b6cc45083fc54271d0010 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:10 -0800
Subject: uml: tidy helper code

Style fixes to arch/um/os/helper.c and tidying up the breakpoint fix a
bit.

helper.c gets all the usual style fixes -
	 updated copyright
	 all printks get severities

Also -
	 errval changes to err in helper_child
	 fixed an obsolete comment
	 run_helper was killing a child process which is guaranteed to
be dead or dying anyway

Removed the nohang and pname arguments from helper_wait and fixed the
declaration and callers.  nohang was used only in the slirp driver and
I don't think it was needed.  I think pname was a bit of overkill in
putting out an error message when something goes wrong.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/net_user.c               |  2 +-
 arch/um/drivers/slip_user.c              |  2 +-
 arch/um/drivers/slirp_user.c             |  2 +-
 arch/um/include/os.h                     |  2 +-
 arch/um/os-Linux/drivers/ethertap_user.c |  2 +-
 arch/um/os-Linux/drivers/tuntap_user.c   |  2 +-
 arch/um/os-Linux/helper.c                | 74 +++++++++++++-------------------
 7 files changed, 36 insertions(+), 50 deletions(-)

diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c
index 29185ca..abf2653 100644
--- a/arch/um/drivers/net_user.c
+++ b/arch/um/drivers/net_user.c
@@ -201,7 +201,7 @@ static int change_tramp(char **argv, char *output, int output_len)
 	close(fds[1]);
 
 	if (pid > 0)
-		helper_wait(pid, 0, "change_tramp");
+		helper_wait(pid);
 	return pid;
 }
 
diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c
index b8711e5..8b80505 100644
--- a/arch/um/drivers/slip_user.c
+++ b/arch/um/drivers/slip_user.c
@@ -109,7 +109,7 @@ static int slip_tramp(char **argv, int fd)
 	read_output(fds[0], output, output_len);
 	printk("%s", output);
 
-	err = helper_wait(pid, 0, argv[0]);
+	err = helper_wait(pid);
 	close(fds[0]);
 
 out_free:
diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c
index 89c1be2..a0ada8f 100644
--- a/arch/um/drivers/slirp_user.c
+++ b/arch/um/drivers/slirp_user.c
@@ -98,7 +98,7 @@ static void slirp_close(int fd, void *data)
 		       "(%d)\n", pri->pid, errno);
 	}
 #endif
-	err = helper_wait(pri->pid, 1, "slirp_close");
+	err = helper_wait(pri->pid);
 	if (err < 0)
 		return;
 
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index c8684b6..bce2881 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -207,7 +207,7 @@ extern int execvp_noalloc(char *buf, const char *file, char *const argv[]);
 extern int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv);
 extern int run_helper_thread(int (*proc)(void *), void *arg,
 			     unsigned int flags, unsigned long *stack_out);
-extern int helper_wait(int pid, int nohang, char *pname);
+extern int helper_wait(int pid);
 
 
 /* tls.c */
diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c
index 07ca0cb..6fb0b17 100644
--- a/arch/um/os-Linux/drivers/ethertap_user.c
+++ b/arch/um/os-Linux/drivers/ethertap_user.c
@@ -131,7 +131,7 @@ static int etap_tramp(char *dev, char *gate, int control_me,
 	}
 	if (c != 1) {
 		printk(UM_KERN_ERR "etap_tramp : uml_net failed\n");
-		err = helper_wait(pid, 0, "uml_net");
+		err = helper_wait(pid);
 	}
 	return err;
 }
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
index 7739e29..2448be0 100644
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ b/arch/um/os-Linux/drivers/tuntap_user.c
@@ -108,7 +108,7 @@ static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
 		       "errno = %d\n", errno);
 		return err;
 	}
-	helper_wait(pid, 0, "tuntap_open_tramp");
+	helper_wait(pid);
 
 	cmsg = CMSG_FIRSTHDR(&msg);
 	if (cmsg == NULL) {
diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c
index fba3f0f..f4bd349 100644
--- a/arch/um/os-Linux/helper.c
+++ b/arch/um/os-Linux/helper.c
@@ -1,22 +1,19 @@
 /*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sched.h>
-#include <limits.h>
-#include <sys/signal.h>
-#include <sys/wait.h>
 #include <sys/socket.h>
-#include "user.h"
+#include <sys/wait.h>
+#include "kern_constants.h"
 #include "kern_util.h"
 #include "os.h"
 #include "um_malloc.h"
-#include "kern_constants.h"
+#include "user.h"
 
 struct helper_data {
 	void (*pre_exec)(void*);
@@ -30,21 +27,19 @@ static int helper_child(void *arg)
 {
 	struct helper_data *data = arg;
 	char **argv = data->argv;
-	int errval;
+	int err;
 
 	if (data->pre_exec != NULL)
 		(*data->pre_exec)(data->pre_data);
-	errval = execvp_noalloc(data->buf, argv[0], argv);
-	printk("helper_child - execvp of '%s' failed - errno = %d\n", argv[0],
-	       -errval);
-	write(data->fd, &errval, sizeof(errval));
-	kill(os_getpid(), SIGKILL);
+	err = execvp_noalloc(data->buf, argv[0], argv);
+
+	/* If the exec succeeds, we don't get here */
+	write(data->fd, &err, sizeof(err));
+
 	return 0;
 }
 
-/* Returns either the pid of the child process we run or -E* on failure.
- * XXX The alloc_stack here breaks if this is called in the tracing thread, so
- * we need to receive a preallocated stack (a local buffer is ok). */
+/* Returns either the pid of the child process we run or -E* on failure. */
 int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
 {
 	struct helper_data data;
@@ -58,14 +53,15 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
 	ret = socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
 	if (ret < 0) {
 		ret = -errno;
-		printk("run_helper : pipe failed, errno = %d\n", errno);
+		printk(UM_KERN_ERR "run_helper : pipe failed, errno = %d\n",
+		       errno);
 		goto out_free;
 	}
 
 	ret = os_set_exec_close(fds[1]);
 	if (ret < 0) {
-		printk("run_helper : setting FD_CLOEXEC failed, ret = %d\n",
-		       -ret);
+		printk(UM_KERN_ERR "run_helper : setting FD_CLOEXEC failed, "
+		       "ret = %d\n", -ret);
 		goto out_close;
 	}
 
@@ -79,7 +75,8 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
 	pid = clone(helper_child, (void *) sp, CLONE_VM, &data);
 	if (pid < 0) {
 		ret = -errno;
-		printk("run_helper : clone failed, errno = %d\n", errno);
+		printk(UM_KERN_ERR "run_helper : clone failed, errno = %d\n",
+		       errno);
 		goto out_free2;
 	}
 
@@ -96,10 +93,9 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv)
 	} else {
 		if (n < 0) {
 			n = -errno;
-			printk("run_helper : read on pipe failed, ret = %d\n",
-			       -n);
+			printk(UM_KERN_ERR "run_helper : read on pipe failed, "
+			       "ret = %d\n", -n);
 			ret = n;
-			kill(pid, SIGKILL);
 		}
 		CATCH_EINTR(waitpid(pid, NULL, __WCLONE));
 	}
@@ -129,50 +125,40 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags,
 	pid = clone(proc, (void *) sp, flags, arg);
 	if (pid < 0) {
 		err = -errno;
-		printk("run_helper_thread : clone failed, errno = %d\n",
-		       errno);
+		printk(UM_KERN_ERR "run_helper_thread : clone failed, "
+		       "errno = %d\n", errno);
 		return err;
 	}
 	if (stack_out == NULL) {
 		CATCH_EINTR(pid = waitpid(pid, &status, __WCLONE));
 		if (pid < 0) {
 			err = -errno;
-			printk("run_helper_thread - wait failed, errno = %d\n",
-			       errno);
+			printk(UM_KERN_ERR "run_helper_thread - wait failed, "
+			       "errno = %d\n", errno);
 			pid = err;
 		}
 		if (!WIFEXITED(status) || (WEXITSTATUS(status) != 0))
-			printk("run_helper_thread - thread returned status "
-			       "0x%x\n", status);
+			printk(UM_KERN_ERR "run_helper_thread - thread "
+			       "returned status 0x%x\n", status);
 		free_stack(stack, 0);
 	} else
 		*stack_out = stack;
 	return pid;
 }
 
-int helper_wait(int pid, int nohang, char *pname)
+int helper_wait(int pid)
 {
 	int ret, status;
 	int wflags = __WCLONE;
 
-	if (nohang)
-		wflags |= WNOHANG;
-
-	if (!pname)
-		pname = "helper_wait";
-
 	CATCH_EINTR(ret = waitpid(pid, &status, wflags));
 	if (ret < 0) {
-		printk(UM_KERN_ERR "%s : waitpid process %d failed, "
-		       "errno = %d\n", pname, pid, errno);
+		printk(UM_KERN_ERR "helper_wait : waitpid process %d failed, "
+		       "errno = %d\n", pid, errno);
 		return -errno;
-	} else if (nohang && ret == 0) {
-		printk(UM_KERN_ERR "%s : process %d has not exited\n",
-		       pname, pid);
-		return -ECHILD;
 	} else if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
-		printk(UM_KERN_ERR "%s : process %d didn't exit with "
-		       "status 0\n", pname, pid);
+		printk(UM_KERN_ERR "helper_wait : process %d exited with "
+		       "status 0x%x\n", pid, status);
 		return -ECHILD;
 	} else
 		return 0;
-- 
cgit v1.1


From 00a905e6145ba200308a6a13e00248b85c600bd0 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:10 -0800
Subject: uml: don't kill pid 0

A bit of defensive programming - during development, it ocassionally
happens that a call to init_new_context is missed, resulting in
context holding a host pid of zero.  When that address space is torn
down, destroy_context does a kill(0), which instantly kills the whole
UML without any errors whatsoever.

This patch add a check for pids less than 2, to also catch 1 and
negative pids.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/skas/mmu.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index e8dc854..78b3e9f 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -164,8 +164,20 @@ void destroy_context(struct mm_struct *mm)
 
 	if (proc_mm)
 		os_close_file(mmu->id.u.mm_fd);
-	else
+	else {
+		/*
+		 * If init_new_context wasn't called, this will be
+		 * zero, resulting in a kill(0), which will result in the
+		 * whole UML suddenly dying.  Also, cover negative and
+		 * 1 cases, since they shouldn't happen either.
+		 */
+		if (mmu->id.u.pid < 2) {
+			printk(KERN_ERR "corrupt mm_context - pid = %d\n",
+			       mmu->id.u.pid);
+			return;
+		}
 		os_kill_ptraced_process(mmu->id.u.pid, 1);
+	}
 
 	if (skas_needs_stub)
 		free_page(mmu->id.stack);
-- 
cgit v1.1


From a9b71b6c5473d2c1526deac0a1a207fe476f6088 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:11 -0800
Subject: uml: get rid of syscall counters

Get rid of some syscall counters which haven't been useful in ages.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/kern_util.h        | 1 -
 arch/um/kernel/skas/syscall.c      | 3 ---
 arch/um/kernel/syscall.c           | 3 ---
 include/asm-um/processor-generic.h | 2 --
 4 files changed, 9 deletions(-)

diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index 715ffb5..3c34122 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -13,7 +13,6 @@ extern int uml_exitcode;
 
 extern int ncpus;
 extern int kmalloc_ok;
-extern int nsyscalls;
 
 #define UML_ROUND_UP(addr) \
 	((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK)
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index 50b476f..6450f02 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -17,9 +17,6 @@ void handle_syscall(struct uml_pt_regs *r)
 
 	syscall_trace(r, 0);
 
-	current->thread.nsyscalls++;
-	nsyscalls++;
-
 	/*
 	 * This should go in the declaration of syscall, but when I do that,
 	 * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
index b9d92b2..9cffc62 100644
--- a/arch/um/kernel/syscall.c
+++ b/arch/um/kernel/syscall.c
@@ -13,9 +13,6 @@
 #include "asm/uaccess.h"
 #include "asm/unistd.h"
 
-/*  Unlocked, I don't care if this is a bit off */
-int nsyscalls = 0;
-
 long sys_fork(void)
 {
 	long ret;
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index ecf6706..b7d9a16 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -27,7 +27,6 @@ struct thread_struct {
 	 * as of 2.6.11).
 	 */
 	int forking;
-	int nsyscalls;
 	struct pt_regs regs;
 	int singlestep_syscall;
 	void *fault_addr;
@@ -59,7 +58,6 @@ struct thread_struct {
 #define INIT_THREAD \
 { \
 	.forking		= 0, \
-	.nsyscalls		= 0, \
 	.regs		   	= EMPTY_REGS,	\
 	.fault_addr		= NULL, \
 	.prev_sched		= NULL, \
-- 
cgit v1.1


From e06173bde0ec9830a296720f8cd7cb2f17b76fa4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:12 -0800
Subject: uml: don't allow processes to call into stub

Kill a process that tries to branch into a stub and execute a system
call.  There are no security implications here - a system call in a
stub is treated the same as a system call anywhere else.  But if a
process is trying to branch into a stub, either it is trying something
nasty or it has gone haywire, so it's a good idea to get rid of it in
either case.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/skas/process.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 862fea0..8ab2f5c 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -146,6 +146,9 @@ static void handle_trap(int pid, struct uml_pt_regs *regs,
 {
 	int err, status;
 
+	if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
+		fatal_sigsegv();
+
 	/* Mark this as a syscall */
 	UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
 
-- 
cgit v1.1


From 75ada8ffe08cef9b506a796ba6f9ce2071dcf0d7 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:12 -0800
Subject: uml: move sig_handler_common_skas

This patch moves sig_handler_common_skas from
arch/um/os-Linux/skas/trap.c to its only caller in
arch/um/os-Linux/signal.c.  trap.c is now empty, so it can be removed.

This is code movement only - the significant cleanup needed here is
done in the next patch.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h           |  3 --
 arch/um/os-Linux/signal.c      | 62 +++++++++++++++++++++++++++++++++++
 arch/um/os-Linux/skas/Makefile |  6 ++--
 arch/um/os-Linux/skas/trap.c   | 73 ------------------------------------------
 4 files changed, 65 insertions(+), 79 deletions(-)
 delete mode 100644 arch/um/os-Linux/skas/trap.c

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index bce2881..b9779ac 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -292,9 +292,6 @@ extern int add_sigio_fd(int fd);
 extern int ignore_sigio_fd(int fd);
 extern void maybe_sigio_broken(int fd, int read);
 
-/* skas/trap */
-extern void sig_handler_common_skas(int sig, void *sc_ptr);
-
 /* sys-x86_64/prctl.c */
 extern int os_arch_prctl(int pid, int code, unsigned long *addr);
 
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 62a66f3..cde9e76 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -9,12 +9,74 @@
 #include <errno.h>
 #include <signal.h>
 #include <strings.h>
+#include "as-layout.h"
+#include "kern_constants.h"
 #include "kern_util.h"
 #include "os.h"
 #include "sysdep/barrier.h"
 #include "sysdep/sigcontext.h"
+#include "task.h"
 #include "user.h"
 
+void (*sig_info[NSIG])(int, struct uml_pt_regs *) = {
+	[SIGTRAP]	= relay_signal,
+	[SIGFPE]	= relay_signal,
+	[SIGILL]	= relay_signal,
+	[SIGWINCH]	= winch,
+	[SIGBUS]	= bus_handler,
+	[SIGSEGV]	= segv_handler,
+	[SIGIO]		= sigio_handler,
+	[SIGVTALRM]	= timer_handler };
+
+static struct uml_pt_regs ksig_regs[UM_NR_CPUS];
+
+void sig_handler_common_skas(int sig, void *sc_ptr)
+{
+	struct sigcontext *sc = sc_ptr;
+	struct uml_pt_regs *r;
+	void (*handler)(int, struct uml_pt_regs *);
+	int save_user, save_errno = errno;
+
+	/*
+	 * This is done because to allow SIGSEGV to be delivered inside a SEGV
+	 * handler.  This can happen in copy_user, and if SEGV is disabled,
+	 * the process will die.
+	 * XXX Figure out why this is better than SA_NODEFER
+	 */
+	if (sig == SIGSEGV) {
+		change_sig(SIGSEGV, 1);
+		/*
+		 * For segfaults, we want the data from the
+		 * sigcontext.  In this case, we don't want to mangle
+		 * the process registers, so use a static set of
+		 * registers.  For other signals, the process
+		 * registers are OK.
+		 */
+		r = &ksig_regs[cpu()];
+		copy_sc(r, sc_ptr);
+	} else
+		r = TASK_REGS(get_current());
+
+	save_user = r->is_user;
+	r->is_user = 0;
+	if ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) ||
+	    (sig == SIGILL) || (sig == SIGTRAP))
+		GET_FAULTINFO_FROM_SC(r->faultinfo, sc);
+
+	change_sig(SIGUSR1, 1);
+
+	handler = sig_info[sig];
+
+	/* unblock SIGVTALRM, SIGIO if sig isn't IRQ signal */
+	if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
+		unblock_signals();
+
+	handler(sig, r);
+
+	errno = save_errno;
+	r->is_user = save_user;
+}
+
 /* Copied from linux/compiler-gcc.h since we can't include it directly */
 #define barrier() __asm__ __volatile__("": : :"memory")
 
diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile
index 5fd8d4d..d2ea340 100644
--- a/arch/um/os-Linux/skas/Makefile
+++ b/arch/um/os-Linux/skas/Makefile
@@ -1,10 +1,10 @@
 #
-# Copyright (C) 2002 - 2004 Jeff Dike (jdike@addtoit.com)
+# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com)
 # Licensed under the GPL
 #
 
-obj-y := mem.o process.o trap.o
+obj-y := mem.o process.o
 
-USER_OBJS := mem.o process.o trap.o
+USER_OBJS := $(obj-y)
 
 include arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/skas/trap.c b/arch/um/os-Linux/skas/trap.c
deleted file mode 100644
index a19a74f..0000000
--- a/arch/um/os-Linux/skas/trap.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <errno.h>
-#include <signal.h>
-#include "sysdep/ptrace.h"
-#include "kern_constants.h"
-#include "as-layout.h"
-#include "kern_util.h"
-#include "os.h"
-#include "sigcontext.h"
-#include "task.h"
-
-void (*sig_info[NSIG])(int, struct uml_pt_regs *) = {
-	[SIGTRAP]	= relay_signal,
-	[SIGFPE]	= relay_signal,
-	[SIGILL]	= relay_signal,
-	[SIGWINCH]	= winch,
-	[SIGBUS]	= bus_handler,
-	[SIGSEGV]	= segv_handler,
-	[SIGIO]		= sigio_handler,
-	[SIGVTALRM]	= timer_handler };
-
-static struct uml_pt_regs ksig_regs[UM_NR_CPUS];
-
-void sig_handler_common_skas(int sig, void *sc_ptr)
-{
-	struct sigcontext *sc = sc_ptr;
-	struct uml_pt_regs *r;
-	void (*handler)(int, struct uml_pt_regs *);
-	int save_user, save_errno = errno;
-
-	/*
-	 * This is done because to allow SIGSEGV to be delivered inside a SEGV
-	 * handler.  This can happen in copy_user, and if SEGV is disabled,
-	 * the process will die.
-	 * XXX Figure out why this is better than SA_NODEFER
-	 */
-	if (sig == SIGSEGV) {
-		change_sig(SIGSEGV, 1);
-		/*
-		 * For segfaults, we want the data from the
-		 * sigcontext.  In this case, we don't want to mangle
-		 * the process registers, so use a static set of
-		 * registers.  For other signals, the process
-		 * registers are OK.
-		 */
-		r = &ksig_regs[cpu()];
-		copy_sc(r, sc_ptr);
-	}
-	else r = TASK_REGS(get_current());
-
-	save_user = r->is_user;
-	r->is_user = 0;
-	if ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) ||
-	    (sig == SIGILL) || (sig == SIGTRAP))
-		GET_FAULTINFO_FROM_SC(r->faultinfo, sc);
-
-	change_sig(SIGUSR1, 1);
-
-	handler = sig_info[sig];
-
-	/* unblock SIGVTALRM, SIGIO if sig isn't IRQ signal */
-	if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
-		unblock_signals();
-
-	handler(sig, r);
-
-	errno = save_errno;
-	r->is_user = save_user;
-}
-- 
cgit v1.1


From e6a2d1f7024f93e4622cd7ba633666a63ccce49e Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:13 -0800
Subject: uml: clean up sig_handler_common_skas

sig_handler_common_skas needs significant modernization, starting with
its name and storage class.

There is no need to hide the true type of the sigcontext pointer, so
the void * dummy parameter can be replaced with a sigcontext *sc.

The array of uml_pt_regs structs used in the page fault case are gone,
replaced by a local variable.  This is also used in the non-segfault
case instead of the copy in the task_struct.  Since it's local, the
special handling of the is_user flag can go away.

There hasn't been any special treatment of SIGUSR1 in ages, so the
line that enables it can be deleted.

The special treatment of SIGSEGV similarly goes away, but to
compensate, SA_NODEFER is added to sa_mask when registering a signal
handler.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/signal.c | 63 ++++++++++++++---------------------------------
 1 file changed, 18 insertions(+), 45 deletions(-)

diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index cde9e76..91a35da 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -10,14 +10,15 @@
 #include <signal.h>
 #include <strings.h>
 #include "as-layout.h"
-#include "kern_constants.h"
 #include "kern_util.h"
 #include "os.h"
 #include "sysdep/barrier.h"
 #include "sysdep/sigcontext.h"
-#include "task.h"
 #include "user.h"
 
+/* Copied from linux/compiler-gcc.h since we can't include it directly */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
 void (*sig_info[NSIG])(int, struct uml_pt_regs *) = {
 	[SIGTRAP]	= relay_signal,
 	[SIGFPE]	= relay_signal,
@@ -28,58 +29,27 @@ void (*sig_info[NSIG])(int, struct uml_pt_regs *) = {
 	[SIGIO]		= sigio_handler,
 	[SIGVTALRM]	= timer_handler };
 
-static struct uml_pt_regs ksig_regs[UM_NR_CPUS];
-
-void sig_handler_common_skas(int sig, void *sc_ptr)
+static void sig_handler_common(int sig, struct sigcontext *sc)
 {
-	struct sigcontext *sc = sc_ptr;
-	struct uml_pt_regs *r;
-	void (*handler)(int, struct uml_pt_regs *);
-	int save_user, save_errno = errno;
+	struct uml_pt_regs r;
+	int save_errno = errno;
 
-	/*
-	 * This is done because to allow SIGSEGV to be delivered inside a SEGV
-	 * handler.  This can happen in copy_user, and if SEGV is disabled,
-	 * the process will die.
-	 * XXX Figure out why this is better than SA_NODEFER
-	 */
+	r.is_user = 0;
 	if (sig == SIGSEGV) {
-		change_sig(SIGSEGV, 1);
-		/*
-		 * For segfaults, we want the data from the
-		 * sigcontext.  In this case, we don't want to mangle
-		 * the process registers, so use a static set of
-		 * registers.  For other signals, the process
-		 * registers are OK.
-		 */
-		r = &ksig_regs[cpu()];
-		copy_sc(r, sc_ptr);
-	} else
-		r = TASK_REGS(get_current());
-
-	save_user = r->is_user;
-	r->is_user = 0;
-	if ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) ||
-	    (sig == SIGILL) || (sig == SIGTRAP))
-		GET_FAULTINFO_FROM_SC(r->faultinfo, sc);
-
-	change_sig(SIGUSR1, 1);
-
-	handler = sig_info[sig];
+		/* For segfaults, we want the data from the sigcontext. */
+		copy_sc(&r, sc);
+		GET_FAULTINFO_FROM_SC(r.faultinfo, sc);
+	}
 
-	/* unblock SIGVTALRM, SIGIO if sig isn't IRQ signal */
+	/* enable signals if sig isn't IRQ signal */
 	if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
 		unblock_signals();
 
-	handler(sig, r);
+	(*sig_info[sig])(sig, &r);
 
 	errno = save_errno;
-	r->is_user = save_user;
 }
 
-/* Copied from linux/compiler-gcc.h since we can't include it directly */
-#define barrier() __asm__ __volatile__("": : :"memory")
-
 /*
  * These are the asynchronous signals.  SIGPROF is excluded because we want to
  * be able to profile all of UML, not just the non-critical sections.  If
@@ -107,7 +77,7 @@ void sig_handler(int sig, struct sigcontext *sc)
 
 	block_signals();
 
-	sig_handler_common_skas(sig, sc);
+	sig_handler_common(sig, sc);
 
 	set_signals(enabled);
 }
@@ -227,6 +197,9 @@ void set_handler(int sig, void (*handler)(int), int flags, ...)
 		sigaddset(&action.sa_mask, mask);
 	va_end(ap);
 
+	if (sig == SIGSEGV)
+		flags |= SA_NODEFER;
+
 	action.sa_flags = flags;
 	action.sa_restorer = NULL;
 	if (sigaction(sig, &action, NULL) < 0)
@@ -306,7 +279,7 @@ void unblock_signals(void)
 		 * back here.
 		 */
 		if (save_pending & SIGIO_MASK)
-			sig_handler_common_skas(SIGIO, NULL);
+			sig_handler_common(SIGIO, NULL);
 
 		if (save_pending & SIGVTALRM_MASK)
 			real_alarm_handler(NULL);
-- 
cgit v1.1


From c5d4bb171cab17576779a51d23d313abcb3db102 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:14 -0800
Subject: uml: style fixes in arch/um/kernel

Joe Perches noticed some printks in smp.c that needed fixing.

While I was in there, I did the usual tidying in arch/um/kernel, which
should be fairly style-clean at this point:
	copyright updates
	emacs formatting comments removal
	include tidying
	style fixes

Cc: Joe Perches <joe@perches.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/exitcode.c   | 31 +++++++++++++++++--------------
 arch/um/kernel/gmon_syms.c  | 11 ++++++-----
 arch/um/kernel/gprof_syms.c | 13 +------------
 arch/um/kernel/process.c    | 41 ++++++++++++++++++++++-------------------
 arch/um/kernel/reboot.c     |  6 +++---
 arch/um/kernel/sigio.c      | 18 ++++++------------
 arch/um/kernel/signal.c     | 16 ++++++++--------
 arch/um/kernel/smp.c        |  7 +++----
 arch/um/kernel/sysrq.c      | 44 +++++++++++++++++++++-----------------------
 arch/um/kernel/time.c       | 14 +++++++-------
 arch/um/kernel/tlb.c        |  2 +-
 arch/um/kernel/uaccess.c    | 11 +++++++----
 arch/um/kernel/um_arch.c    | 26 +++++++++++++-------------
 arch/um/kernel/umid.c       | 15 +++++++--------
 14 files changed, 122 insertions(+), 133 deletions(-)

diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c
index c716b5a..984f80e 100644
--- a/arch/um/kernel/exitcode.c
+++ b/arch/um/kernel/exitcode.c
@@ -1,15 +1,17 @@
 /*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include "linux/kernel.h"
-#include "linux/init.h"
-#include "linux/ctype.h"
-#include "linux/proc_fs.h"
-#include "asm/uaccess.h"
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
 
-/* If read and write race, the read will still atomically read a valid
+/*
+ * If read and write race, the read will still atomically read a valid
  * value.
  */
 int uml_exitcode = 0;
@@ -19,18 +21,19 @@ static int read_proc_exitcode(char *page, char **start, off_t off,
 {
 	int len, val;
 
-	/* Save uml_exitcode in a local so that we don't need to guarantee
+	/*
+	 * Save uml_exitcode in a local so that we don't need to guarantee
 	 * that sprintf accesses it atomically.
 	 */
 	val = uml_exitcode;
 	len = sprintf(page, "%d\n", val);
 	len -= off;
-	if(len <= off+count)
+	if (len <= off+count)
 		*eof = 1;
 	*start = page + off;
-	if(len > count)
+	if (len > count)
 		len = count;
-	if(len < 0)
+	if (len < 0)
 		len = 0;
 	return len;
 }
@@ -41,11 +44,11 @@ static int write_proc_exitcode(struct file *file, const char __user *buffer,
 	char *end, buf[sizeof("nnnnn\0")];
 	int tmp;
 
-	if(copy_from_user(buf, buffer, count))
+	if (copy_from_user(buf, buffer, count))
 		return -EFAULT;
 
 	tmp = simple_strtol(buf, &end, 0);
-	if((*end != '\0') && !isspace(*end))
+	if ((*end != '\0') && !isspace(*end))
 		return -EINVAL;
 
 	uml_exitcode = tmp;
@@ -57,7 +60,7 @@ static int make_proc_exitcode(void)
 	struct proc_dir_entry *ent;
 
 	ent = create_proc_entry("exitcode", 0600, &proc_root);
-	if(ent == NULL){
+	if (ent == NULL) {
 		printk(KERN_WARNING "make_proc_exitcode : Failed to register "
 		       "/proc/exitcode\n");
 		return 0;
diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c
index 734f873..72eccd2 100644
--- a/arch/um/kernel/gmon_syms.c
+++ b/arch/um/kernel/gmon_syms.c
@@ -1,5 +1,5 @@
-/* 
- * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -8,12 +8,13 @@
 extern void __bb_init_func(void *)  __attribute__((weak));
 EXPORT_SYMBOL(__bb_init_func);
 
-/* This is defined (and referred to in profiling stub code) only by some GCC
+/*
+ * This is defined (and referred to in profiling stub code) only by some GCC
  * versions in libgcov.
  *
  * Since SuSE backported the fix, we cannot handle it depending on GCC version.
- * So, unconditionally export it. But also give it a weak declaration, which will
- * be overridden by any other one.
+ * So, unconditionally export it. But also give it a weak declaration, which
+ * will be overridden by any other one.
  */
 
 extern void __gcov_init(void *) __attribute__((weak));
diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c
index 9244f01..e2f043d 100644
--- a/arch/um/kernel/gprof_syms.c
+++ b/arch/um/kernel/gprof_syms.c
@@ -1,5 +1,5 @@
 /* 
- * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -7,14 +7,3 @@
 
 extern void mcount(void);
 EXPORT_SYMBOL(mcount);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index e6d89ad..c07961b 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -4,19 +4,21 @@
  * Licensed under the GPL
  */
 
-#include "linux/stddef.h"
-#include "linux/err.h"
-#include "linux/hardirq.h"
-#include "linux/mm.h"
-#include "linux/personality.h"
-#include "linux/proc_fs.h"
-#include "linux/ptrace.h"
-#include "linux/random.h"
-#include "linux/sched.h"
-#include "linux/tick.h"
-#include "linux/threads.h"
-#include "asm/pgtable.h"
-#include "asm/uaccess.h"
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/hardirq.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/personality.h>
+#include <linux/proc_fs.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/threads.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
 #include "as-layout.h"
 #include "kern_util.h"
 #include "os.h"
@@ -40,7 +42,7 @@ int pid_to_processor_id(int pid)
 {
 	int i;
 
-	for(i = 0; i < ncpus; i++) {
+	for (i = 0; i < ncpus; i++) {
 		if (cpu_tasks[i].pid == pid)
 			return i;
 	}
@@ -94,14 +96,15 @@ void *_switch_to(void *prev, void *next, void *last)
 	do {
 		current->thread.saved_task = NULL;
 
-		switch_threads(&from->thread.switch_buf, &to->thread.switch_buf);
+		switch_threads(&from->thread.switch_buf,
+			       &to->thread.switch_buf);
 
 		arch_switch_to(current);
 
 		if (current->thread.saved_task)
 			show_regs(&(current->thread.regs));
-		next = current->thread.saved_task;
-		prev = current;
+		to = current->thread.saved_task;
+		from = current;
 	} while (current->thread.saved_task);
 
 	return current->thread.prev_sched;
@@ -232,7 +235,7 @@ void default_idle(void)
 {
 	unsigned long long nsecs;
 
-	while(1) {
+	while (1) {
 		/* endless idle loop with no priority at all */
 
 		/*
@@ -387,7 +390,7 @@ int singlestepping(void * t)
 {
 	struct task_struct *task = t ? t : current;
 
-	if ( ! (task->ptrace & PT_DTRACE) )
+	if (!(task->ptrace & PT_DTRACE))
 		return 0;
 
 	if (task->thread.singlestep_syscall)
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index 1ce49cd..00197d3 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -12,7 +12,7 @@ void (*pm_power_off)(void);
 
 static void kill_off_processes(void)
 {
-	if(proc_mm)
+	if (proc_mm)
 		/*
 		 * FIXME: need to loop over userspace_pids
 		 */
@@ -22,8 +22,8 @@ static void kill_off_processes(void)
 		int pid, me;
 
 		me = os_getpid();
-		for_each_process(p){
-			if(p->mm == NULL)
+		for_each_process(p) {
+			if (p->mm == NULL)
 				continue;
 
 			pid = p->mm->context.id.u.pid;
diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c
index 89f9866..2b272b6 100644
--- a/arch/um/kernel/sigio.c
+++ b/arch/um/kernel/sigio.c
@@ -1,18 +1,12 @@
 /*
- * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com)
  * Licensed under the GPL
  */
 
-#include "linux/kernel.h"
-#include "linux/list.h"
-#include "linux/slab.h"
-#include "linux/signal.h"
-#include "linux/interrupt.h"
-#include "init.h"
-#include "sigio.h"
-#include "irq_user.h"
+#include <linux/interrupt.h>
 #include "irq_kern.h"
 #include "os.h"
+#include "sigio.h"
 
 /* Protected by sigio_lock() called from write_sigio_workaround */
 static int sigio_irq_fd = -1;
@@ -33,9 +27,9 @@ int write_sigio_irq(int fd)
 	err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt,
 			     IRQF_DISABLED|IRQF_SAMPLE_RANDOM, "write sigio",
 			     NULL);
-	if(err){
-		printk("write_sigio_irq : um_request_irq failed, err = %d\n",
-		       err);
+	if (err) {
+		printk(KERN_ERR "write_sigio_irq : um_request_irq failed, "
+		       "err = %d\n", err);
 		return -1;
 	}
 	sigio_irq_fd = fd;
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 19cb977..b0fce72 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -3,12 +3,12 @@
  * Licensed under the GPL
  */
 
-#include "linux/module.h"
-#include "linux/ptrace.h"
-#include "linux/sched.h"
-#include "asm/siginfo.h"
-#include "asm/signal.h"
-#include "asm/unistd.h"
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <asm/siginfo.h>
+#include <asm/signal.h>
+#include <asm/unistd.h>
 #include "frame_kern.h"
 #include "kern_util.h"
 #include "sigcontext.h"
@@ -36,7 +36,7 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr,
 	/* Did we come from a system call? */
 	if (PT_REGS_SYSCALL_NR(regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch(PT_REGS_SYSCALL_RET(regs)) {
+		switch (PT_REGS_SYSCALL_RET(regs)) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
 			PT_REGS_SYSCALL_RET(regs) = -EINTR;
@@ -116,7 +116,7 @@ static int kern_do_signal(struct pt_regs *regs)
 	/* Did we come from a system call? */
 	if (!handled_sig && (PT_REGS_SYSCALL_NR(regs) >= 0)) {
 		/* Restart the system call - no handlers present */
-		switch(PT_REGS_SYSCALL_RET(regs)) {
+		switch (PT_REGS_SYSCALL_RET(regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 869ac68..e1062ec 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -60,7 +60,7 @@ void smp_send_stop(void)
 			continue;
 		os_write_file(cpu_data[i].ipi_pipe[1], "S", 1);
 	}
-	printk(KERN_INFO "done\n");
+	printk(KERN_CONT "done\n");
 }
 
 static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
@@ -140,9 +140,8 @@ void smp_prepare_cpus(unsigned int maxcpus)
 		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
 			cpu_relax();
 
-		if (cpu_isset(cpu, cpu_callin_map))
-			printk(KERN_INFO "done\n");
-		else printk(KERN_INFO "failed\n");
+		printk(KERN_INFO "%s\n",
+		       cpu_isset(cpu, cpu_calling_map) ? "done" : "failed");
 	}
 }
 
diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
index 9326357..56d43d0 100644
--- a/arch/um/kernel/sysrq.c
+++ b/arch/um/kernel/sysrq.c
@@ -1,38 +1,37 @@
-/* 
- * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include "linux/sched.h"
-#include "linux/kernel.h"
-#include "linux/module.h"
-#include "linux/kallsyms.h"
-#include "asm/page.h"
-#include "asm/processor.h"
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
 #include "sysrq.h"
 
 /* Catch non-i386 SUBARCH's. */
 #if !defined(CONFIG_UML_X86) || defined(CONFIG_64BIT)
 void show_trace(struct task_struct *task, unsigned long * stack)
 {
-        unsigned long addr;
+	unsigned long addr;
 
-        if (!stack) {
+	if (!stack) {
 		stack = (unsigned long*) &stack;
 		WARN_ON(1);
 	}
 
-        printk("Call Trace: \n");
-        while (((long) stack & (THREAD_SIZE-1)) != 0) {
-                addr = *stack;
+	printk(KERN_INFO "Call Trace: \n");
+	while (((long) stack & (THREAD_SIZE-1)) != 0) {
+		addr = *stack;
 		if (__kernel_text_address(addr)) {
-			printk("%08lx:  [<%08lx>]", (unsigned long) stack, addr);
-			print_symbol(" %s", addr);
-			printk("\n");
-                }
-                stack++;
-        }
-        printk("\n");
+			printk(KERN_INFO "%08lx:  [<%08lx>]",
+			       (unsigned long) stack, addr);
+			print_symbol(KERN_CONT " %s", addr);
+			printk(KERN_CONT "\n");
+		}
+		stack++;
+	}
+	printk(KERN_INFO "\n");
 }
 #endif
 
@@ -67,14 +66,13 @@ void show_stack(struct task_struct *task, unsigned long *esp)
 	}
 
 	stack = esp;
-	for(i = 0; i < kstack_depth_to_print; i++) {
+	for (i = 0; i < kstack_depth_to_print; i++) {
 		if (kstack_end(stack))
 			break;
 		if (i && ((i % 8) == 0))
-			printk("\n       ");
+			printk("\n" KERN_INFO "       ");
 		printk("%08lx ", *stack++);
 	}
 
-	printk("Call Trace: \n");
 	show_trace(task, esp);
 }
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 1ac746a..e066e84 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -3,12 +3,12 @@
  * Licensed under the GPL
  */
 
-#include "linux/clockchips.h"
-#include "linux/interrupt.h"
-#include "linux/jiffies.h"
-#include "linux/threads.h"
-#include "asm/irq.h"
-#include "asm/param.h"
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/jiffies.h>
+#include <linux/threads.h>
+#include <asm/irq.h>
+#include <asm/param.h>
 #include "kern_util.h"
 #include "os.h"
 
@@ -32,7 +32,7 @@ void timer_handler(int sig, struct uml_pt_regs *regs)
 static void itimer_set_mode(enum clock_event_mode mode,
 			    struct clock_event_device *evt)
 {
-	switch(mode) {
+	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
 		set_interval();
 		break;
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 0b6a77d..d175d05 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -57,7 +57,7 @@ static int do_ops(struct host_vm_change *hvc, int end,
 
 	for (i = 0; i < end && !ret; i++) {
 		op = &hvc->ops[i];
-		switch(op->type) {
+		switch (op->type) {
 		case MMAP:
 			ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len,
 				  op->u.mmap.prot, op->u.mmap.fd,
diff --git a/arch/um/kernel/uaccess.c b/arch/um/kernel/uaccess.c
index d7436aa..f0f4b04 100644
--- a/arch/um/kernel/uaccess.c
+++ b/arch/um/kernel/uaccess.c
@@ -1,10 +1,11 @@
 /*
  * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk)
- * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-/* These are here rather than tt/uaccess.c because skas mode needs them in
+/*
+ * These are here rather than tt/uaccess.c because skas mode needs them in
  * order to do SIGBUS recovery when a tmpfs mount runs out of room.
  */
 
@@ -25,6 +26,8 @@ int __do_copy_to_user(void *to, const void *from, int n,
 
 	fault = __do_user_copy(to, from, n, fault_addr, fault_catcher,
 			       __do_copy, &faulted);
-	if(!faulted) return(0);
-	else return(n - (fault - (unsigned long) to));
+	if (!faulted)
+		return 0;
+	else
+		return n - (fault - (unsigned long) to);
 }
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index cb7eef8..468aba9 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -3,23 +3,23 @@
  * Licensed under the GPL
  */
 
-#include "linux/delay.h"
-#include "linux/mm.h"
-#include "linux/module.h"
-#include "linux/seq_file.h"
-#include "linux/string.h"
-#include "linux/utsname.h"
-#include "asm/pgtable.h"
-#include "asm/processor.h"
-#include "asm/setup.h"
-#include "arch.h"
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/utsname.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
 #include "as-layout.h"
+#include "arch.h"
 #include "init.h"
 #include "kern.h"
 #include "kern_util.h"
 #include "mem_user.h"
 #include "os.h"
-#include "skas.h"
 
 #define DEFAULT_COMMAND_LINE "root=98:0"
 
@@ -201,7 +201,7 @@ static void __init uml_checksetup(char *line, int *add)
 	struct uml_param *p;
 
 	p = &__uml_setup_start;
-	while(p < &__uml_setup_end) {
+	while (p < &__uml_setup_end) {
 		int n;
 
 		n = strlen(p->str);
@@ -216,7 +216,7 @@ static void __init uml_postsetup(void)
 	initcall_t *p;
 
 	p = &__uml_postsetup_start;
-	while(p < &__uml_postsetup_end) {
+	while (p < &__uml_postsetup_end) {
 		(*p)();
 		p++;
 	}
diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
index 039e16e..81e07e2b 100644
--- a/arch/um/kernel/umid.c
+++ b/arch/um/kernel/umid.c
@@ -1,13 +1,12 @@
-/* 
- * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include "asm/errno.h"
+#include <asm/errno.h>
 #include "init.h"
-#include "os.h"
 #include "kern.h"
-#include "linux/kernel.h"
+#include "os.h"
 
 /* Changed by set_umid_arg */
 static int umid_inited = 0;
@@ -16,16 +15,16 @@ static int __init set_umid_arg(char *name, int *add)
 {
 	int err;
 
-	if(umid_inited){
+	if (umid_inited) {
 		printf("umid already set\n");
 		return 0;
 	}
 
 	*add = 0;
 	err = set_umid(name);
-	if(err == -EEXIST)
+	if (err == -EEXIST)
 		printf("umid '%s' already in use\n", name);
-	else if(!err)
+	else if (!err)
 		umid_inited = 1;
 
 	return 0;
-- 
cgit v1.1


From d7b88513c504e49d450b0f89f80ba9d451a3c804 Mon Sep 17 00:00:00 2001
From: Dominique Quatravaux <dominique@quatravaux.org>
Date: Mon, 4 Feb 2008 22:31:15 -0800
Subject: uml: fix hostfs tv_usec calculations

To convert from tv_nsec to tv_usec, one needs to divide by 1000, not multiply.

Signed-off-by: Dominique Quatravaux <dominique@quatravaux.org>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs_user.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 35c1a9f..53fd0a6 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -285,17 +285,17 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 			return err;
 
 		times[0].tv_sec = atime_ts.tv_sec;
-		times[0].tv_usec = atime_ts.tv_nsec * 1000;
+		times[0].tv_usec = atime_ts.tv_nsec / 1000;
 		times[1].tv_sec = mtime_ts.tv_sec;
-		times[1].tv_usec = mtime_ts.tv_nsec * 1000;
+		times[1].tv_usec = mtime_ts.tv_nsec / 1000;
 
 		if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
 			times[0].tv_sec = attrs->ia_atime.tv_sec;
-			times[0].tv_usec = attrs->ia_atime.tv_nsec * 1000;
+			times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000;
 		}
 		if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) {
 			times[1].tv_sec = attrs->ia_mtime.tv_sec;
-			times[1].tv_usec = attrs->ia_mtime.tv_nsec * 1000;
+			times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000;
 		}
 
 		if (fd >= 0) {
-- 
cgit v1.1


From cfef8f34e7cf57f3d278ceda79c85112dec13dc6 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:16 -0800
Subject: uml: signal handling tidying

This patch tidies the signal handling code slightly.

pending is renamed to signals_pending for symmetry with signals_enabled.

remove_sigstack was unused, so can be deleted.

The value of change_sig was never used, so it is now void and the
return value is not calculated any more.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/signal.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 91a35da..0fb0cc8 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -63,7 +63,7 @@ static void sig_handler_common(int sig, struct sigcontext *sc)
 #define SIGVTALRM_MASK (1 << SIGVTALRM_BIT)
 
 static int signals_enabled;
-static unsigned int pending;
+static unsigned int signals_pending;
 
 void sig_handler(int sig, struct sigcontext *sc)
 {
@@ -71,7 +71,7 @@ void sig_handler(int sig, struct sigcontext *sc)
 
 	enabled = signals_enabled;
 	if (!enabled && (sig == SIGIO)) {
-		pending |= SIGIO_MASK;
+		signals_pending |= SIGIO_MASK;
 		return;
 	}
 
@@ -99,7 +99,7 @@ void alarm_handler(int sig, struct sigcontext *sc)
 
 	enabled = signals_enabled;
 	if (!signals_enabled) {
-		pending |= SIGVTALRM_MASK;
+		signals_pending |= SIGVTALRM_MASK;
 		return;
 	}
 
@@ -125,16 +125,6 @@ void set_sigstack(void *sig_stack, int size)
 		panic("enabling signal stack failed, errno = %d\n", errno);
 }
 
-void remove_sigstack(void)
-{
-	stack_t stack = ((stack_t) { .ss_flags	= SS_DISABLE,
-				     .ss_sp	= NULL,
-				     .ss_size	= 0 });
-
-	if (sigaltstack(&stack, NULL) != 0)
-		panic("disabling signal stack failed, errno = %d\n", errno);
-}
-
 void (*handlers[_NSIG])(int sig, struct sigcontext *sc);
 
 void handle_signal(int sig, struct sigcontext *sc)
@@ -213,13 +203,14 @@ void set_handler(int sig, void (*handler)(int), int flags, ...)
 
 int change_sig(int signal, int on)
 {
-	sigset_t sigset, old;
+	sigset_t sigset;
 
 	sigemptyset(&sigset);
 	sigaddset(&sigset, signal);
-	if (sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old) < 0)
+	if (sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0)
 		return -errno;
-	return !sigismember(&old, signal);
+
+	return 0;
 }
 
 void block_signals(void)
@@ -244,26 +235,26 @@ void unblock_signals(void)
 	/*
 	 * We loop because the IRQ handler returns with interrupts off.  So,
 	 * interrupts may have arrived and we need to re-enable them and
-	 * recheck pending.
+	 * recheck signals_pending.
 	 */
 	while(1) {
 		/*
 		 * Save and reset save_pending after enabling signals.  This
-		 * way, pending won't be changed while we're reading it.
+		 * way, signals_pending won't be changed while we're reading it.
 		 */
 		signals_enabled = 1;
 
 		/*
-		 * Setting signals_enabled and reading pending must
+		 * Setting signals_enabled and reading signals_pending must
 		 * happen in this order.
 		 */
 		barrier();
 
-		save_pending = pending;
+		save_pending = signals_pending;
 		if (save_pending == 0)
 			return;
 
-		pending = 0;
+		signals_pending = 0;
 
 		/*
 		 * We have pending interrupts, so disable signals, as the
-- 
cgit v1.1


From 3a24ebf0cb2ca44fdcdb5cae9ed2e778e5170f97 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:16 -0800
Subject: uml: remove init_irq_signals

init_irq_signals doesn't need to be called from the context of a new process.
It initializes handlers, which are useless in process context.  With that call
gone, init_irq_signals has only one caller, so it can be inlined into
init_new_thread_signals.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/os.h       |  1 -
 arch/um/os-Linux/irq.c     | 11 -----------
 arch/um/os-Linux/process.c |  5 ++++-
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index b9779ac..0b6b627 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -285,7 +285,6 @@ extern void os_free_irq_later(struct irq_fd *active_fds,
 extern int os_get_pollfd(int i);
 extern void os_set_pollfd(int i, int fd);
 extern void os_set_ioignore(void);
-extern void init_irq_signals(int on_sigstack);
 
 /* sigio.c */
 extern int add_sigio_fd(int fd);
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index 430866c..0348b97 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -136,14 +136,3 @@ void os_set_ioignore(void)
 {
 	signal(SIGIO, SIG_IGN);
 }
-
-void init_irq_signals(int on_sigstack)
-{
-	int flags;
-
-	flags = on_sigstack ? SA_ONSTACK : 0;
-
-	set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART,
-		    SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1);
-	signal(SIGWINCH, SIG_IGN);
-}
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index bda5c31..abf6bea 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -249,7 +249,10 @@ void init_new_thread_signals(void)
 		    SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1);
 	signal(SIGHUP, SIG_IGN);
 
-	init_irq_signals(1);
+	set_handler(SIGIO, (__sighandler_t) sig_handler,
+		    SA_ONSTACK | SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGALRM,
+		    SIGVTALRM, -1);
+	signal(SIGWINCH, SIG_IGN);
 }
 
 int run_kernel_thread(int (*fn)(void *), void *arg, jmp_buf **jmp_ptr)
-- 
cgit v1.1


From 80e39311ff3d7d2267ea8d259aab8dc9d5a59d61 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:17 -0800
Subject: uml: SMP locking commentary

Add some more commentary about various pieces of global data not needing
locking.

Also got rid of unmap_physmem since that is no longer used.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/net_kern.c |  7 ++++++-
 arch/um/include/mem_user.h |  1 -
 arch/um/kernel/mem.c       | 13 ++++++++-----
 arch/um/kernel/physmem.c   |  6 +++---
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 3c6c44c..ca71577 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -368,7 +368,6 @@ static struct platform_driver uml_net_driver = {
 		.name  = DRIVER_NAME,
 	},
 };
-static int driver_registered;
 
 static void net_device_release(struct device *dev)
 {
@@ -383,6 +382,12 @@ static void net_device_release(struct device *dev)
 	free_netdev(netdev);
 }
 
+/*
+ * Ensures that platform_driver_register is called only once by
+ * eth_configure.  Will be set in an initcall.
+ */
+static int driver_registered;
+
 static void eth_configure(int n, void *init, char *mac,
 			  struct transport *transport)
 {
diff --git a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h
index 4e6707b..46384ac 100644
--- a/arch/um/include/mem_user.h
+++ b/arch/um/include/mem_user.h
@@ -56,7 +56,6 @@ extern void setup_physmem(unsigned long start, unsigned long usable,
 			  unsigned long len, unsigned long long highmem);
 extern void add_iomem(char *name, int fd, unsigned long size);
 extern unsigned long phys_offset(unsigned long phys);
-extern void unmap_physmem(void);
 extern void map_memory(unsigned long virt, unsigned long phys,
 		       unsigned long len, int r, int w, int x);
 
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 663011c..d948bab 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -22,17 +22,20 @@
 unsigned long *empty_zero_page = NULL;
 /* allocated in paging_init and unchanged thereafter */
 unsigned long *empty_bad_page = NULL;
+
+/*
+ * Initialized during boot, and readonly for initializing page tables
+ * afterwards
+ */
 pgd_t swapper_pg_dir[PTRS_PER_PGD];
+
+/* Initialized at boot time, and readonly after that */
 unsigned long long highmem;
 int kmalloc_ok = 0;
 
+/* Used during early boot */
 static unsigned long brk_end;
 
-void unmap_physmem(void)
-{
-	os_unmap_memory((void *) brk_end, uml_reserved - brk_end);
-}
-
 static void map_cb(void *unused)
 {
 	map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index 9c92900..9757085 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -164,10 +164,10 @@ __uml_setup("iomem=", parse_iomem,
  * setup_iomem, both of which run during early boot.  Afterwards, it's
  * unchanged.
  */
-struct iomem_region *iomem_regions = NULL;
+struct iomem_region *iomem_regions;
 
-/* Initialized in parse_iomem */
-int iomem_size = 0;
+/* Initialized in parse_iomem and unchanged thereafter */
+int iomem_size;
 
 unsigned long find_iomem(char *driver, unsigned long *len_out)
 {
-- 
cgit v1.1


From bf53d85ec20c228e0efdadbdb12c0f92283fcfd0 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:18 -0800
Subject: uml: implement O_APPEND

The .a flags in openflags never had an implementation.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index d7404c6..b5afcfd 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -191,6 +191,8 @@ int os_open_file(const char *file, struct openflags flags, int mode)
 		f |= O_TRUNC;
 	if (flags.e)
 		f |= O_EXCL;
+	if (flags.a)
+		f |= O_APPEND;
 
 	fd = open64(file, f, mode);
 	if (fd < 0)
-- 
cgit v1.1


From 83380cc1c9694a05bcdb7c95d293e99d3475d906 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:18 -0800
Subject: uml: remove fakehd

The fakehd switch lost its implementation at some point.  Since no one is
screaming for it, we might as well remove it.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/ubd_kern.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index b498f27..be3a279 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -460,20 +460,6 @@ __uml_help(udb_setup,
 "    in the boot output.\n\n"
 );
 
-static int fakehd_set = 0;
-static int fakehd(char *str)
-{
-	printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n");
-	fakehd_set = 1;
-	return 1;
-}
-
-__setup("fakehd", fakehd);
-__uml_help(fakehd,
-"fakehd\n"
-"    Change the ubd device name to \"hd\".\n\n"
-);
-
 static void do_ubd_request(struct request_queue * q);
 
 /* Only changed by ubd_init, which is an initcall. */
@@ -722,8 +708,10 @@ static int ubd_add(int n, char **error_out)
 		ubd_disk_register(fake_major, ubd_dev->size, n,
 				  &fake_gendisk[n]);
 
-	/* perhaps this should also be under the "if (fake_major)" above */
-	/* using the fake_disk->disk_name and also the fakehd_set name */
+	/*
+	 * Perhaps this should also be under the "if (fake_major)" above
+	 * using the fake_disk->disk_name
+	 */
 	if (fake_ide)
 		make_ide_entries(ubd_gendisk[n]->disk_name);
 
-- 
cgit v1.1


From 438ee6798cd8bfc44da725fca846367e19d86652 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:19 -0800
Subject: uml: DEBUG_SHIRQ fixes

A couple more DEBUG_SHIRQ fixes.

The previous mconsole blocking fix exposed the lack of O_NONBLOCK on the
mconsole socket.

Also, winch_interrupt started crashing because it is called at irq free time
and it tries to dereference tty->driver_data, which has already been set to
NULL.

I added some error cleanup in mconsole_init while I was there.

Cc: "Karol Swietlicki" <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/line.c          | 8 +++++---
 arch/um/drivers/mconsole_kern.c | 8 +++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index fac058b..2c898c4 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -774,9 +774,11 @@ static irqreturn_t winch_interrupt(int irq, void *data)
 	tty = winch->tty;
 	if (tty != NULL) {
 		line = tty->driver_data;
-		chan_window_size(&line->chan_list, &tty->winsize.ws_row,
-				 &tty->winsize.ws_col);
-		kill_pgrp(tty->pgrp, SIGWINCH, 1);
+		if (line != NULL) {
+			chan_window_size(&line->chan_list, &tty->winsize.ws_row,
+					 &tty->winsize.ws_col);
+			kill_pgrp(tty->pgrp, SIGWINCH, 1);
+		}
 	}
  out:
 	if (winch->fd != -1)
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index fabd75f..c953e14 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -792,6 +792,8 @@ static int __init mconsole_init(void)
 		printk(KERN_ERR "Failed to initialize management console\n");
 		return 1;
 	}
+	if (os_set_fd_block(sock, 0))
+		goto out;
 
 	register_reboot_notifier(&reboot_notifier);
 
@@ -800,7 +802,7 @@ static int __init mconsole_init(void)
 			     "mconsole", (void *)sock);
 	if (err) {
 		printk(KERN_ERR "Failed to get IRQ for management console\n");
-		return 1;
+		goto out;
 	}
 
 	if (notify_socket != NULL) {
@@ -816,6 +818,10 @@ static int __init mconsole_init(void)
 	printk(KERN_INFO "mconsole (version %d) initialized on %s\n",
 	       MCONSOLE_VERSION, mconsole_socket_name);
 	return 0;
+
+ out:
+	os_close_file(sock);
+	return 1;
 }
 
 __initcall(mconsole_init);
-- 
cgit v1.1


From 7281ff952c7b3cbefb14847460e5fe73a2d240e4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:19 -0800
Subject: uml: add back CONFIG_HZ

avoid-overflows-in-kernel-timec.patch makes CONFIG_HZ necessary for a
successful build.  UML lacks a definition, so this patch adds one.  It also
changes the hard-wired definition of HZ to CONFIG_HZ.

Note: this patch is a good idea even in the absence of hpa's time fixes.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Kconfig        | 4 ++++
 include/asm-um/param.h | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index a967d95..99e51d0 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -68,6 +68,10 @@ config IRQ_RELEASE_METHOD
 	bool
 	default y
 
+config HZ
+	int
+	default 100
+
 menu "UML-specific options"
 
 config STATIC_LINK
diff --git a/include/asm-um/param.h b/include/asm-um/param.h
index f914e7d..4cd4a22 100644
--- a/include/asm-um/param.h
+++ b/include/asm-um/param.h
@@ -10,7 +10,7 @@
 #define MAXHOSTNAMELEN  64      /* max length of hostname */
 
 #ifdef __KERNEL__
-#define HZ 100
+#define HZ CONFIG_HZ
 #define USER_HZ	100	   /* .. some user interfaces are in "ticks" */
 #define CLOCKS_PER_SEC (USER_HZ)  /* frequency at which times() counts */
 #endif
-- 
cgit v1.1


From 95906b24fbe4d22e5861f67fe1e8274c7ecfeda1 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:20 -0800
Subject: uml: style fixes in arch/um/sys-x86_64

Style fixes in arch/um/sys-x86_64:
	updated copyrights
	CodingStyle fixes
	added severities to printks which needed them

A bunch of functions in sys-*/ptrace_user.c turn out to be unused, so they and
their declarations are gone.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/ptrace_user.h      | 11 +++------
 arch/um/sys-i386/ptrace_user.c     | 14 ------------
 arch/um/sys-x86_64/bug.c           |  3 ++-
 arch/um/sys-x86_64/ptrace.c        | 29 ++++++++++++------------
 arch/um/sys-x86_64/ptrace_user.c   | 46 +++++---------------------------------
 arch/um/sys-x86_64/syscall_table.c | 31 ++++++++++++++++---------
 arch/um/sys-x86_64/sysrq.c         | 29 ++++++++++++------------
 arch/um/sys-x86_64/um_module.c     |  8 ++++---
 8 files changed, 65 insertions(+), 106 deletions(-)

diff --git a/arch/um/include/ptrace_user.h b/arch/um/include/ptrace_user.h
index f3450e6..4bce6e0 100644
--- a/arch/um/include/ptrace_user.h
+++ b/arch/um/include/ptrace_user.h
@@ -1,5 +1,5 @@
 /* 
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -10,12 +10,6 @@
 
 extern int ptrace_getregs(long pid, unsigned long *regs_out);
 extern int ptrace_setregs(long pid, unsigned long *regs_in);
-extern int ptrace_getfpregs(long pid, unsigned long *regs_out);
-extern int ptrace_setfpregs(long pid, unsigned long *regs);
-extern void arch_enter_kernel(void *task, int pid);
-extern void arch_leave_kernel(void *task, int pid);
-extern void ptrace_pokeuser(unsigned long addr, unsigned long data);
-
 
 /* syscall emulation path in ptrace */
 
@@ -54,7 +48,8 @@ extern int sysemu_supported;
 	(((int[3][3] ) { \
 		{ PTRACE_SYSCALL, PTRACE_SYSCALL, PTRACE_SINGLESTEP }, \
 		{ PTRACE_SYSEMU, PTRACE_SYSEMU, PTRACE_SINGLESTEP }, \
-		{ PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP, PTRACE_SYSEMU_SINGLESTEP }}) \
+		{ PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP, \
+		  PTRACE_SYSEMU_SINGLESTEP } }) \
 		[sysemu_mode][singlestep_mode])
 
 #endif
diff --git a/arch/um/sys-i386/ptrace_user.c b/arch/um/sys-i386/ptrace_user.c
index 5cf97bc..0b10c3e 100644
--- a/arch/um/sys-i386/ptrace_user.c
+++ b/arch/um/sys-i386/ptrace_user.c
@@ -19,17 +19,3 @@ int ptrace_setregs(long pid, unsigned long *regs)
 		return -errno;
 	return 0;
 }
-
-int ptrace_getfpregs(long pid, unsigned long *regs)
-{
-	if (ptrace(PTRACE_GETFPREGS, pid, 0, regs) < 0)
-		return -errno;
-	return 0;
-}
-
-int ptrace_setfpregs(long pid, unsigned long *regs)
-{
-	if (ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0)
-		return -errno;
-	return 0;
-}
diff --git a/arch/um/sys-x86_64/bug.c b/arch/um/sys-x86_64/bug.c
index a4360b5..e8034e3 100644
--- a/arch/um/sys-x86_64/bug.c
+++ b/arch/um/sys-x86_64/bug.c
@@ -5,7 +5,8 @@
 
 #include <linux/uaccess.h>
 
-/* Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because
+/*
+ * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because
  * that's not relevant in skas mode.
  */
 
diff --git a/arch/um/sys-x86_64/ptrace.c b/arch/um/sys-x86_64/ptrace.c
index b7631b0..f3458d7 100644
--- a/arch/um/sys-x86_64/ptrace.c
+++ b/arch/um/sys-x86_64/ptrace.c
@@ -5,13 +5,12 @@
  * Licensed under the GPL
  */
 
-#define __FRAME_OFFSETS
-#include <asm/ptrace.h>
+#include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/mm.h>
+#define __FRAME_OFFSETS
+#include <asm/ptrace.h>
 #include <asm/uaccess.h>
-#include <asm/elf.h>
 
 /*
  * determines which flags the user has access to.
@@ -24,12 +23,14 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
 	unsigned long tmp;
 
 #ifdef TIF_IA32
-	/* Some code in the 64bit emulation may not be 64bit clean.
-	   Don't take any chances. */
+	/*
+	 * Some code in the 64bit emulation may not be 64bit clean.
+	 * Don't take any chances.
+	 */
 	if (test_tsk_thread_flag(child, TIF_IA32))
 		value &= 0xffffffff;
 #endif
-	switch (regno){
+	switch (regno) {
 	case FS:
 	case GS:
 	case DS:
@@ -66,7 +67,7 @@ int poke_user(struct task_struct *child, long addr, long data)
 	if (addr < MAX_REG_OFFSET)
 		return putreg(child, addr, data);
 	else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
-		(addr <= offsetof(struct user, u_debugreg[7]))){
+		(addr <= offsetof(struct user, u_debugreg[7]))) {
 		addr -= offsetof(struct user, u_debugreg[0]);
 		addr = addr >> 2;
 		if ((addr == 4) || (addr == 5))
@@ -108,11 +109,10 @@ int peek_user(struct task_struct *child, long addr, long data)
 		return -EIO;
 
 	tmp = 0;  /* Default return condition */
-	if (addr < MAX_REG_OFFSET){
+	if (addr < MAX_REG_OFFSET)
 		tmp = getreg(child, addr);
-	}
 	else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
-		(addr <= offsetof(struct user, u_debugreg[7]))){
+		(addr <= offsetof(struct user, u_debugreg[7]))) {
 		addr -= offsetof(struct user, u_debugreg[0]);
 		addr = addr >> 2;
 		tmp = child->thread.arch.debugregs[addr];
@@ -127,8 +127,9 @@ int is_syscall(unsigned long addr)
 	int n;
 
 	n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
-	if (n){
-		/* access_process_vm() grants access to vsyscall and stub,
+	if (n) {
+		/*
+		 * access_process_vm() grants access to vsyscall and stub,
 		 * while copy_from_user doesn't. Maybe access_process_vm is
 		 * slow, but that doesn't matter, since it will be called only
 		 * in case of singlestepping, if copy_from_user failed.
@@ -155,7 +156,7 @@ int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
 		return err;
 
 	n = copy_to_user(buf, fpregs, sizeof(fpregs));
-	if(n > 0)
+	if (n > 0)
 		return -EFAULT;
 
 	return n;
diff --git a/arch/um/sys-x86_64/ptrace_user.c b/arch/um/sys-x86_64/ptrace_user.c
index b5f9c33..c57a496 100644
--- a/arch/um/sys-x86_64/ptrace_user.c
+++ b/arch/um/sys-x86_64/ptrace_user.c
@@ -4,55 +4,19 @@
  * Licensed under the GPL
  */
 
-#include <stddef.h>
 #include <errno.h>
 #include "ptrace_user.h"
-#include "user.h"
-#include "kern_constants.h"
 
 int ptrace_getregs(long pid, unsigned long *regs_out)
 {
-	if(ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0)
-		return(-errno);
-	return(0);
-}
-
-int ptrace_setregs(long pid, unsigned long *regs)
-{
-	if(ptrace(PTRACE_SETREGS, pid, 0, regs) < 0)
-		return(-errno);
+	if (ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0)
+		return -errno;
 	return(0);
 }
 
-int ptrace_setfpregs(long pid, unsigned long *regs)
+int ptrace_setregs(long pid, unsigned long *regs_out)
 {
-	if (ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0)
+	if (ptrace(PTRACE_SETREGS, pid, 0, regs_out) < 0)
 		return -errno;
-	return 0;
-}
-
-void ptrace_pokeuser(unsigned long addr, unsigned long data)
-{
-	panic("ptrace_pokeuser");
-}
-
-#define DS 184
-#define ES 192
-#define __USER_DS     0x2b
-
-void arch_enter_kernel(void *task, int pid)
-{
-}
-
-void arch_leave_kernel(void *task, int pid)
-{
-#ifdef UM_USER_CS
-        if(ptrace(PTRACE_POKEUSR, pid, CS, UM_USER_CS) < 0)
-                printk("POKEUSR CS failed");
-#endif
-
-        if(ptrace(PTRACE_POKEUSR, pid, DS, __USER_DS) < 0)
-                printk("POKEUSR DS failed");
-        if(ptrace(PTRACE_POKEUSR, pid, ES, __USER_DS) < 0)
-                printk("POKEUSR ES failed");
+	return(0);
 }
diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c
index 71b2ae4..555faee 100644
--- a/arch/um/sys-x86_64/syscall_table.c
+++ b/arch/um/sys-x86_64/syscall_table.c
@@ -1,5 +1,7 @@
-/* System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
- * with some changes for UML. */
+/*
+ * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
+ * with some changes for UML.
+ */
 
 #include <linux/linkage.h>
 #include <linux/sys.h>
@@ -8,22 +10,26 @@
 
 #define __NO_STUBS
 
-/* Below you can see, in terms of #define's, the differences between the x86-64
- * and the UML syscall table. */
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
 
 /* Not going to be implemented by UML, since we have no hardware. */
 #define stub_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
-/* The UML TLS problem. Note that x86_64 does not implement this, so the below
- * is needed only for the ia32 compatibility. */
-/*#define sys_set_thread_area sys_ni_syscall
-#define sys_get_thread_area sys_ni_syscall*/
+/*
+ * The UML TLS problem. Note that x86_64 does not implement this, so the below
+ * is needed only for the ia32 compatibility.
+ */
 
 /* On UML we call it this way ("old" means it's not mmap2) */
 #define sys_mmap old_mmap
-/* On x86-64 sys_uname is actually sys_newuname plus a compatibility trick.
- * See arch/x86_64/kernel/sys_x86_64.c */
+/*
+ * On x86-64 sys_uname is actually sys_newuname plus a compatibility trick.
+ * See arch/x86_64/kernel/sys_x86_64.c
+ */
 #define sys_uname sys_uname64
 
 #define stub_clone sys_clone
@@ -47,7 +53,10 @@ typedef void (*sys_call_ptr_t)(void);
 extern void sys_ni_syscall(void);
 
 sys_call_ptr_t sys_call_table[UM_NR_syscall_max+1] __cacheline_aligned = {
-	/* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work when the &
+	 * below is removed.
+	 */
 	[0 ... UM_NR_syscall_max] = &sys_ni_syscall,
 #include <asm-x86/unistd_64.h>
 };
diff --git a/arch/um/sys-x86_64/sysrq.c b/arch/um/sys-x86_64/sysrq.c
index 7654440..f4f82be 100644
--- a/arch/um/sys-x86_64/sysrq.c
+++ b/arch/um/sys-x86_64/sysrq.c
@@ -4,32 +4,33 @@
  * Licensed under the GPL
  */
 
-#include "linux/kernel.h"
-#include "linux/utsname.h"
-#include "linux/module.h"
-#include "asm/current.h"
-#include "asm/ptrace.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/utsname.h>
+#include <asm/current.h>
+#include <asm/ptrace.h>
 #include "sysrq.h"
 
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs)
 {
 	printk("\n");
 	print_modules();
-	printk("Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current),
+	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current),
 		current->comm, print_tainted(), init_utsname()->release);
-	printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff,
+	printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff,
 	       PT_REGS_RIP(regs));
-	printk("\nRSP: %016lx  EFLAGS: %08lx\n", PT_REGS_RSP(regs),
+	printk(KERN_INFO "RSP: %016lx  EFLAGS: %08lx\n", PT_REGS_RSP(regs),
 	       PT_REGS_EFLAGS(regs));
-	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs));
-	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
 	       PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs));
-	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
 	       PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs));
-	printk("R10: %016lx R11: %016lx R12: %016lx\n",
+	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
 	       PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs));
-	printk("R13: %016lx R14: %016lx R15: %016lx\n",
+	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
 	       PT_REGS_R13(regs), PT_REGS_R14(regs), PT_REGS_R15(regs));
 }
 
diff --git a/arch/um/sys-x86_64/um_module.c b/arch/um/sys-x86_64/um_module.c
index 8b8eff1..3dead39 100644
--- a/arch/um/sys-x86_64/um_module.c
+++ b/arch/um/sys-x86_64/um_module.c
@@ -1,7 +1,7 @@
 #include <linux/vmalloc.h>
 #include <linux/moduleloader.h>
 
-/*Copied from i386 arch/i386/kernel/module.c */
+/* Copied from i386 arch/i386/kernel/module.c */
 void *module_alloc(unsigned long size)
 {
 	if (size == 0)
@@ -13,7 +13,9 @@ void *module_alloc(unsigned long size)
 void module_free(struct module *mod, void *module_region)
 {
 	vfree(module_region);
-	/* FIXME: If module_region == mod->init_region, trim exception
-           table entries. */
+	/*
+	 * FIXME: If module_region == mod->init_region, trim exception
+	 * table entries.
+	 */
 }
 
-- 
cgit v1.1


From b54988325c4cbf8bd92c0def53387ab6516d0920 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:21 -0800
Subject: uml: add newlines to printks

Some printks were missing newlines.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/skas/process.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 8ab2f5c..d36c89c 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -273,7 +273,7 @@ int start_userspace(unsigned long stub_stack)
 	if (stack == MAP_FAILED) {
 		err = -errno;
 		printk(UM_KERN_ERR "start_userspace : mmap failed, "
-		       "errno = %d", errno);
+		       "errno = %d\n", errno);
 		return err;
 	}
 
@@ -289,7 +289,7 @@ int start_userspace(unsigned long stub_stack)
 	if (pid < 0) {
 		err = -errno;
 		printk(UM_KERN_ERR "start_userspace : clone failed, "
-		       "errno = %d", errno);
+		       "errno = %d\n", errno);
 		return err;
 	}
 
@@ -298,7 +298,7 @@ int start_userspace(unsigned long stub_stack)
 		if (n < 0) {
 			err = -errno;
 			printk(UM_KERN_ERR "start_userspace : wait failed, "
-			       "errno = %d", errno);
+			       "errno = %d\n", errno);
 			goto out_kill;
 		}
 	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM));
@@ -306,7 +306,7 @@ int start_userspace(unsigned long stub_stack)
 	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
 		err = -EINVAL;
 		printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got "
-		       "status = %d", status);
+		       "status = %d\n", status);
 		goto out_kill;
 	}
 
-- 
cgit v1.1


From 576c013df0ac9ad1f217452c14ddde246bb1a70d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:22 -0800
Subject: uml: move register initialization

Calling init_registers inside the skas3 checking causes mysterious crashes if
it doesn't happen because the skas3 checking is bypassed.  This patch moves it
to os_early_checks.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/start_up.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index bcf0c9b..b616e15 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -342,6 +342,8 @@ static void __init check_coredump_limit(void)
 
 void __init os_early_checks(void)
 {
+	int pid;
+
 	/* Print out the core dump limits early */
 	check_coredump_limit();
 
@@ -351,6 +353,11 @@ void __init os_early_checks(void)
 	 * kernel is running.
 	 */
 	check_tmpexec();
+
+	pid = start_ptraced_child();
+	if (init_registers(pid))
+		fatal("Failed to initialize default registers");
+	stop_ptraced_child(pid, 1, 1);
 }
 
 static int __init noprocmm_cmd_param(char *str, int* add)
@@ -412,9 +419,6 @@ static inline void check_skas3_ptrace_faultinfo(void)
 			non_fatal("found\n");
 	}
 
-	if (init_registers(pid))
-		fatal("Failed to initialize default registers");
-
 	stop_ptraced_child(pid, 1, 1);
 }
 
-- 
cgit v1.1


From d449c5036778dfa00374c55c9c9f02bd45574c58 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:22 -0800
Subject: uml: remove unused fields from mm_context

The 3-level page table fixes forgot to remove a couple now-unused fields from
struct mm_context.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/um_mmu.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/um/include/um_mmu.h b/arch/um/include/um_mmu.h
index 8855d8d..82865fc 100644
--- a/arch/um/include/um_mmu.h
+++ b/arch/um/include/um_mmu.h
@@ -12,10 +12,6 @@
 
 typedef struct mm_context {
 	struct mm_id id;
-	unsigned long last_page_table;
-#ifdef CONFIG_3_LEVEL_PGTABLES
-	unsigned long last_pmd;
-#endif
 	struct uml_ldt ldt;
 } mm_context_t;
 
-- 
cgit v1.1


From 47afa1d5f826606def7c498e93ec79a905042c56 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 4 Feb 2008 22:31:23 -0800
Subject: uml: remove TOPDIR

TOPDIR is obsolete, use srctree instead.  This patch removes TOPDIR from all
UML Makefiles.

Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/Makefile         |  4 ++--
 arch/um/sys-ppc/Makefile | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/um/Makefile b/arch/um/Makefile
index fb8854a..cb4af9b 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -160,7 +160,7 @@ ifneq ($(KBUILD_SRC),)
 	$(Q)mkdir -p $(objtree)/include/asm-um
 	$(Q)ln -fsn $(srctree)/include/asm-um/$(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $@
 else
-	$(Q)cd $(TOPDIR)/$(dir $@) ; \
+	$(Q)cd $(srctree)/$(dir $@) ; \
 	ln -sf $(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $(notdir $@)
 endif
 
@@ -170,7 +170,7 @@ ifneq ($(KBUILD_SRC),)
 	$(Q)mkdir -p $(objtree)/include/asm-um
 	$(Q)ln -fsn $(srctree)/include/asm-$(HEADER_ARCH) include/asm-um/arch
 else
-	$(Q)cd $(TOPDIR)/include/asm-um && ln -fsn ../asm-$(HEADER_ARCH) arch
+	$(Q)cd $(srctree)/include/asm-um && ln -fsn ../asm-$(HEADER_ARCH) arch
 endif
 
 $(objtree)/$(ARCH_DIR)/include:
diff --git a/arch/um/sys-ppc/Makefile b/arch/um/sys-ppc/Makefile
index a9814a7..0890152 100644
--- a/arch/um/sys-ppc/Makefile
+++ b/arch/um/sys-ppc/Makefile
@@ -6,7 +6,7 @@ OBJ = built-in.o
 OBJS = ptrace.o sigcontext.o semaphore.o checksum.o miscthings.o misc.o \
 	ptrace_user.o sysrq.o
 
-EXTRA_AFLAGS := -DCONFIG_PPC32 -I. -I$(TOPDIR)/arch/ppc/kernel
+EXTRA_AFLAGS := -DCONFIG_PPC32 -I. -I$(srctree)/arch/ppc/kernel
 
 all: $(OBJ)
 
@@ -22,25 +22,25 @@ sigcontext.o: sigcontext.c
 
 semaphore.c:
 	rm -f $@
-	ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
+	ln -s $(srctree)/arch/ppc/kernel/$@ $@
 
 checksum.S:
 	rm -f $@
-	ln -s $(TOPDIR)/arch/ppc/lib/$@ $@
+	ln -s $(srctree)/arch/ppc/lib/$@ $@
 
 mk_defs.c:
 	rm -f $@
-	ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
+	ln -s $(srctree)/arch/ppc/kernel/$@ $@
 
 ppc_defs.head:
 	rm -f $@
-	ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
+	ln -s $(srctree)/arch/ppc/kernel/$@ $@
 
 ppc_defs.h: mk_defs.c ppc_defs.head \
-		$(TOPDIR)/include/asm-ppc/mmu.h \
-		$(TOPDIR)/include/asm-ppc/processor.h \
-		$(TOPDIR)/include/asm-ppc/pgtable.h \
-		$(TOPDIR)/include/asm-ppc/ptrace.h
+		$(srctree)/include/asm-ppc/mmu.h \
+		$(srctree)/include/asm-ppc/processor.h \
+		$(srctree)/include/asm-ppc/pgtable.h \
+		$(srctree)/include/asm-ppc/ptrace.h
 #	$(CC) $(CFLAGS) -S mk_defs.c
 	cp ppc_defs.head ppc_defs.h
 # for bk, this way we can write to the file even if it's not checked out
@@ -56,13 +56,13 @@ ppc_defs.h: mk_defs.c ppc_defs.head \
 
 checksum.o: checksum.S
 	rm -f asm
-	ln -s $(TOPDIR)/include/asm-ppc asm
+	ln -s $(srctree)/include/asm-ppc asm
 	$(CC) $(EXTRA_AFLAGS) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o
 	rm -f asm
 
 misc.o: misc.S ppc_defs.h
 	rm -f asm
-	ln -s $(TOPDIR)/include/asm-ppc asm
+	ln -s $(srctree)/include/asm-ppc asm
 	$(CC) $(EXTRA_AFLAGS) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o
 	rm -f asm
 
-- 
cgit v1.1


From ab26a5276c1b0945c3281a73b3a89d025906c880 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:24 -0800
Subject: uml: remove map_cb

John Reiser noticed that a physical memory region was being mapped twice.

This patch fixes that, and it inlines the responsible function, as that had
only one caller.

Cc: John Reiser <jreiser@BitWagon.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/kernel/mem.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index d948bab..d872fdc 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -36,11 +36,6 @@ int kmalloc_ok = 0;
 /* Used during early boot */
 static unsigned long brk_end;
 
-static void map_cb(void *unused)
-{
-	map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
-}
-
 #ifdef CONFIG_HIGHMEM
 static void setup_highmem(unsigned long highmem_start,
 			  unsigned long highmem_len)
@@ -68,8 +63,7 @@ void __init mem_init(void)
 	 * to be turned on.
 	 */
 	brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
-	map_cb(NULL);
-	initial_thread_cb(map_cb, NULL);
+	map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
 	free_bootmem(__pa(brk_end), uml_reserved - brk_end);
 	uml_reserved = brk_end;
 
-- 
cgit v1.1


From cc0be0fb3fd4bd2c363ef1b5c968cd6f2ce478cf Mon Sep 17 00:00:00 2001
From: Karol Swietlicki <magotari@gmail.com>
Date: Mon, 4 Feb 2008 22:31:25 -0800
Subject: uml: fix infinite mconsole loop

This patch takes care of a problem with the stopping code.

The function inside the while condition returns 0 to signify a problem.  A
problem could be for example a bad command or a bad version of the mconsole
client.  A bad command would terminate the stopping loop and resume the
kernel.  This is a problem.

A better solution is to make the loop infinite and don't leave it until we are
explicitly told to.

Signed-off-by: Karol Swietlicki <magotari@gmail.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/mconsole_kern.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index c953e14..949037e 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -305,7 +305,9 @@ void mconsole_stop(struct mc_request *req)
 	deactivate_fd(req->originating_fd, MCONSOLE_IRQ);
 	os_set_fd_block(req->originating_fd, 1);
 	mconsole_reply(req, "stopped", 0, 0);
-	while (mconsole_get_request(req->originating_fd, req)) {
+	for (;;) {
+		if (!mconsole_get_request(req->originating_fd, req))
+			continue;
 		if (req->cmd->handler == mconsole_go)
 			break;
 		if (req->cmd->handler == mconsole_stop) {
-- 
cgit v1.1


From 2278c5ac9d39699bac44250b9c532de0c02cb16a Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:25 -0800
Subject: uml: use of a public MAC is a warning, not an error

Downgrade one of the MAC validity checks.  If it's one that could be possibly
assigned to a physical NIC, then nothing will break.  So, emit a warning in
this case, but keep the requested MAC.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/net_kern.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index ca71577..1e8f41a 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -318,7 +318,7 @@ static void setup_etheraddr(char *str, unsigned char *addr, char *name)
 	if (str == NULL)
 		goto random;
 
-	for (i = 0;i < 6; i++) {
+	for (i = 0; i < 6; i++) {
 		addr[i] = simple_strtoul(str, &end, 16);
 		if ((end == str) ||
 		   ((*end != ':') && (*end != ',') && (*end != '\0'))) {
@@ -343,14 +343,13 @@ static void setup_etheraddr(char *str, unsigned char *addr, char *name)
 	}
 	if (!is_local_ether_addr(addr)) {
 		printk(KERN_WARNING
-		       "Warning: attempt to assign a globally valid ethernet "
+		       "Warning: Assigning a globally valid ethernet "
 		       "address to a device\n");
-		printk(KERN_WARNING "You should better enable the 2nd "
-		       "rightmost bit in the first byte of the MAC,\n");
+		printk(KERN_WARNING "You should set the 2nd rightmost bit in "
+		       "the first byte of the MAC,\n");
 		printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n",
 		       addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4],
 		       addr[5]);
-		goto random;
 	}
 	return;
 
-- 
cgit v1.1


From 01ac835fdd121f36dded404af15225101f6ccee3 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Mon, 4 Feb 2008 22:31:26 -0800
Subject: uml: LDT mutex conversion

The ldt.semaphore conforms to the new struct mutex requirments, so I converted
it to use the new API and changed the name.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/sys-i386/ldt.c | 14 +++++++-------
 include/asm-um/ldt.h   |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/um/sys-i386/ldt.c b/arch/um/sys-i386/ldt.c
index 505ed5c..a34263e 100644
--- a/arch/um/sys-i386/ldt.c
+++ b/arch/um/sys-i386/ldt.c
@@ -147,7 +147,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
 	if (ptrace_ldt)
 		return read_ldt_from_host(ptr, bytecount);
 
-	down(&ldt->semaphore);
+	mutex_lock(&ldt->lock);
 	if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
 		size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
 		if (size > bytecount)
@@ -171,7 +171,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
 			ptr += size;
 		}
 	}
-	up(&ldt->semaphore);
+	mutex_unlock(&ldt->lock);
 
 	if (bytecount == 0 || err == -EFAULT)
 		goto out;
@@ -229,7 +229,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
 	}
 
 	if (!ptrace_ldt)
-		down(&ldt->semaphore);
+		mutex_lock(&ldt->lock);
 
 	err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
 	if (err)
@@ -289,7 +289,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
 	err = 0;
 
 out_unlock:
-	up(&ldt->semaphore);
+	mutex_unlock(&ldt->lock);
 out:
 	return err;
 }
@@ -396,7 +396,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 
 
 	if (!ptrace_ldt)
-		init_MUTEX(&new_mm->ldt.semaphore);
+		mutex_init(&new_mm->ldt.lock);
 
 	if (!from_mm) {
 		memset(&desc, 0, sizeof(desc));
@@ -456,7 +456,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 		 * i.e., we have to use the stub for modify_ldt, which
 		 * can't handle the big read buffer of up to 64kB.
 		 */
-		down(&from_mm->ldt.semaphore);
+		mutex_lock(&from_mm->ldt.lock);
 		if (from_mm->ldt.entry_count <= LDT_DIRECT_ENTRIES)
 			memcpy(new_mm->ldt.u.entries, from_mm->ldt.u.entries,
 			       sizeof(new_mm->ldt.u.entries));
@@ -475,7 +475,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 			}
 		}
 		new_mm->ldt.entry_count = from_mm->ldt.entry_count;
-		up(&from_mm->ldt.semaphore);
+		mutex_unlock(&from_mm->ldt.lock);
 	}
 
     out:
diff --git a/include/asm-um/ldt.h b/include/asm-um/ldt.h
index b2553f3..52af512 100644
--- a/include/asm-um/ldt.h
+++ b/include/asm-um/ldt.h
@@ -8,7 +8,7 @@
 #ifndef __ASM_LDT_H
 #define __ASM_LDT_H
 
-#include "asm/semaphore.h"
+#include <linux/mutex.h>
 #include "asm/host_ldt.h"
 
 extern void ldt_host_info(void);
@@ -27,7 +27,7 @@ struct ldt_entry {
 
 typedef struct uml_ldt {
 	int entry_count;
-	struct semaphore semaphore;
+	struct mutex lock;
 	union {
 		struct ldt_entry * pages[LDT_PAGES_MAX];
 		struct ldt_entry entries[LDT_DIRECT_ENTRIES];
-- 
cgit v1.1


From e98fa28160eabdcda4c4c5bf7af7a3256c10c922 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Mon, 4 Feb 2008 22:31:27 -0800
Subject: uml: mconsole mutex conversion

The plug_mem_mutex is already used as a mutex since it's using
DECLARE_MUTEX(), but the underlying construct is still a semaphore ..  This
patch switches it over to a struct mutex.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/mconsole_kern.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 949037e..9820fdb 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -17,6 +17,7 @@
 #include "linux/syscalls.h"
 #include "linux/utsname.h"
 #include "linux/workqueue.h"
+#include "linux/mutex.h"
 #include "asm/uaccess.h"
 #include "init.h"
 #include "irq_kern.h"
@@ -360,7 +361,7 @@ struct unplugged_pages {
 	void *pages[UNPLUGGED_PER_PAGE];
 };
 
-static DECLARE_MUTEX(plug_mem_mutex);
+static DEFINE_MUTEX(plug_mem_mutex);
 static unsigned long long unplugged_pages_count = 0;
 static LIST_HEAD(unplugged_pages);
 static int unplug_index = UNPLUGGED_PER_PAGE;
@@ -396,7 +397,7 @@ static int mem_config(char *str, char **error_out)
 
 	diff /= PAGE_SIZE;
 
-	down(&plug_mem_mutex);
+	mutex_lock(&plug_mem_mutex);
 	for (i = 0; i < diff; i++) {
 		struct unplugged_pages *unplugged;
 		void *addr;
@@ -453,7 +454,7 @@ static int mem_config(char *str, char **error_out)
 
 	err = 0;
 out_unlock:
-	up(&plug_mem_mutex);
+	mutex_unlock(&plug_mem_mutex);
 out:
 	return err;
 }
-- 
cgit v1.1


From 2aa9c5db8e1eadf12a6c938dbd3e39ba6b923b8c Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Mon, 4 Feb 2008 22:31:27 -0800
Subject: uml: port mutex conversion

The port_sem is already used as a mutex since it's using DECLARE_MUTEX(), but
the underlying construct is still a semaphore ..  This patch switches it over
to a struct mutex.

Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/port_kern.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 330543b..1993008 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -6,6 +6,7 @@
 #include "linux/completion.h"
 #include "linux/interrupt.h"
 #include "linux/list.h"
+#include "linux/mutex.h"
 #include "asm/atomic.h"
 #include "init.h"
 #include "irq_kern.h"
@@ -120,7 +121,7 @@ static int port_accept(struct port_list *port)
 	return 0;
 }
 
-static DECLARE_MUTEX(ports_sem);
+static DEFINE_MUTEX(ports_mutex);
 static LIST_HEAD(ports);
 
 static void port_work_proc(struct work_struct *unused)
@@ -161,7 +162,7 @@ void *port_data(int port_num)
 	struct port_dev *dev = NULL;
 	int fd;
 
-	down(&ports_sem);
+	mutex_lock(&ports_mutex);
 	list_for_each(ele, &ports) {
 		port = list_entry(ele, struct port_list, list);
 		if (port->port == port_num)
@@ -216,7 +217,7 @@ void *port_data(int port_num)
  out_free:
 	kfree(port);
  out:
-	up(&ports_sem);
+	mutex_unlock(&ports_mutex);
 	return dev;
 }
 
-- 
cgit v1.1


From 966f1d8f344bcec3db7d774a4ba3ab0dedfad873 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:28 -0800
Subject: uml: defconfig tweaks

Tweak the UML defconfig -
      we probably don't need 256 old-style ptys - this slows down udev
noticably
      enable hostfs
      disable slab debugging - another noticable performance hit

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/defconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/um/defconfig b/arch/um/defconfig
index f609ede..86db286 100644
--- a/arch/um/defconfig
+++ b/arch/um/defconfig
@@ -77,7 +77,7 @@ CONFIG_LD_SCRIPT_DYN=y
 CONFIG_NET=y
 CONFIG_BINFMT_ELF=y
 CONFIG_BINFMT_MISC=m
-# CONFIG_HOSTFS is not set
+CONFIG_HOSTFS=y
 # CONFIG_HPPFS is not set
 CONFIG_MCONSOLE=y
 CONFIG_MAGIC_SYSRQ=y
@@ -188,7 +188,7 @@ CONFIG_CON_CHAN="xterm"
 CONFIG_SSL_CHAN="pts"
 CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
-CONFIG_LEGACY_PTY_COUNT=256
+CONFIG_LEGACY_PTY_COUNT=32
 # CONFIG_WATCHDOG is not set
 CONFIG_UML_SOUND=m
 CONFIG_SOUND=m
@@ -508,7 +508,7 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_SCHEDSTATS is not set
-CONFIG_DEBUG_SLAB=y
+# CONFIG_DEBUG_SLAB is not set
 # CONFIG_DEBUG_SLAB_LEAK is not set
 # CONFIG_DEBUG_MUTEXES is not set
 # CONFIG_DEBUG_SPINLOCK is not set
-- 
cgit v1.1


From f87ea91d988637b3bbf6aa2d281c6010e7d5f48d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:29 -0800
Subject: uml: redo the calculation of NR_syscalls

Redo the calculation of NR_syscalls since that disappeared from i386 and
use a similar mechanism on x86_64.

We now figure out the size of the system call table in arch code and stick
that in syscall_table_size.  arch/um/kernel/skas/syscall.c defines
NR_syscalls in terms of that since its the only thing that needs to know
how many system calls there are.

The old mechananism that was used on x86_64 is gone.

arch/um/include/sysdep-i386/syscalls.h got some formatting since I was
looking at it.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/include/sysdep-i386/syscalls.h         |  5 +++--
 arch/um/include/sysdep-x86_64/kernel-offsets.h |  9 ---------
 arch/um/include/sysdep-x86_64/syscalls.h       |  2 --
 arch/um/kernel/skas/syscall.c                  |  3 +++
 arch/um/sys-i386/sys_call_table.S              |  5 +++++
 arch/um/sys-x86_64/syscall_table.c             | 20 ++++++++++++++------
 6 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/arch/um/include/sysdep-i386/syscalls.h b/arch/um/include/sysdep-i386/syscalls.h
index 57bd79e..9056981 100644
--- a/arch/um/include/sysdep-i386/syscalls.h
+++ b/arch/um/include/sysdep-i386/syscalls.h
@@ -1,5 +1,5 @@
 /* 
- * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -18,7 +18,8 @@ extern syscall_handler_t old_mmap_i386;
 extern syscall_handler_t *sys_call_table[];
 
 #define EXECUTE_SYSCALL(syscall, regs) \
-	((long (*)(struct syscall_args)) (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
+	((long (*)(struct syscall_args)) \
+	 (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
 
 extern long sys_mmap2(unsigned long addr, unsigned long len,
 		      unsigned long prot, unsigned long flags,
diff --git a/arch/um/include/sysdep-x86_64/kernel-offsets.h b/arch/um/include/sysdep-x86_64/kernel-offsets.h
index c978b58..a307237 100644
--- a/arch/um/include/sysdep-x86_64/kernel-offsets.h
+++ b/arch/um/include/sysdep-x86_64/kernel-offsets.h
@@ -17,16 +17,7 @@
 #define OFFSET(sym, str, mem) \
 	DEFINE(sym, offsetof(struct str, mem));
 
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/arch/unistd.h>
-};
-
 void foo(void)
 {
 #include <common-offsets.h>
-DEFINE(UM_NR_syscall_max, sizeof(syscalls) - 1);
 }
diff --git a/arch/um/include/sysdep-x86_64/syscalls.h b/arch/um/include/sysdep-x86_64/syscalls.h
index cf72256..7cfb0b08 100644
--- a/arch/um/include/sysdep-x86_64/syscalls.h
+++ b/arch/um/include/sysdep-x86_64/syscalls.h
@@ -30,6 +30,4 @@ extern long old_mmap(unsigned long addr, unsigned long len,
 extern syscall_handler_t sys_modify_ldt;
 extern syscall_handler_t sys_arch_prctl;
 
-#define NR_syscalls (UM_NR_syscall_max + 1)
-
 #endif
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index 6450f02..4e3b820 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -9,6 +9,9 @@
 #include "sysdep/ptrace.h"
 #include "sysdep/syscalls.h"
 
+extern int syscall_table_size;
+#define NR_syscalls (syscall_table_size / sizeof(void *))
+
 void handle_syscall(struct uml_pt_regs *r)
 {
 	struct pt_regs *regs = container_of(r, struct pt_regs, regs);
diff --git a/arch/um/sys-i386/sys_call_table.S b/arch/um/sys-i386/sys_call_table.S
index 12d4148..00e5f520 100644
--- a/arch/um/sys-i386/sys_call_table.S
+++ b/arch/um/sys-i386/sys_call_table.S
@@ -9,4 +9,9 @@
 
 #define old_mmap old_mmap_i386
 
+.section .rodata,"a"
+
 #include "../../x86/kernel/syscall_table_32.S"
+
+ENTRY(syscall_table_size)
+.long .-sys_call_table
diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c
index 555faee..c128eb8 100644
--- a/arch/um/sys-x86_64/syscall_table.c
+++ b/arch/um/sys-x86_64/syscall_table.c
@@ -52,11 +52,19 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[UM_NR_syscall_max+1] __cacheline_aligned = {
-	/*
-	 * Smells like a like a compiler bug -- it doesn't work when the &
-	 * below is removed.
-	 */
-	[0 ... UM_NR_syscall_max] = &sys_ni_syscall,
+/*
+ * We used to have a trick here which made sure that holes in the
+ * x86_64 table were filled in with sys_ni_syscall, but a comment in
+ * unistd_64.h says that holes aren't allowed, so the trick was
+ * removed.
+ * The trick looked like this
+ *	[0 ... UM_NR_syscall_max] = &sys_ni_syscall
+ * before including unistd_64.h - the later initializations overwrote
+ * the sys_ni_syscall filler.
+ */
+
+sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 #include <asm-x86/unistd_64.h>
 };
+
+int syscall_table_size = sizeof(sys_call_table);
-- 
cgit v1.1


From 827b3f6abc21246928e38480bb308936701a2ad4 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 4 Feb 2008 22:31:29 -0800
Subject: uml: make mconsole_stack namespace-aware

Also fixed the include syntax while I was there.

Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/mconsole_kern.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 9820fdb..ebb265c 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -1,24 +1,25 @@
 /*
  * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org)
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
-#include "linux/console.h"
-#include "linux/ctype.h"
-#include "linux/interrupt.h"
-#include "linux/list.h"
-#include "linux/mm.h"
-#include "linux/module.h"
-#include "linux/notifier.h"
-#include "linux/reboot.h"
-#include "linux/proc_fs.h"
-#include "linux/slab.h"
-#include "linux/syscalls.h"
-#include "linux/utsname.h"
-#include "linux/workqueue.h"
-#include "linux/mutex.h"
-#include "asm/uaccess.h"
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/utsname.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+
 #include "init.h"
 #include "irq_kern.h"
 #include "irq_user.h"
@@ -765,7 +766,7 @@ void mconsole_stack(struct mc_request *req)
 		return;
 	}
 
-	to = find_task_by_pid(pid_requested);
+	to = find_task_by_pid_ns(pid_requested, &init_pid_ns);
 	if ((to == NULL) || (pid_requested == 0)) {
 		mconsole_reply(req, "Couldn't find that pid", 1, 0);
 		return;
-- 
cgit v1.1


From 8cb2a7c1e95e472b5ad8cbde4d5c7bb65c532603 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 5 Feb 2008 22:26:01 +0000
Subject: stop c_p_a corrupting the pds

When change_page_attr splits a large page on x86_32 (without PAE), it is
currently corrupting every process's page directory: fix that by removing
the thinko which passes down a physical instead of a virtual address.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pageattr.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb55a78..16ce841 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -237,7 +237,6 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	if (!SHARED_KERNEL_PMD) {
 		struct page *page;
 
-		address = __pa(address);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			pud_t *pud;
-- 
cgit v1.1


From 46a56c5a02430f80ef73357aa1295875c1cef2a7 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Tue, 5 Feb 2008 14:22:55 -0800
Subject: Fix timerfd breakage on avr32

Hmm. Someone removed the timerfd() syscall...

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/avr32/kernel/syscall_table.S | 2 +-
 include/asm-avr32/unistd.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index 75c81f2..478bda4 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -293,6 +293,6 @@ sys_call_table:
 	.long	sys_shmctl
 	.long	sys_utimensat
 	.long	sys_signalfd
-	.long	sys_timerfd		/* 280 */
+	.long	sys_ni_syscall		/* 280, was sys_timerfd */
 	.long	sys_eventfd
 	.long	sys_ni_syscall		/* r8 is saturated at nr_syscalls */
diff --git a/include/asm-avr32/unistd.h b/include/asm-avr32/unistd.h
index de09009..89861a2 100644
--- a/include/asm-avr32/unistd.h
+++ b/include/asm-avr32/unistd.h
@@ -297,7 +297,7 @@
 
 #define __NR_utimensat		278
 #define __NR_signalfd		279
-#define __NR_timerfd		280
+/* 280 was __NR_timerfd */
 #define __NR_eventfd		281
 
 #ifdef __KERNEL__
-- 
cgit v1.1


From 9692bd9c140618e3f6a2848900aee96c9cd8a65c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 5 Feb 2008 14:22:56 -0800
Subject: timerfd: fix remaining architectures

Cc: David Howells <dhowells@redhat.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/entry.S          | 2 +-
 arch/m32r/kernel/syscall_table.S | 2 +-
 arch/sh/kernel/syscalls_32.S     | 2 +-
 arch/sh/kernel/syscalls_64.S     | 2 +-
 include/asm-frv/unistd.h         | 2 +-
 include/asm-m32r/unistd.h        | 2 +-
 include/asm-sh/unistd_32.h       | 2 +-
 include/asm-sh/unistd_64.h       | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 99046b1..ca6a345 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1494,7 +1494,7 @@ sys_call_table:
 	.long sys_epoll_pwait
 	.long sys_utimensat		/* 320 */
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate
 
diff --git a/arch/m32r/kernel/syscall_table.S b/arch/m32r/kernel/syscall_table.S
index 95aa798..aa3bf4c 100644
--- a/arch/m32r/kernel/syscall_table.S
+++ b/arch/m32r/kernel/syscall_table.S
@@ -321,6 +321,6 @@ ENTRY(sys_call_table)
 	.long sys_epoll_pwait
 	.long sys_utimensat		/* 320 */
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 10bec45..719e127 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -338,6 +338,6 @@ ENTRY(sys_call_table)
 	.long sys_epoll_pwait
 	.long sys_utimensat		/* 320 */
 	.long sys_signalfd
-	.long sys_timerfd
+	.long sys_ni_syscall
 	.long sys_eventfd
 	.long sys_fallocate
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index 98a93ef..12c7340 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -376,6 +376,6 @@ sys_call_table:
 	.long sys_epoll_pwait
 	.long sys_utimensat
 	.long sys_signalfd
-	.long sys_timerfd		/* 350 */
+	.long sys_ni_syscall		/* 350 */
 	.long sys_eventfd
 	.long sys_fallocate
diff --git a/include/asm-frv/unistd.h b/include/asm-frv/unistd.h
index cd84f17..e8c9866 100644
--- a/include/asm-frv/unistd.h
+++ b/include/asm-frv/unistd.h
@@ -328,7 +328,7 @@
 #define __NR_epoll_pwait	319
 #define __NR_utimensat		320
 #define __NR_signalfd		321
-#define __NR_timerfd		322
+/* #define __NR_timerfd		322 removed */
 #define __NR_eventfd		323
 #define __NR_fallocate		324
 
diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h
index f467eac..cf701c9 100644
--- a/include/asm-m32r/unistd.h
+++ b/include/asm-m32r/unistd.h
@@ -327,7 +327,7 @@
 #define __NR_epoll_pwait	319
 #define __NR_utimensat		320
 #define __NR_signalfd		321
-#define __NR_timerfd		322
+/* #define __NR_timerfd		322 removed */
 #define __NR_eventfd		323
 #define __NR_fallocate		324
 
diff --git a/include/asm-sh/unistd_32.h b/include/asm-sh/unistd_32.h
index b182b1c..433fd1b 100644
--- a/include/asm-sh/unistd_32.h
+++ b/include/asm-sh/unistd_32.h
@@ -330,7 +330,7 @@
 #define __NR_epoll_pwait	319
 #define __NR_utimensat		320
 #define __NR_signalfd		321
-#define __NR_timerfd		322
+/* #define __NR_timerfd		322 removed */
 #define __NR_eventfd		323
 #define __NR_fallocate		324
 
diff --git a/include/asm-sh/unistd_64.h b/include/asm-sh/unistd_64.h
index 9445118..108d2ba 100644
--- a/include/asm-sh/unistd_64.h
+++ b/include/asm-sh/unistd_64.h
@@ -370,7 +370,7 @@
 #define __NR_epoll_pwait	347
 #define __NR_utimensat		348
 #define __NR_signalfd		349
-#define __NR_timerfd		350
+/* #define __NR_timerfd		350 removed */
 #define __NR_eventfd		351
 #define __NR_fallocate		352
 
-- 
cgit v1.1


From c773633916c66f8362ca01983d97bd33e35b743f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 5 Feb 2008 14:22:58 -0800
Subject: deprecate smbfs in favour of cifs

smbfs is a bit buggy and has no maintainer.  Change it to shout at the user on
the first five mount attempts - tell them to switch to CIFS.

Come December we'll mark it BROKEN and see what happens.

[olecom@flower.upol.cz: documentation update]
Cc: Urban Widmark <urban@teststation.com>
Acked-by: Steven French <sfrench@us.ibm.com>
Signed-off-by: Oleg Verych <olecom@flower.upol.cz>
Cc: Jeff Layton <jlayton@redhat.com>
Cc: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig       | 28 ++++++++++++++--------------
 fs/smbfs/inode.c |  7 +++++++
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 987b5d7..ea5b359 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1152,7 +1152,7 @@ config BEFS_DEBUG
 	depends on BEFS_FS
 	help
 	  If you say Y here, you can use the 'debug' mount option to enable
-	  debugging output from the driver. 
+	  debugging output from the driver.
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
@@ -1263,7 +1263,7 @@ config JFFS2_FS_XATTR
 	  Extended attributes are name:value pairs associated with inodes by
 	  the kernel or by users (see the attr(5) manual page, or visit
 	  <http://acl.bestbits.at/> for details).
-	  
+
 	  If unsure, say N.
 
 config JFFS2_FS_POSIX_ACL
@@ -1274,10 +1274,10 @@ config JFFS2_FS_POSIX_ACL
 	help
 	  Posix Access Control Lists (ACLs) support permissions for users and
 	  groups beyond the owner/group/world scheme.
-	  
+
 	  To learn more about Access Control Lists, visit the Posix ACLs for
 	  Linux website <http://acl.bestbits.at/>.
-	  
+
 	  If you don't know what Access Control Lists are, say N
 
 config JFFS2_FS_SECURITY
@@ -1289,7 +1289,7 @@ config JFFS2_FS_SECURITY
 	  implemented by security modules like SELinux.  This option
 	  enables an extended attribute handler for file security
 	  labels in the jffs2 filesystem.
-	  
+
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
@@ -1835,7 +1835,7 @@ config RPCSEC_GSS_SPKM3
 	  If unsure, say N.
 
 config SMB_FS
-	tristate "SMB file system support (to mount Windows shares etc.)"
+	tristate "SMB file system support (OBSOLETE, please use CIFS)"
 	depends on INET
 	select NLS
 	help
@@ -1858,8 +1858,8 @@ config SMB_FS
 	  General information about how to connect Linux, Windows machines and
 	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
 
-	  To compile the SMB support as a module, choose M here: the module will
-	  be called smbfs.  Most people say N, however.
+	  To compile the SMB support as a module, choose M here:
+	  the module will be called smbfs.  Most people say N, however.
 
 config SMB_NLS_DEFAULT
 	bool "Use a default NLS"
@@ -1891,7 +1891,7 @@ config SMB_NLS_REMOTE
 	  smbmount from samba 2.2.0 or later supports this.
 
 config CIFS
-	tristate "CIFS support (advanced network filesystem for Samba, Window and other CIFS compliant servers)"
+	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
 	help
@@ -1949,16 +1949,16 @@ config CIFS_WEAK_PW_HASH
 	  LANMAN based servers such as OS/2 and Windows 95, but such
 	  mounts may be less secure than mounts using NTLM or more recent
 	  security mechanisms if you are on a public network.  Unless you
-	  have a need to access old SMB servers (and are on a private 
+	  have a need to access old SMB servers (and are on a private
 	  network) you probably want to say N.  Even if this support
 	  is enabled in the kernel build, LANMAN authentication will not be
 	  used automatically. At runtime LANMAN mounts are disabled but
 	  can be set to required (or optional) either in
 	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
-	  option on the mount command. This support is disabled by 
+	  option on the mount command. This support is disabled by
 	  default in order to reduce the possibility of a downgrade
 	  attack.
- 
+
 	  If unsure, say N.
 
 config CIFS_XATTR
@@ -1999,7 +1999,7 @@ config CIFS_DEBUG2
 	   messages in some error paths, slowing performance. This
 	   option can be turned off unless you are debugging
 	   cifs problems.  If unsure, say N.
-	   
+
 config CIFS_EXPERIMENTAL
 	  bool "CIFS Experimental Features (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
@@ -2090,7 +2090,7 @@ config CODA_FS_OLD_API
 	  However this new API is not backward compatible with older
 	  clients. If you really need to run the old Coda userspace
 	  cache manager then say Y.
-	  
+
 	  For most cases you probably want to say N.
 
 config AFS_FS
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 9416ead..4e5c22c 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -500,6 +500,13 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	struct smb_fattr root;
 	int ver;
 	void *mem;
+	static int warn_count;
+
+	if (warn_count < 5) {
+		warn_count++;
+		printk(KERN_EMERG "smbfs is deprecated and will be removed"
+			"from the 2.6.27 kernel.  Please migrate to cifs\n");
+	}
 
 	if (!raw_data)
 		goto out_no_data;
-- 
cgit v1.1


From b2a53bc636b0e7e9ce4c899ad605432339ef5861 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:48 +0100
Subject: ide-generic: probing bugfix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Tuesday 05 February 2008, Linus Torvalds wrote:
>
> On Sat, 2 Feb 2008, Bartlomiej Zolnierkiewicz wrote:
> >
> > * next part of IDE probing code re-organization saga
> >   (that would be me)
>
> This seems to cause very irritating and bogus messages for me:
>
>       Probing IDE interface ide0...
>       Probing IDE interface ide1...
>       ide2: I/O resource 0x0-0x7 not free.
>       ide2: ports already in use, skipping probe
>       ide3: I/O resource 0x0-0x7 not free.
>       ide3: ports already in use, skipping probe
>       ide4: I/O resource 0x0-0x7 not free.
>       ide4: ports already in use, skipping probe
>       ide5: I/O resource 0x0-0x7 not free.
>       ide5: ports already in use, skipping probe
>       ide6: I/O resource 0x0-0x7 not free.
>       ide6: ports already in use, skipping probe
>       ide7: I/O resource 0x0-0x7 not free.
>       ide7: ports already in use, skipping probe
>       ide8: I/O resource 0x0-0x7 not free.
>       ide8: ports already in use, skipping probe
>       ide9: I/O resource 0x0-0x7 not free.
>       ide9: ports already in use, skipping probe
>
> and that's just totally bogus. It shouldn't even request that region,
> since it's not been allocated!

The commit 139ddfcab50e5eabcc88341c8743a990ac1be6a2 ("ide: move handling of
I/O resources out of ide_probe_port()") changed the ordering of hwif->noprobe
check vs ide_hwif_request_regions() call (so that we now reserve I/O regions
before checking for hwif->noprobe).  However ide-generic host driver depended
on hwif->noprobe to be set for skipping probing of empty ide_hwifs[] slots.

Fix it by passing only indexes of non-empty slots to ide_device_add_all()
from ide_generic_init().

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-generic.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
index be469db..709b9e4 100644
--- a/drivers/ide/ide-generic.c
+++ b/drivers/ide/ide-generic.c
@@ -20,8 +20,14 @@ static int __init ide_generic_init(void)
 	if (ide_hwifs[0].io_ports[IDE_DATA_OFFSET])
 		ide_get_lock(NULL, NULL); /* for atari only */
 
-	for (i = 0; i < MAX_HWIFS; i++)
-		idx[i] = ide_hwifs[i].present ? 0xff : i;
+	for (i = 0; i < MAX_HWIFS; i++) {
+		ide_hwif_t *hwif = &ide_hwifs[i];
+
+		if (hwif->io_ports[IDE_DATA_OFFSET] && !hwif->present)
+			idx[i] = i;
+		else
+			idx[i] = 0xff;
+	}
 
 	ide_device_add_all(idx, NULL);
 
-- 
cgit v1.1


From 7c7e92a9268965e08bba853ecdb94fa55e886741 Mon Sep 17 00:00:00 2001
From: Anton Salnikov <asalnikov@ru.mvista.com>
Date: Wed, 6 Feb 2008 02:57:48 +0100
Subject: Palmchip BK3710 IDE driver

This is Palmchip BK3710 IDE controller support.

The IDE controller logic supports PIO, MultiWord-DMA and Ultra-DMA modes.
Supports interface to Compact Flash (CF) configured in True-IDE mode.

Bart:
- remove dead code
- fix ide_hwif_setup_dma() build problem

Signed-off-by: Anton Salnikov <asalnikov@ru.mvista.com>
Reviewed-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Reviewed-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig           |   9 +
 drivers/ide/arm/Makefile      |   1 +
 drivers/ide/arm/palm_bk3710.c | 395 ++++++++++++++++++++++++++++++++++++++++++
 drivers/ide/ide-proc.c        |   1 +
 include/linux/ide.h           |   5 +-
 5 files changed, 409 insertions(+), 2 deletions(-)
 create mode 100644 drivers/ide/arm/palm_bk3710.c

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 45b26ed..ab8fb25 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -1009,6 +1009,15 @@ config BLK_DEV_Q40IDE
 	  normally be on; disable it only if you are running a custom hard
 	  drive subsystem through an expansion card.
 
+config BLK_DEV_PALMCHIP_BK3710
+	tristate "Palmchip bk3710 IDE controller support"
+	depends on ARCH_DAVINCI
+	select BLK_DEV_IDEDMA_PCI
+	help
+	  Say Y here if you want to support the onchip IDE controller on the
+	  TI DaVinci SoC
+
+
 config BLK_DEV_MPC8xx_IDE
 	tristate "MPC8xx IDE support"
 	depends on 8xx && (LWMON || IVMS8 || IVML24 || TQM8xxL) && IDE=y && BLK_DEV_IDE=y && !PPC_MERGE
diff --git a/drivers/ide/arm/Makefile b/drivers/ide/arm/Makefile
index 5f63ad2..936e7b0 100644
--- a/drivers/ide/arm/Makefile
+++ b/drivers/ide/arm/Makefile
@@ -2,6 +2,7 @@
 obj-$(CONFIG_BLK_DEV_IDE_ICSIDE)	+= icside.o
 obj-$(CONFIG_BLK_DEV_IDE_RAPIDE)	+= rapide.o
 obj-$(CONFIG_BLK_DEV_IDE_BAST)		+= bast-ide.o
+obj-$(CONFIG_BLK_DEV_PALMCHIP_BK3710)	+= palm_bk3710.o
 
 ifeq ($(CONFIG_IDE_ARM), m)
 	obj-m += ide_arm.o
diff --git a/drivers/ide/arm/palm_bk3710.c b/drivers/ide/arm/palm_bk3710.c
new file mode 100644
index 0000000..c306997
--- /dev/null
+++ b/drivers/ide/arm/palm_bk3710.c
@@ -0,0 +1,395 @@
+/*
+ * Palmchip bk3710 IDE controller
+ *
+ * Copyright (C) 2006 Texas Instruments.
+ * Copyright (C) 2007 MontaVista Software, Inc., <source@mvista.com>
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * ----------------------------------------------------------------------------
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/hdreg.h>
+#include <linux/ide.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+
+/* Offset of the primary interface registers */
+#define IDE_PALM_ATA_PRI_REG_OFFSET 0x1F0
+
+/* Primary Control Offset */
+#define IDE_PALM_ATA_PRI_CTL_OFFSET 0x3F6
+
+/*
+ * PalmChip 3710 IDE Controller UDMA timing structure Definition
+ */
+struct palm_bk3710_udmatiming {
+	unsigned int rptime;	/* Ready to pause time  */
+	unsigned int cycletime;	/* Cycle Time           */
+};
+
+#define BK3710_BMICP		0x00
+#define BK3710_BMISP		0x02
+#define BK3710_BMIDTP		0x04
+#define BK3710_BMICS		0x08
+#define BK3710_BMISS		0x0A
+#define BK3710_BMIDTS		0x0C
+#define BK3710_IDETIMP		0x40
+#define BK3710_IDETIMS		0x42
+#define BK3710_SIDETIM		0x44
+#define BK3710_SLEWCTL		0x45
+#define BK3710_IDESTATUS	0x47
+#define BK3710_UDMACTL		0x48
+#define BK3710_UDMATIM		0x4A
+#define BK3710_MISCCTL		0x50
+#define BK3710_REGSTB		0x54
+#define BK3710_REGRCVR		0x58
+#define BK3710_DATSTB		0x5C
+#define BK3710_DATRCVR		0x60
+#define BK3710_DMASTB		0x64
+#define BK3710_DMARCVR		0x68
+#define BK3710_UDMASTB		0x6C
+#define BK3710_UDMATRP		0x70
+#define BK3710_UDMAENV		0x74
+#define BK3710_IORDYTMP		0x78
+#define BK3710_IORDYTMS		0x7C
+
+#include "../ide-timing.h"
+
+static long ide_palm_clk;
+
+static const struct palm_bk3710_udmatiming palm_bk3710_udmatimings[6] = {
+	{160, 240},		/* UDMA Mode 0 */
+	{125, 160},		/* UDMA Mode 1 */
+	{100, 120},		/* UDMA Mode 2 */
+	{100, 90},		/* UDMA Mode 3 */
+	{85,  60},		/* UDMA Mode 4 */
+};
+
+static struct clk *ideclkp;
+
+static void palm_bk3710_setudmamode(void __iomem *base, unsigned int dev,
+				    unsigned int mode)
+{
+	u8 tenv, trp, t0;
+	u32 val32;
+	u16 val16;
+
+	/* DMA Data Setup */
+	t0 = (palm_bk3710_udmatimings[mode].cycletime + ide_palm_clk - 1)
+			/ ide_palm_clk - 1;
+	tenv = (20 + ide_palm_clk - 1) / ide_palm_clk - 1;
+	trp = (palm_bk3710_udmatimings[mode].rptime + ide_palm_clk - 1)
+			/ ide_palm_clk - 1;
+
+	/* udmatim Register */
+	val16 = readw(base + BK3710_UDMATIM) & (dev ? 0xFF0F : 0xFFF0);
+	val16 |= (mode << (dev ? 4 : 0));
+	writew(val16, base + BK3710_UDMATIM);
+
+	/* udmastb Ultra DMA Access Strobe Width */
+	val32 = readl(base + BK3710_UDMASTB) & (0xFF << (dev ? 0 : 8));
+	val32 |= (t0 << (dev ? 8 : 0));
+	writel(val32, base + BK3710_UDMASTB);
+
+	/* udmatrp Ultra DMA Ready to Pause Time */
+	val32 = readl(base + BK3710_UDMATRP) & (0xFF << (dev ? 0 : 8));
+	val32 |= (trp << (dev ? 8 : 0));
+	writel(val32, base + BK3710_UDMATRP);
+
+	/* udmaenv Ultra DMA envelop Time */
+	val32 = readl(base + BK3710_UDMAENV) & (0xFF << (dev ? 0 : 8));
+	val32 |= (tenv << (dev ? 8 : 0));
+	writel(val32, base + BK3710_UDMAENV);
+
+	/* Enable UDMA for Device */
+	val16 = readw(base + BK3710_UDMACTL) | (1 << dev);
+	writew(val16, base + BK3710_UDMACTL);
+}
+
+static void palm_bk3710_setdmamode(void __iomem *base, unsigned int dev,
+				   unsigned short min_cycle,
+				   unsigned int mode)
+{
+	u8 td, tkw, t0;
+	u32 val32;
+	u16 val16;
+	struct ide_timing *t;
+	int cycletime;
+
+	t = ide_timing_find_mode(mode);
+	cycletime = max_t(int, t->cycle, min_cycle);
+
+	/* DMA Data Setup */
+	t0 = (cycletime + ide_palm_clk - 1) / ide_palm_clk;
+	td = (t->active + ide_palm_clk - 1) / ide_palm_clk;
+	tkw = t0 - td - 1;
+	td -= 1;
+
+	val32 = readl(base + BK3710_DMASTB) & (0xFF << (dev ? 0 : 8));
+	val32 |= (td << (dev ? 8 : 0));
+	writel(val32, base + BK3710_DMASTB);
+
+	val32 = readl(base + BK3710_DMARCVR) & (0xFF << (dev ? 0 : 8));
+	val32 |= (tkw << (dev ? 8 : 0));
+	writel(val32, base + BK3710_DMARCVR);
+
+	/* Disable UDMA for Device */
+	val16 = readw(base + BK3710_UDMACTL) & ~(1 << dev);
+	writew(val16, base + BK3710_UDMACTL);
+}
+
+static void palm_bk3710_setpiomode(void __iomem *base, ide_drive_t *mate,
+				   unsigned int dev, unsigned int cycletime,
+				   unsigned int mode)
+{
+	u8 t2, t2i, t0;
+	u32 val32;
+	struct ide_timing *t;
+
+	/* PIO Data Setup */
+	t0 = (cycletime + ide_palm_clk - 1) / ide_palm_clk;
+	t2 = (ide_timing_find_mode(XFER_PIO_0 + mode)->active +
+	      ide_palm_clk - 1)	/ ide_palm_clk;
+
+	t2i = t0 - t2 - 1;
+	t2 -= 1;
+
+	val32 = readl(base + BK3710_DATSTB) & (0xFF << (dev ? 0 : 8));
+	val32 |= (t2 << (dev ? 8 : 0));
+	writel(val32, base + BK3710_DATSTB);
+
+	val32 = readl(base + BK3710_DATRCVR) & (0xFF << (dev ? 0 : 8));
+	val32 |= (t2i << (dev ? 8 : 0));
+	writel(val32, base + BK3710_DATRCVR);
+
+	if (mate && mate->present) {
+		u8 mode2 = ide_get_best_pio_mode(mate, 255, 4);
+
+		if (mode2 < mode)
+			mode = mode2;
+	}
+
+	/* TASKFILE Setup */
+	t = ide_timing_find_mode(XFER_PIO_0 + mode);
+	t0 = (t->cyc8b + ide_palm_clk - 1) / ide_palm_clk;
+	t2 = (t->act8b + ide_palm_clk - 1) / ide_palm_clk;
+
+	t2i = t0 - t2 - 1;
+	t2 -= 1;
+
+	val32 = readl(base + BK3710_REGSTB) & (0xFF << (dev ? 0 : 8));
+	val32 |= (t2 << (dev ? 8 : 0));
+	writel(val32, base + BK3710_REGSTB);
+
+	val32 = readl(base + BK3710_REGRCVR) & (0xFF << (dev ? 0 : 8));
+	val32 |= (t2i << (dev ? 8 : 0));
+	writel(val32, base + BK3710_REGRCVR);
+}
+
+static void palm_bk3710_set_dma_mode(ide_drive_t *drive, u8 xferspeed)
+{
+	int is_slave = drive->dn & 1;
+	void __iomem *base = (void *)drive->hwif->dma_base;
+
+	if (xferspeed >= XFER_UDMA_0) {
+		palm_bk3710_setudmamode(base, is_slave,
+					xferspeed - XFER_UDMA_0);
+	} else {
+		palm_bk3710_setdmamode(base, is_slave, drive->id->eide_dma_min,
+				       xferspeed);
+	}
+}
+
+static void palm_bk3710_set_pio_mode(ide_drive_t *drive, u8 pio)
+{
+	unsigned int cycle_time;
+	int is_slave = drive->dn & 1;
+	ide_drive_t *mate;
+	void __iomem *base = (void *)drive->hwif->dma_base;
+
+	/*
+	 * Obtain the drive PIO data for tuning the Palm Chip registers
+	 */
+	cycle_time = ide_pio_cycle_time(drive, pio);
+	mate = ide_get_paired_drive(drive);
+	palm_bk3710_setpiomode(base, mate, is_slave, cycle_time, pio);
+}
+
+static void __devinit palm_bk3710_chipinit(void __iomem *base)
+{
+	/*
+	 * enable the reset_en of ATA controller so that when ata signals
+	 * are brought out, by writing into device config. at that
+	 * time por_n signal should not be 'Z' and have a stable value.
+	 */
+	writel(0x0300, base + BK3710_MISCCTL);
+
+	/* wait for some time and deassert the reset of ATA Device. */
+	mdelay(100);
+
+	/* Deassert the Reset */
+	writel(0x0200, base + BK3710_MISCCTL);
+
+	/*
+	 * Program the IDETIMP Register Value based on the following assumptions
+	 *
+	 * (ATA_IDETIMP_IDEEN		, ENABLE ) |
+	 * (ATA_IDETIMP_SLVTIMEN	, DISABLE) |
+	 * (ATA_IDETIMP_RDYSMPL		, 70NS)    |
+	 * (ATA_IDETIMP_RDYRCVRY	, 50NS)    |
+	 * (ATA_IDETIMP_DMAFTIM1	, PIOCOMP) |
+	 * (ATA_IDETIMP_PREPOST1	, DISABLE) |
+	 * (ATA_IDETIMP_RDYSEN1		, DISABLE) |
+	 * (ATA_IDETIMP_PIOFTIM1	, DISABLE) |
+	 * (ATA_IDETIMP_DMAFTIM0	, PIOCOMP) |
+	 * (ATA_IDETIMP_PREPOST0	, DISABLE) |
+	 * (ATA_IDETIMP_RDYSEN0		, DISABLE) |
+	 * (ATA_IDETIMP_PIOFTIM0	, DISABLE)
+	 */
+	writew(0xB388, base + BK3710_IDETIMP);
+
+	/*
+	 * Configure  SIDETIM  Register
+	 * (ATA_SIDETIM_RDYSMPS1	,120NS ) |
+	 * (ATA_SIDETIM_RDYRCYS1	,120NS )
+	 */
+	writeb(0, base + BK3710_SIDETIM);
+
+	/*
+	 * UDMACTL Ultra-ATA DMA Control
+	 * (ATA_UDMACTL_UDMAP1	, 0 ) |
+	 * (ATA_UDMACTL_UDMAP0	, 0 )
+	 *
+	 */
+	writew(0, base + BK3710_UDMACTL);
+
+	/*
+	 * MISCCTL Miscellaneous Conrol Register
+	 * (ATA_MISCCTL_RSTMODEP	, 1) |
+	 * (ATA_MISCCTL_RESETP		, 0) |
+	 * (ATA_MISCCTL_TIMORIDE	, 1)
+	 */
+	writel(0x201, base + BK3710_MISCCTL);
+
+	/*
+	 * IORDYTMP IORDY Timer for Primary Register
+	 * (ATA_IORDYTMP_IORDYTMP     , 0xffff  )
+	 */
+	writel(0xFFFF, base + BK3710_IORDYTMP);
+
+	/*
+	 * Configure BMISP Register
+	 * (ATA_BMISP_DMAEN1	, DISABLE )	|
+	 * (ATA_BMISP_DMAEN0	, DISABLE )	|
+	 * (ATA_BMISP_IORDYINT	, CLEAR)	|
+	 * (ATA_BMISP_INTRSTAT	, CLEAR)	|
+	 * (ATA_BMISP_DMAERROR	, CLEAR)
+	 */
+	writew(0, base + BK3710_BMISP);
+
+	palm_bk3710_setpiomode(base, NULL, 0, 600, 0);
+	palm_bk3710_setpiomode(base, NULL, 1, 600, 0);
+}
+static int __devinit palm_bk3710_probe(struct platform_device *pdev)
+{
+	hw_regs_t ide_ctlr_info;
+	int index = 0;
+	int pribase;
+	struct clk *clkp;
+	struct resource *mem, *irq;
+	ide_hwif_t *hwif;
+	void __iomem *base;
+
+	clkp = clk_get(NULL, "IDECLK");
+	if (IS_ERR(clkp))
+		return -ENODEV;
+
+	ideclkp = clkp;
+	clk_enable(ideclkp);
+	ide_palm_clk = clk_get_rate(ideclkp)/100000;
+	ide_palm_clk = (10000/ide_palm_clk) + 1;
+	/* Register the IDE interface with Linux ATA Interface */
+	memset(&ide_ctlr_info, 0, sizeof(ide_ctlr_info));
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (mem == NULL) {
+		printk(KERN_ERR "failed to get memory region resource\n");
+		return -ENODEV;
+	}
+	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (irq == NULL) {
+		printk(KERN_ERR "failed to get IRQ resource\n");
+		return -ENODEV;
+	}
+
+	base = (void *)mem->start;
+
+	/* Configure the Palm Chip controller */
+	palm_bk3710_chipinit(base);
+
+	pribase = mem->start + IDE_PALM_ATA_PRI_REG_OFFSET;
+	for (index = 0; index < IDE_NR_PORTS - 2; index++)
+		ide_ctlr_info.io_ports[index] = pribase + index;
+	ide_ctlr_info.io_ports[IDE_CONTROL_OFFSET] = mem->start +
+			IDE_PALM_ATA_PRI_CTL_OFFSET;
+	ide_ctlr_info.irq = irq->start;
+	ide_ctlr_info.chipset = ide_palm3710;
+
+	if (ide_register_hw(&ide_ctlr_info, NULL, &hwif) < 0) {
+		printk(KERN_WARNING "Palm Chip BK3710 IDE Register Fail\n");
+		return -ENODEV;
+	}
+
+	hwif->set_pio_mode = &palm_bk3710_set_pio_mode;
+	hwif->set_dma_mode = &palm_bk3710_set_dma_mode;
+	hwif->mmio = 1;
+	default_hwif_mmiops(hwif);
+	hwif->cbl = ATA_CBL_PATA80;
+	hwif->ultra_mask = 0x1f;	/* Ultra DMA Mode 4 Max
+						(input clk 99MHz) */
+	hwif->mwdma_mask = 0x7;
+	hwif->drives[0].autotune = 1;
+	hwif->drives[1].autotune = 1;
+
+	ide_setup_dma(hwif, mem->start);
+
+	return 0;
+}
+
+static struct platform_driver platform_bk_driver = {
+	.driver = {
+		.name = "palm_bk3710",
+	},
+	.probe = palm_bk3710_probe,
+	.remove = NULL,
+};
+
+static int __init palm_bk3710_init(void)
+{
+	return platform_driver_register(&platform_bk_driver);
+}
+
+module_init(palm_bk3710_init);
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 975c0ff..bab88ca 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -65,6 +65,7 @@ static int proc_ide_read_imodel
 		case ide_4drives:	name = "4drives";	break;
 		case ide_pmac:		name = "mac-io";	break;
 		case ide_au1xxx:	name = "au1xxx";	break;
+		case ide_palm3710:      name = "palm3710";      break;
 		case ide_etrax100:	name = "etrax100";	break;
 		case ide_acorn:		name = "acorn";		break;
 		default:		name = "(unknown)";	break;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 367c170..ad75e70 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -173,7 +173,7 @@ enum {		ide_unknown,	ide_generic,	ide_pci,
 		ide_rz1000,	ide_trm290,
 		ide_cmd646,	ide_cy82c693,	ide_4drives,
 		ide_pmac,	ide_etrax100,	ide_acorn,
-		ide_au1xxx, ide_forced
+		ide_au1xxx,	ide_palm3710,	ide_forced
 };
 
 typedef u8 hwif_chipset_t;
@@ -1014,7 +1014,8 @@ extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *o
 void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, int, u8 *);
 void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *);
 
-#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
+/* FIXME: palm_bk3710 uses BLK_DEV_IDEDMA_PCI without BLK_DEV_IDEPCI! */
+#if defined(CONFIG_BLK_DEV_IDEPCI) && defined(CONFIG_BLK_DEV_IDEDMA_PCI)
 void ide_hwif_setup_dma(ide_hwif_t *, const struct ide_port_info *);
 #else
 static inline void ide_hwif_setup_dma(ide_hwif_t *hwif,
-- 
cgit v1.1


From 1dcfdf93f66375567ec563de74bbb8c295ac88df Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 6 Feb 2008 02:57:49 +0100
Subject: drivers/ide/ide-acpi.c: fix uninitialized var warning

drivers/ide/ide-acpi.c: In function 'ide_acpi_init':
drivers/ide/ide-acpi.c:175: warning: 'dev_handle' may be used uninitialized in this function

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-acpi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c
index 25aaeae..e07b189 100644
--- a/drivers/ide/ide-acpi.c
+++ b/drivers/ide/ide-acpi.c
@@ -171,7 +171,7 @@ err:
 static acpi_handle ide_acpi_hwif_get_handle(ide_hwif_t *hwif)
 {
 	struct device		*dev = hwif->gendev.parent;
-	acpi_handle		dev_handle;
+	acpi_handle		uninitialized_var(dev_handle);
 	acpi_integer		pcidevfn;
 	acpi_handle		chan_handle;
 	int			err;
-- 
cgit v1.1


From b004223db7249d42db893df916457acecc22759c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 6 Feb 2008 02:57:49 +0100
Subject: drivers/ide/legacy/hd.c: fix uninitialized var warning

drivers/ide/legacy/hd.c: In function 'hd_request':
drivers/ide/legacy/hd.c:424: warning: 'stat' may be used uninitialized in this function

gcc is being stupid.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/legacy/hd.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index 8e05d88..0b0d867 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -421,11 +421,14 @@ static void bad_rw_intr(void)
 
 static inline int wait_DRQ(void)
 {
-	int retries = 100000, stat;
+	int retries;
+	int stat;
 
-	while (--retries > 0)
-		if ((stat = inb_p(HD_STATUS)) & DRQ_STAT)
+	for (retries = 0; retries < 100000; retries++) {
+		stat = inb_p(HD_STATUS);
+		if (stat & DRQ_STAT)
 			return 0;
+	}
 	dump_status("wait_DRQ", stat);
 	return -1;
 }
-- 
cgit v1.1


From 594765a7316562cb7442f760a9a2f6e02804b610 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Wed, 6 Feb 2008 02:57:49 +0100
Subject: ide-pci-generic: kill the unused ifdef/endif/MODULE code

with module_param macro, the __setup code can be killed now:
	const __setup("all-generic-ide", ide_generic_all_on);

and the module name "generic.ko" is not descriptive to its functionality,
can be changed in Makefile, the "ide-pci-generic.ko" is better.

the ide-pci-generic.all-generic-ide parameter also documented
in Documentation/kernel-parameters.txt

Signed-off-by: Denis Cheng <crquan@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 Documentation/kernel-parameters.txt |  3 +++
 drivers/ide/pci/Makefile            |  3 ++-
 drivers/ide/pci/generic.c           | 13 -------------
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9ad4e6f..8fd5aa4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -780,6 +780,9 @@ and is between 256 and 4096 characters. It is defined in the file
 			loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same
 			as idle=poll.
 
+	ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
+			Claim all unknown PCI IDE storage controllers.
+
 	ignore_loglevel	[KNL]
 			Ignore loglevel setting - this will print /all/
 			kernel messages to the console. Useful for debugging.
diff --git a/drivers/ide/pci/Makefile b/drivers/ide/pci/Makefile
index 9480325..02e6ee7 100644
--- a/drivers/ide/pci/Makefile
+++ b/drivers/ide/pci/Makefile
@@ -34,7 +34,8 @@ obj-$(CONFIG_BLK_DEV_TRM290)		+= trm290.o
 obj-$(CONFIG_BLK_DEV_VIA82CXXX)		+= via82cxxx.o
 
 # Must appear at the end of the block
-obj-$(CONFIG_BLK_DEV_GENERIC)          += generic.o
+obj-$(CONFIG_BLK_DEV_GENERIC)		+= ide-pci-generic.o
+ide-pci-generic-y			+= generic.o
 
 ifeq ($(CONFIG_BLK_DEV_CMD640), m)
 	obj-m += cmd640.o
diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index 9262a91..7fd83a9 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -29,19 +29,6 @@
 
 static int ide_generic_all;		/* Set to claim all devices */
 
-/*
- * the module_param_named() was added for the modular case
- * the __setup() is left as compatibility for existing setups
- */
-#ifndef MODULE
-static int __init ide_generic_all_on(char *unused)
-{
-	ide_generic_all = 1;
-	printk(KERN_INFO "IDE generic will claim all unknown PCI IDE storage controllers.\n");
-	return 1;
-}
-const __setup("all-generic-ide", ide_generic_all_on);
-#endif
 module_param_named(all_generic_ide, ide_generic_all, bool, 0444);
 MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers.");
 
-- 
cgit v1.1


From 34394e45c3387bd66619d9a51b4be507e4222b02 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:50 +0100
Subject: ppc: fix #ifdef-s in mediabay driver (take 2)

* Replace incorrect CONFIG_BLK_DEV_IDE #ifdef in
  check_media_bay() by CONFIG_MAC_FLOPPY one.

* Replace incorrect CONFIG_BLK_DEV_IDE #ifdef-s by
  CONFIG_BLK_DEV_IDE_PMAC ones.

* check_media_bay() is used only by drivers/block/swim3.c
  so make this function available only if CONFIG_MAC_FLOPPY
  is defined.

* check_media_bay_by_base() and media_bay_set_ide_infos()
  are used only by drivers/ide/ppc/pmac.c so so make these
  functions available only if CONFIG_MAC_FLOPPY is defined.

v2:
* Remove ifdefs from function prototypes. (Andrew Morton)

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/macintosh/mediabay.c   | 46 +++++++++++++++++++-----------------------
 include/asm-powerpc/mediabay.h |  8 ++++----
 2 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c
index de9ebbf..9367882 100644
--- a/drivers/macintosh/mediabay.c
+++ b/drivers/macintosh/mediabay.c
@@ -78,12 +78,14 @@ struct media_bay_info {
 	int				cached_gpio;
 	int				sleeping;
 	struct semaphore		lock;
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 	void __iomem			*cd_base;
-	int 				cd_index;
 	int				cd_irq;
 	int				cd_retry;
 #endif
+#if defined(CONFIG_BLK_DEV_IDE_PMAC) || defined(CONFIG_MAC_FLOPPY)
+	int 				cd_index;
+#endif
 };
 
 #define MAX_BAYS	2
@@ -91,7 +93,7 @@ struct media_bay_info {
 static struct media_bay_info media_bays[MAX_BAYS];
 int media_bay_count = 0;
 
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 /* check the busy bit in the media-bay ide interface
    (assumes the media-bay contains an ide device) */
 #define MB_IDE_READY(i)	((readb(media_bays[i].cd_base + 0x70) & 0x80) == 0)
@@ -401,7 +403,7 @@ static void poll_media_bay(struct media_bay_info* bay)
 				set_mb_power(bay, id != MB_NO);
 				bay->content_id = id;
 				if (id == MB_NO) {
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 					bay->cd_retry = 0;
 #endif
 					printk(KERN_INFO "media bay %d is empty\n", bay->index);
@@ -414,9 +416,9 @@ static void poll_media_bay(struct media_bay_info* bay)
 	}
 }
 
+#ifdef CONFIG_MAC_FLOPPY
 int check_media_bay(struct device_node *which_bay, int what)
 {
-#ifdef CONFIG_BLK_DEV_IDE
 	int	i;
 
 	for (i=0; i<media_bay_count; i++)
@@ -426,14 +428,14 @@ int check_media_bay(struct device_node *which_bay, int what)
 			media_bays[i].cd_index = -1;
 			return -EINVAL;
 		}
-#endif /* CONFIG_BLK_DEV_IDE */
 	return -ENODEV;
 }
 EXPORT_SYMBOL(check_media_bay);
+#endif /* CONFIG_MAC_FLOPPY */
 
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 int check_media_bay_by_base(unsigned long base, int what)
 {
-#ifdef CONFIG_BLK_DEV_IDE
 	int	i;
 
 	for (i=0; i<media_bay_count; i++)
@@ -443,15 +445,13 @@ int check_media_bay_by_base(unsigned long base, int what)
 			media_bays[i].cd_index = -1;
 			return -EINVAL;
 		} 
-#endif
-	
+
 	return -ENODEV;
 }
 
 int media_bay_set_ide_infos(struct device_node* which_bay, unsigned long base,
-	int irq, int index)
+			    int irq, int index)
 {
-#ifdef CONFIG_BLK_DEV_IDE
 	int	i;
 
 	for (i=0; i<media_bay_count; i++) {
@@ -483,10 +483,10 @@ int media_bay_set_ide_infos(struct device_node* which_bay, unsigned long base,
 			return -ENODEV;
 		}
 	}
-#endif /* CONFIG_BLK_DEV_IDE */
-	
+
 	return -ENODEV;
 }
+#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 
 static void media_bay_step(int i)
 {
@@ -521,14 +521,13 @@ static void media_bay_step(int i)
 	    	bay->state = mb_resetting;
 		MBDBG("mediabay%d: waiting reset (kind:%d)\n", i, bay->content_id);
 	    	break;
-	    
 	case mb_resetting:
 		if (bay->content_id != MB_CD) {
 			MBDBG("mediabay%d: bay is up (kind:%d)\n", i, bay->content_id);
 			bay->state = mb_up;
 			break;
 	    	}
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 		MBDBG("mediabay%d: waiting IDE reset (kind:%d)\n", i, bay->content_id);
 		bay->ops->un_reset_ide(bay);
 	    	bay->timer = msecs_to_jiffies(MB_IDE_WAIT);
@@ -536,16 +535,14 @@ static void media_bay_step(int i)
 #else
 		printk(KERN_DEBUG "media-bay %d is ide (not compiled in kernel)\n", i);
 		set_mb_power(bay, 0);
-#endif /* CONFIG_BLK_DEV_IDE */
+#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 	    	break;
-	    
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 	case mb_ide_resetting:
 	    	bay->timer = msecs_to_jiffies(MB_IDE_TIMEOUT);
 	    	bay->state = mb_ide_waiting;
 		MBDBG("mediabay%d: waiting IDE ready (kind:%d)\n", i, bay->content_id);
 	    	break;
-	    
 	case mb_ide_waiting:
 		if (bay->cd_base == NULL) {
 			bay->timer = 0;
@@ -587,11 +584,10 @@ static void media_bay_step(int i)
 			bay->timer = 0;
 	    	}
 		break;
-#endif /* CONFIG_BLK_DEV_IDE */
-
+#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 	case mb_powering_down:
 	    	bay->state = mb_empty;
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
     	        if (bay->cd_index >= 0) {
 			printk(KERN_DEBUG "Unregistering mb %d ide, index:%d\n", i,
 			       bay->cd_index);
@@ -607,7 +603,7 @@ static void media_bay_step(int i)
 				bay->content_id = MB_NO;
 			}
 	    	}
-#endif /* CONFIG_BLK_DEV_IDE */    
+#endif /* CONFIG_BLK_DEV_IDE_PMAC */
 		MBDBG("mediabay%d: end of power down\n", i);
 	    	break;
 	}
@@ -739,7 +735,7 @@ static int media_bay_resume(struct macio_dev *mdev)
 	       	bay->last_value = bay->content_id;
 	       	bay->value_count = msecs_to_jiffies(MB_STABLE_DELAY);
 	       	bay->timer = msecs_to_jiffies(MB_POWER_DELAY);
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 	       	bay->cd_retry = 0;
 #endif
 	       	do {
@@ -829,7 +825,7 @@ static int __init media_bay_init(void)
 	for (i=0; i<MAX_BAYS; i++) {
 		memset((char *)&media_bays[i], 0, sizeof(struct media_bay_info));
 		media_bays[i].content_id	= -1;
-#ifdef CONFIG_BLK_DEV_IDE
+#ifdef CONFIG_BLK_DEV_IDE_PMAC
 		media_bays[i].cd_index		= -1;
 #endif
 	}
diff --git a/include/asm-powerpc/mediabay.h b/include/asm-powerpc/mediabay.h
index 9daa325..de83fe1 100644
--- a/include/asm-powerpc/mediabay.h
+++ b/include/asm-powerpc/mediabay.h
@@ -18,14 +18,14 @@
 #define MB_NO		7	/* media bay contains nothing */
 
 int check_media_bay(struct device_node *which_bay, int what);
-int check_media_bay_by_base(unsigned long base, int what);
 
 /* Number of bays in the machine or 0 */
 extern int media_bay_count;
 
-/* called by pmac-ide.c to register IDE controller for media bay */
-extern int media_bay_set_ide_infos(struct device_node* which_bay,
-			unsigned long base, int irq, int index);
+int check_media_bay_by_base(unsigned long base, int what);
+/* called by IDE PMAC host driver to register IDE controller for media bay */
+int media_bay_set_ide_infos(struct device_node *which_bay, unsigned long base,
+			    int irq, int index);
 
 #endif /* __KERNEL__ */
 #endif /* _PPC_MEDIABAY_H */
-- 
cgit v1.1


From afdd360c95632b0c882790a7b25ff505664adcd0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:50 +0100
Subject: ide: remove write-only ->sata_misc[] from ide_hwif_t

* Remove write-only ->sata_misc[] from ide_hwif_t.

* Remove no longer used SATA_{MISC,PHY,IEN}_OFFSET defines.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/pci/siimage.c | 3 ---
 include/linux/ide.h       | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index ef5b39f..cc4be96 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -704,9 +704,6 @@ static void __devinit init_mmio_iops_siimage(ide_hwif_t *hwif)
 		hwif->sata_scr[SATA_STATUS_OFFSET]	= base + 0x104;
 		hwif->sata_scr[SATA_ERROR_OFFSET]	= base + 0x108;
 		hwif->sata_scr[SATA_CONTROL_OFFSET]	= base + 0x100;
-		hwif->sata_misc[SATA_MISC_OFFSET]	= base + 0x140;
-		hwif->sata_misc[SATA_PHY_OFFSET]	= base + 0x144;
-		hwif->sata_misc[SATA_IEN_OFFSET]	= base + 0x148;
 	}
 
 	memcpy(hwif->io_ports, hw.io_ports, sizeof(hwif->io_ports));
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ad75e70..e02fd11 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -115,10 +115,6 @@ typedef unsigned char	byte;	/* used everywhere */
 #define SATA_ERROR_OFFSET	(1)
 #define SATA_CONTROL_OFFSET	(2)
 
-#define SATA_MISC_OFFSET	(0)
-#define SATA_PHY_OFFSET		(1)
-#define SATA_IEN_OFFSET		(2)
-
 /*
  * Our Physical Region Descriptor (PRD) table should be large enough
  * to handle the biggest I/O request we are likely to see.  Since requests
@@ -473,7 +469,6 @@ typedef struct hwif_s {
 		/* task file registers for pata and sata */
 	unsigned long	io_ports[IDE_NR_PORTS];
 	unsigned long	sata_scr[SATA_NR_PORTS];
-	unsigned long	sata_misc[SATA_NR_PORTS];
 
 	ide_drive_t	drives[MAX_DRIVES];	/* drive info */
 
-- 
cgit v1.1


From f2694b7e3bad75436b47b6840de352f7b7f53feb Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:50 +0100
Subject: ide: remove redundant BUG_ON() from [atapi_]reset_pollfunc()

Same BUG_ON() is present inside ide_set_handler().

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-iops.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index a95178f..7647ac4 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -860,7 +860,6 @@ static ide_startstop_t atapi_reset_pollfunc (ide_drive_t *drive)
 		printk("%s: ATAPI reset complete\n", drive->name);
 	} else {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
-			BUG_ON(HWGROUP(drive)->handler != NULL);
 			ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
 			return ide_started;
@@ -900,7 +899,6 @@ static ide_startstop_t reset_pollfunc (ide_drive_t *drive)
 
 	if (!OK_STAT(tmp = hwif->INB(IDE_STATUS_REG), 0, BUSY_STAT)) {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
-			BUG_ON(HWGROUP(drive)->handler != NULL);
 			ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
 			return ide_started;
-- 
cgit v1.1


From 29dd59755a849cc6475faa6a75f3b804e23a6fc2 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:50 +0100
Subject: ide: remove ide_setup_ports()

ide-cris.c:
* Add cris_setup_ports() helper and use it instead of ide_setup_ports()
  (fixes random value being set in ->io_ports[IDE_IRQ_OFFSET]).

buddha.c:
* Add buddha_setup_ports() helper and use it instead of ide_setup_ports().

falconide.c:
* Add falconide_setup_ports() helper and use it instead of ide_setup_ports(),
  also fix return value of falconide_init() while at it.

gayle.c:
* Add gayle_setup_ports() helper and use it instead of ide_setup_ports().

macide.c:
* Add macide_setup_ports() helper and use it instead of ide_setup_ports()
  (fixes incorrect value being set in ->io_ports[IDE_IRQ_OFFSET]).

q40ide.c:
* Fix q40_ide_setup_ports() comments.

ide.c:
* Remove no longer needed ide_setup_ports().

Cc: Mikael Starvik <starvik@axis.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/cris/ide-cris.c    | 33 ++++++++++++-------
 drivers/ide/ide.c              | 54 -------------------------------
 drivers/ide/legacy/buddha.c    | 72 ++++++++++++++++++++++--------------------
 drivers/ide/legacy/falconide.c | 42 ++++++++++++------------
 drivers/ide/legacy/gayle.c     | 39 ++++++++++++-----------
 drivers/ide/legacy/macide.c    | 57 ++++++++++++++++++---------------
 drivers/ide/legacy/q40ide.c    |  9 +-----
 include/linux/ide.h            | 11 -------
 8 files changed, 133 insertions(+), 184 deletions(-)

diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index 00587a8..e79bf8f 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -753,6 +753,25 @@ static void cris_set_dma_mode(ide_drive_t *drive, const u8 speed)
 		cris_ide_set_speed(TYPE_DMA, 0, strobe, hold);
 }
 
+static void __init cris_setup_ports(hw_regs_t *hw, unsigned long base)
+{
+	int i;
+
+	memset(hw, 0, sizeof(*hw));
+
+	for (i = 0; i <= 7; i++)
+		hw->io_ports[i] = base + cris_ide_reg_addr(i, 0, 1);
+
+	/*
+	 * the IDE control register is at ATA address 6,
+	 * with CS1 active instead of CS0
+	 */
+	hw->io_ports[IDE_CONTROL_OFFSET] = base + cris_ide_reg_addr(6, 1, 0);
+
+	hw->irq = ide_default_irq(0);
+	hw->ack_intr = cris_ide_ack_intr;
+}
+
 static const struct ide_port_info cris_port_info __initdata = {
 	.chipset		= ide_etrax100,
 	.host_flags		= IDE_HFLAG_NO_ATAPI_DMA |
@@ -765,24 +784,16 @@ static const struct ide_port_info cris_port_info __initdata = {
 static int __init init_e100_ide(void)
 {
 	hw_regs_t hw;
-	int ide_offsets[IDE_NR_PORTS], h, i;
+	int h;
 	u8 idx[4] = { 0xff, 0xff, 0xff, 0xff };
 
 	printk("ide: ETRAX FS built-in ATA DMA controller\n");
 
-	for (i = IDE_DATA_OFFSET; i <= IDE_STATUS_OFFSET; i++)
-		ide_offsets[i] = cris_ide_reg_addr(i, 0, 1);
-
-	/* the IDE control register is at ATA address 6, with CS1 active instead of CS0 */
-	ide_offsets[IDE_CONTROL_OFFSET] = cris_ide_reg_addr(6, 1, 0);
-
 	for (h = 0; h < 4; h++) {
 		ide_hwif_t *hwif = NULL;
 
-		ide_setup_ports(&hw, cris_ide_base_address(h),
-		                ide_offsets,
-		                0, 0, cris_ide_ack_intr,
-		                ide_default_irq(0));
+		cris_setup_ports(&hw, cris_ide_base_address(h));
+
 		hwif = ide_find_port(hw.io_ports[IDE_DATA_OFFSET]);
 		if (hwif == NULL)
 			continue;
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index ac61360..ad0e995 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -618,60 +618,6 @@ abort:
 
 EXPORT_SYMBOL(ide_unregister);
 
-
-/**
- *	ide_setup_ports 	-	set up IDE interface ports
- *	@hw: register descriptions
- *	@base: base register
- *	@offsets: table of register offsets
- *	@ctrl: control register
- *	@ack_irq: IRQ ack
- *	@irq: interrupt lie
- *
- *	Setup hw_regs_t structure described by parameters.  You
- *	may set up the hw structure yourself OR use this routine to
- *	do it for you. This is basically a helper
- *
- */
- 
-void ide_setup_ports (	hw_regs_t *hw,
-			unsigned long base, int *offsets,
-			unsigned long ctrl, unsigned long intr,
-			ide_ack_intr_t *ack_intr,
-/*
- *			ide_io_ops_t *iops,
- */
-			int irq)
-{
-	int i;
-
-	memset(hw, 0, sizeof(hw_regs_t));
-	for (i = 0; i < IDE_NR_PORTS; i++) {
-		if (offsets[i] == -1) {
-			switch(i) {
-				case IDE_CONTROL_OFFSET:
-					hw->io_ports[i] = ctrl;
-					break;
-#if defined(CONFIG_AMIGA) || defined(CONFIG_MAC)
-				case IDE_IRQ_OFFSET:
-					hw->io_ports[i] = intr;
-					break;
-#endif /* (CONFIG_AMIGA) || (CONFIG_MAC) */
-				default:
-					hw->io_ports[i] = 0;
-					break;
-			}
-		} else {
-			hw->io_ports[i] = base + offsets[i];
-		}
-	}
-	hw->irq = irq;
-	hw->ack_intr = ack_intr;
-/*
- *	hw->iops = iops;
- */
-}
-
 void ide_init_port_hw(ide_hwif_t *hwif, hw_regs_t *hw)
 {
 	memcpy(hwif->io_ports, hw->io_ports, sizeof(hwif->io_ports));
diff --git a/drivers/ide/legacy/buddha.c b/drivers/ide/legacy/buddha.c
index 8bdb79d..50ffa87 100644
--- a/drivers/ide/legacy/buddha.c
+++ b/drivers/ide/legacy/buddha.c
@@ -56,31 +56,11 @@ static u_int xsurf_bases[XSURF_NUM_HWIFS] __initdata = {
      XSURF_BASE1, XSURF_BASE2
 };
 
-
     /*
      *  Offsets from one of the above bases
      */
 
-#define BUDDHA_DATA	0x00
-#define BUDDHA_ERROR	0x06		/* see err-bits */
-#define BUDDHA_NSECTOR	0x0a		/* nr of sectors to read/write */
-#define BUDDHA_SECTOR	0x0e		/* starting sector */
-#define BUDDHA_LCYL	0x12		/* starting cylinder */
-#define BUDDHA_HCYL	0x16		/* high byte of starting cyl */
-#define BUDDHA_SELECT	0x1a		/* 101dhhhh , d=drive, hhhh=head */
-#define BUDDHA_STATUS	0x1e		/* see status-bits */
 #define BUDDHA_CONTROL	0x11a
-#define XSURF_CONTROL   -1              /* X-Surf has no CS1* (Control/AltStat) */
-
-static int buddha_offsets[IDE_NR_PORTS] __initdata = {
-    BUDDHA_DATA, BUDDHA_ERROR, BUDDHA_NSECTOR, BUDDHA_SECTOR, BUDDHA_LCYL,
-    BUDDHA_HCYL, BUDDHA_SELECT, BUDDHA_STATUS, BUDDHA_CONTROL, -1
-};
-
-static int xsurf_offsets[IDE_NR_PORTS] __initdata = {
-    BUDDHA_DATA, BUDDHA_ERROR, BUDDHA_NSECTOR, BUDDHA_SECTOR, BUDDHA_LCYL,
-    BUDDHA_HCYL, BUDDHA_SELECT, BUDDHA_STATUS, XSURF_CONTROL, -1
-};
 
     /*
      *  Other registers
@@ -140,6 +120,26 @@ static int xsurf_ack_intr(ide_hwif_t *hwif)
     return 1;
 }
 
+static void __init buddha_setup_ports(hw_regs_t *hw, unsigned long base,
+				      unsigned long ctl, unsigned long irq_port,
+				      ide_ack_intr_t *ack_intr)
+{
+	int i;
+
+	memset(hw, 0, sizeof(*hw));
+
+	hw->io_ports[IDE_DATA_OFFSET] = base;
+
+	for (i = 1; i < 8; i++)
+		hw->io_ports[i] = base + 2 + i * 4;
+
+	hw->io_ports[IDE_CONTROL_OFFSET] = ctl;
+	hw->io_ports[IDE_IRQ_OFFSET] = irq_port;
+
+	hw->irq = IRQ_AMIGA_PORTS;
+	hw->ack_intr = ack_intr;
+}
+
     /*
      *  Probe for a Buddha or Catweasel IDE interface
      */
@@ -202,22 +202,24 @@ fail_base2:
 		printk(KERN_INFO "ide: %s IDE controller\n",
 				 buddha_board_name[type]);
 
-		for(i=0;i<buddha_num_hwifs;i++) {
-			if(type != BOARD_XSURF) {
-				ide_setup_ports(&hw, (buddha_board+buddha_bases[i]),
-						buddha_offsets, 0,
-						(buddha_board+buddha_irqports[i]),
-						buddha_ack_intr,
-//						budda_iops,
-						IRQ_AMIGA_PORTS);
+		for (i = 0; i < buddha_num_hwifs; i++) {
+			unsigned long base, ctl, irq_port;
+			ide_ack_intr_t *ack_intr;
+
+			if (type != BOARD_XSURF) {
+				base = buddha_board + buddha_bases[i];
+				ctl = base + BUDDHA_CONTROL;
+				irq_port = buddha_board + buddha_irqports[i];
+				ack_intr = buddha_ack_intr;
 			} else {
-				ide_setup_ports(&hw, (buddha_board+xsurf_bases[i]),
-						xsurf_offsets, 0,
-						(buddha_board+xsurf_irqports[i]),
-						xsurf_ack_intr,
-//						xsurf_iops,
-						IRQ_AMIGA_PORTS);
-			}	
+				base = buddha_board + xsurf_bases[i];
+				/* X-Surf has no CS1* (Control/AltStat) */
+				ctl = 0;
+				irq_port = buddha_board + xsurf_irqports[i];
+				ack_intr = xsurf_ack_intr;
+			}
+
+			buddha_setup_ports(&hw, base, ctl, irq_port, ack_intr);
 
 			hwif = ide_find_port(hw.io_ports[IDE_DATA_OFFSET]);
 			if (hwif) {
diff --git a/drivers/ide/legacy/falconide.c b/drivers/ide/legacy/falconide.c
index 85b69a8..f044048 100644
--- a/drivers/ide/legacy/falconide.c
+++ b/drivers/ide/legacy/falconide.c
@@ -33,22 +33,8 @@
      *  Offsets from the above base
      */
 
-#define ATA_HD_DATA	0x00
-#define ATA_HD_ERROR	0x05		/* see err-bits */
-#define ATA_HD_NSECTOR	0x09		/* nr of sectors to read/write */
-#define ATA_HD_SECTOR	0x0d		/* starting sector */
-#define ATA_HD_LCYL	0x11		/* starting cylinder */
-#define ATA_HD_HCYL	0x15		/* high byte of starting cyl */
-#define ATA_HD_SELECT	0x19		/* 101dhhhh , d=drive, hhhh=head */
-#define ATA_HD_STATUS	0x1d		/* see status-bits */
 #define ATA_HD_CONTROL	0x39
 
-static int falconide_offsets[IDE_NR_PORTS] __initdata = {
-    ATA_HD_DATA, ATA_HD_ERROR, ATA_HD_NSECTOR, ATA_HD_SECTOR, ATA_HD_LCYL,
-    ATA_HD_HCYL, ATA_HD_SELECT, ATA_HD_STATUS, ATA_HD_CONTROL, -1
-};
-
-
     /*
      *  falconide_intr_lock is used to obtain access to the IDE interrupt,
      *  which is shared between several drivers.
@@ -57,6 +43,22 @@ static int falconide_offsets[IDE_NR_PORTS] __initdata = {
 int falconide_intr_lock;
 EXPORT_SYMBOL(falconide_intr_lock);
 
+static void __init falconide_setup_ports(hw_regs_t *hw)
+{
+	int i;
+
+	memset(hw, 0, sizeof(*hw));
+
+	hw->io_ports[IDE_DATA_OFFSET] = ATA_HD_BASE;
+
+	for (i = 1; i < 8; i++)
+		hw->io_ports[i] = ATA_HD_BASE + 1 + i * 4;
+
+	hw->io_ports[IDE_CONTROL_OFFSET] = ATA_HD_CONTROL;
+
+	hw->irq = IRQ_MFP_IDE;
+	hw->ack_intr = NULL;
+}
 
     /*
      *  Probe for a Falcon IDE interface
@@ -64,16 +66,15 @@ EXPORT_SYMBOL(falconide_intr_lock);
 
 static int __init falconide_init(void)
 {
-    if (MACH_IS_ATARI && ATARIHW_PRESENT(IDE)) {
 	hw_regs_t hw;
 	ide_hwif_t *hwif;
 
+	if (!MACH_IS_ATARI || !ATARIHW_PRESENT(IDE))
+		return 0;
+
 	printk(KERN_INFO "ide: Falcon IDE controller\n");
 
-	ide_setup_ports(&hw, ATA_HD_BASE, falconide_offsets,
-			0, 0, NULL,
-//			falconide_iops,
-			IRQ_MFP_IDE);
+	falconide_setup_ports(&hw);
 
 	hwif = ide_find_port(hw.io_ports[IDE_DATA_OFFSET]);
 	if (hwif) {
@@ -85,9 +86,8 @@ static int __init falconide_init(void)
 
 		ide_device_add(idx, NULL);
 	}
-    }
 
-    return 0;
+	return 0;
 }
 
 module_init(falconide_init);
diff --git a/drivers/ide/legacy/gayle.c b/drivers/ide/legacy/gayle.c
index fc29ce7..9d3851d 100644
--- a/drivers/ide/legacy/gayle.c
+++ b/drivers/ide/legacy/gayle.c
@@ -34,22 +34,8 @@
      *  Offsets from one of the above bases
      */
 
-#define GAYLE_DATA	0x00
-#define GAYLE_ERROR	0x06		/* see err-bits */
-#define GAYLE_NSECTOR	0x0a		/* nr of sectors to read/write */
-#define GAYLE_SECTOR	0x0e		/* starting sector */
-#define GAYLE_LCYL	0x12		/* starting cylinder */
-#define GAYLE_HCYL	0x16		/* high byte of starting cyl */
-#define GAYLE_SELECT	0x1a		/* 101dhhhh , d=drive, hhhh=head */
-#define GAYLE_STATUS	0x1e		/* see status-bits */
 #define GAYLE_CONTROL	0x101a
 
-static int gayle_offsets[IDE_NR_PORTS] __initdata = {
-    GAYLE_DATA, GAYLE_ERROR, GAYLE_NSECTOR, GAYLE_SECTOR, GAYLE_LCYL,
-    GAYLE_HCYL, GAYLE_SELECT, GAYLE_STATUS, -1, -1
-};
-
-
     /*
      *  These are at different offsets from the base
      */
@@ -106,6 +92,26 @@ static int gayle_ack_intr_a1200(ide_hwif_t *hwif)
     return 1;
 }
 
+static void __init gayle_setup_ports(hw_regs_t *hw, unsigned long base,
+				     unsigned long ctl, unsigned long irq_port,
+				     ide_ack_intr_t *ack_intr);
+{
+	int i;
+
+	memset(hw, 0, sizeof(*hw));
+
+	hw->io_ports[IDE_DATA_OFFSET] = base;
+
+	for (i = 1; i < 8; i++)
+		hw->io_ports[i] = base + 2 + i * 4;
+
+	hw->io_ports[IDE_CONTROL_OFFSET] = ctl;
+	hw->io_ports[IDE_IRQ_OFFSET] = irq_port;
+
+	hw->irq = IRQ_AMIGA_PORTS;
+	hw->ack_intr = ack_intr;
+}
+
     /*
      *  Probe for a Gayle IDE interface (and optionally for an IDE doubler)
      */
@@ -167,10 +173,7 @@ found:
 	base = (unsigned long)ZTWO_VADDR(phys_base);
 	ctrlport = GAYLE_HAS_CONTROL_REG ? (base + GAYLE_CONTROL) : 0;
 
-	ide_setup_ports(&hw, base, gayle_offsets,
-			ctrlport, irqport, ack_intr,
-//			&gayle_iops,
-			IRQ_AMIGA_PORTS);
+	gayle_setup_ports(&hw, base, ctrlport, irqport, ack_intr);
 
 	hwif = ide_find_port(base);
 	if (hwif) {
diff --git a/drivers/ide/legacy/macide.c b/drivers/ide/legacy/macide.c
index 06df8df..a61e607 100644
--- a/drivers/ide/legacy/macide.c
+++ b/drivers/ide/legacy/macide.c
@@ -31,14 +31,6 @@
  * These match MkLinux so they should be correct.
  */
 
-#define IDE_DATA	0x00
-#define IDE_ERROR	0x04	/* see err-bits */
-#define IDE_NSECTOR	0x08	/* nr of sectors to read/write */
-#define IDE_SECTOR	0x0c	/* starting sector */
-#define IDE_LCYL	0x10	/* starting cylinder */
-#define IDE_HCYL	0x14	/* high byte of starting cyl */
-#define IDE_SELECT	0x18	/* 101dhhhh , d=drive, hhhh=head */
-#define IDE_STATUS	0x1c	/* see status-bits */
 #define IDE_CONTROL	0x38	/* control/altstatus */
 
 /*
@@ -63,11 +55,6 @@
 
 volatile unsigned char *ide_ifr = (unsigned char *) (IDE_BASE + IDE_IFR);
 
-static int macide_offsets[IDE_NR_PORTS] = {
-    IDE_DATA, IDE_ERROR,  IDE_NSECTOR, IDE_SECTOR, IDE_LCYL,
-    IDE_HCYL, IDE_SELECT, IDE_STATUS,  IDE_CONTROL
-};
-
 int macide_ack_intr(ide_hwif_t* hwif)
 {
 	if (*ide_ifr & 0x20) {
@@ -77,6 +64,22 @@ int macide_ack_intr(ide_hwif_t* hwif)
 	return 0;
 }
 
+static void __init macide_setup_ports(hw_regs_t *hw, unsigned long base,
+				      int irq, ide_ack_intr_t *ack_intr)
+{
+	int i;
+
+	memset(hw, 0, sizeof(*hw));
+
+	for (i = 0; i < 8; i++)
+		hw->io_ports[i] = base + i * 4;
+
+	hw->io_ports[IDE_CONTROL_OFFSET] = IDE_CONTROL;
+
+	hw->irq = irq;
+	hw->ack_intr = ack_intr;
+}
+
 static const char *mac_ide_name[] =
 	{ "Quadra", "Powerbook", "Powerbook Baboon" };
 
@@ -86,27 +89,27 @@ static const char *mac_ide_name[] =
 
 static int __init macide_init(void)
 {
-	hw_regs_t hw;
 	ide_hwif_t *hwif;
+	ide_ack_intr_t *ack_intr;
+	unsigned long base;
+	int irq;
+	hw_regs_t hw;
 
 	switch (macintosh_config->ide_type) {
 	case MAC_IDE_QUADRA:
-		ide_setup_ports(&hw, IDE_BASE, macide_offsets,
-				0, 0, macide_ack_intr,
-//				quadra_ide_iops,
-				IRQ_NUBUS_F);
+		base = IDE_BASE;
+		ack_intr = macide_ack_intr;
+		irq = IRQ_NUBUS_F;
 		break;
 	case MAC_IDE_PB:
-		ide_setup_ports(&hw, IDE_BASE, macide_offsets,
-				0, 0, macide_ack_intr,
-//				macide_pb_iops,
-				IRQ_NUBUS_C);
+		base = IDE_BASE;
+		ack_intr = macide_ack_intr;
+		irq = IRQ_NUBUS_C;
 		break;
 	case MAC_IDE_BABOON:
-		ide_setup_ports(&hw, BABOON_BASE, macide_offsets,
-				0, 0, NULL,
-//				macide_baboon_iops,
-				IRQ_BABOON_1);
+		base = BABOON_BASE;
+		ack_intr = NULL;
+		irq = IRQ_BABOON_1;
 		break;
 	default:
 		return -ENODEV;
@@ -115,6 +118,8 @@ static int __init macide_init(void)
 	printk(KERN_INFO "ide: Macintosh %s IDE controller\n",
 			 mac_ide_name[macintosh_config->ide_type - 1]);
 
+	macide_setup_ports(&hw, base, irq, ack_intr);
+
 	hwif = ide_find_port(hw.io_ports[IDE_DATA_OFFSET]);
 	if (hwif) {
 		u8 index = hwif->index;
diff --git a/drivers/ide/legacy/q40ide.c b/drivers/ide/legacy/q40ide.c
index 2f0b34d..1381b91 100644
--- a/drivers/ide/legacy/q40ide.c
+++ b/drivers/ide/legacy/q40ide.c
@@ -66,16 +66,12 @@ static int q40ide_default_irq(unsigned long base)
 
 
 /*
- * This is very similar to ide_setup_ports except that addresses
- * are pretranslated for q40 ISA access
+ * Addresses are pretranslated for Q40 ISA access.
  */
 void q40_ide_setup_ports ( hw_regs_t *hw,
 			unsigned long base, int *offsets,
 			unsigned long ctrl, unsigned long intr,
 			ide_ack_intr_t *ack_intr,
-/*
- *			ide_io_ops_t *iops,
- */
 			int irq)
 {
 	int i;
@@ -92,9 +88,6 @@ void q40_ide_setup_ports ( hw_regs_t *hw,
 
 	hw->irq = irq;
 	hw->ack_intr = ack_intr;
-/*
- *	hw->iops = iops;
- */
 }
 
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e02fd11..2f66aaa 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -194,17 +194,6 @@ struct ide_drive_s;
 int ide_register_hw(hw_regs_t *, void (*)(struct ide_drive_s *),
 		    struct hwif_s **);
 
-void ide_setup_ports(	hw_regs_t *hw,
-			unsigned long base,
-			int *offsets,
-			unsigned long ctrl,
-			unsigned long intr,
-			ide_ack_intr_t *ack_intr,
-#if 0
-			ide_io_ops_t *iops,
-#endif
-			int irq);
-
 static inline void ide_std_init_ports(hw_regs_t *hw,
 				      unsigned long io_addr,
 				      unsigned long ctl_addr)
-- 
cgit v1.1


From c47137a99c597330b69057158b26061a360c0e09 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:51 +0100
Subject: ide: add ide_read_[alt]status() inline helpers

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/arm/icside.c   |  2 +-
 drivers/ide/ide-cd.c       |  7 ++++---
 drivers/ide/ide-dma.c      |  3 ++-
 drivers/ide/ide-floppy.c   |  4 ++--
 drivers/ide/ide-io.c       | 12 ++++++------
 drivers/ide/ide-iops.c     | 39 ++++++++++++++++++++++++--------------
 drivers/ide/ide-probe.c    | 47 ++++++++++++++++++++++++----------------------
 drivers/ide/ide-tape.c     |  7 ++++---
 drivers/ide/ide-taskfile.c | 30 ++++++++++++++---------------
 drivers/scsi/ide-scsi.c    |  4 ++--
 include/linux/ide.h        | 14 ++++++++++++++
 11 files changed, 100 insertions(+), 69 deletions(-)

diff --git a/drivers/ide/arm/icside.c b/drivers/ide/arm/icside.c
index fb00f38..e816b0f 100644
--- a/drivers/ide/arm/icside.c
+++ b/drivers/ide/arm/icside.c
@@ -365,7 +365,7 @@ static void icside_dma_timeout(ide_drive_t *drive)
 	if (icside_dma_test_irq(drive))
 		return;
 
-	ide_dump_status(drive, "DMA timeout", HWIF(drive)->INB(IDE_STATUS_REG));
+	ide_dump_status(drive, "DMA timeout", ide_read_status(drive));
 
 	icside_dma_end(drive);
 }
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index ee4d458..892e42e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -295,7 +295,8 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 	int stat, err, sense_key;
 	
 	/* Check for errors. */
-	stat = HWIF(drive)->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
+
 	if (stat_ret)
 		*stat_ret = stat;
 
@@ -692,7 +693,7 @@ int ide_cd_check_ireason(ide_drive_t *drive, int len, int ireason, int rw)
 		/* Some drives (ASUS) seem to tell us that status
 		 * info is available. just get it and ignore.
 		 */
-		(void) HWIF(drive)->INB(IDE_STATUS_REG);
+		(void)ide_read_status(drive);
 		return 0;
 	} else {
 		/* Drive wants a command packet, or invalid ireason... */
@@ -1326,7 +1327,7 @@ ide_do_rw_cdrom (ide_drive_t *drive, struct request *rq, sector_t block)
 	if (blk_fs_request(rq)) {
 		if (info->cd_flags & IDE_CD_FLAG_SEEKING) {
 			unsigned long elapsed = jiffies - info->start_seek;
-			int stat = HWIF(drive)->INB(IDE_STATUS_REG);
+			int stat = ide_read_status(drive);
 
 			if ((stat & SEEK_STAT) != SEEK_STAT) {
 				if (elapsed < IDECD_SEEK_TIMEOUT) {
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 3cf59f2..a4bb328 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -147,7 +147,8 @@ ide_startstop_t ide_dma_intr (ide_drive_t *drive)
 	u8 stat = 0, dma_stat = 0;
 
 	dma_stat = HWIF(drive)->ide_dma_end(drive);
-	stat = HWIF(drive)->INB(IDE_STATUS_REG);	/* get drive status */
+	stat = ide_read_status(drive);
+
 	if (OK_STAT(stat,DRIVE_READY,drive->bad_wstat|DRQ_STAT)) {
 		if (!dma_stat) {
 			struct request *rq = HWGROUP(drive)->rq;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index f8fe6ee..3ac79ed 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -501,7 +501,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive)
 	}
 
 	/* Clear the interrupt */
-	stat = drive->hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
 
 	/* No more interrupts */
 	if ((stat & DRQ_STAT) == 0) {
@@ -1246,7 +1246,7 @@ static int idefloppy_get_format_progress(ide_drive_t *drive, int __user *arg)
 		u8 stat;
 
 		local_irq_save(flags);
-		stat = drive->hwif->INB(IDE_STATUS_REG);
+		stat = ide_read_status(drive);
 		local_irq_restore(flags);
 
 		progress_indication = ((stat & SEEK_STAT) == 0) ? 0 : 0x10000;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 4bddef0..29cb043 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -466,7 +466,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, u8
 		return ide_stopped;
 	}
 
-	if (hwif->INB(IDE_STATUS_REG) & (BUSY_STAT|DRQ_STAT))
+	if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT))
 		rq->errors |= ERROR_RESET;
 
 	if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
@@ -493,7 +493,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, u
 		/* add decoding error stuff */
 	}
 
-	if (hwif->INB(IDE_STATUS_REG) & (BUSY_STAT|DRQ_STAT))
+	if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT))
 		/* force an abort */
 		hwif->OUTB(WIN_IDLEIMMEDIATE, IDE_COMMAND_REG);
 
@@ -821,8 +821,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 #ifdef DEBUG
  	printk("%s: DRIVE_CMD (null)\n", drive->name);
 #endif
- 	ide_end_drive_cmd(drive,
-			hwif->INB(IDE_STATUS_REG),
+	ide_end_drive_cmd(drive, ide_read_status(drive),
 			hwif->INB(IDE_ERROR_REG));
  	return ide_stopped;
 }
@@ -1231,7 +1230,7 @@ static ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
 		(void)HWIF(drive)->ide_dma_end(drive);
 		ret = ide_error(drive, "dma timeout error",
-						hwif->INB(IDE_STATUS_REG));
+				ide_read_status(drive));
 	} else {
 		printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
 		hwif->dma_timeout(drive);
@@ -1355,7 +1354,8 @@ void ide_timer_expiry (unsigned long data)
 					startstop = ide_dma_timeout_retry(drive, wait);
 				} else
 					startstop =
-					ide_error(drive, "irq timeout", hwif->INB(IDE_STATUS_REG));
+					ide_error(drive, "irq timeout",
+						  ide_read_status(drive));
 			}
 			drive->service_time = jiffies - drive->service_start;
 			spin_lock_irq(&ide_lock);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 7647ac4..716244c 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -430,10 +430,10 @@ int drive_is_ready (ide_drive_t *drive)
 	 * about possible isa-pnp and pci-pnp issues yet.
 	 */
 	if (IDE_CONTROL_REG)
-		stat = hwif->INB(IDE_ALTSTATUS_REG);
+		stat = ide_read_altstatus(drive);
 	else
 		/* Note: this may clear a pending IRQ!! */
-		stat = hwif->INB(IDE_STATUS_REG);
+		stat = ide_read_status(drive);
 
 	if (stat & BUSY_STAT)
 		/* drive busy:  definitely not interrupting */
@@ -458,23 +458,24 @@ EXPORT_SYMBOL(drive_is_ready);
  */
 static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, unsigned long timeout, u8 *rstat)
 {
-	ide_hwif_t *hwif = drive->hwif;
 	unsigned long flags;
 	int i;
 	u8 stat;
 
 	udelay(1);	/* spec allows drive 400ns to assert "BUSY" */
-	if ((stat = hwif->INB(IDE_STATUS_REG)) & BUSY_STAT) {
+	stat = ide_read_status(drive);
+
+	if (stat & BUSY_STAT) {
 		local_irq_set(flags);
 		timeout += jiffies;
-		while ((stat = hwif->INB(IDE_STATUS_REG)) & BUSY_STAT) {
+		while ((stat = ide_read_status(drive)) & BUSY_STAT) {
 			if (time_after(jiffies, timeout)) {
 				/*
 				 * One last read after the timeout in case
 				 * heavy interrupt load made us not make any
 				 * progress during the timeout..
 				 */
-				stat = hwif->INB(IDE_STATUS_REG);
+				stat = ide_read_status(drive);
 				if (!(stat & BUSY_STAT))
 					break;
 
@@ -494,7 +495,9 @@ static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, unsigned long ti
 	 */
 	for (i = 0; i < 10; i++) {
 		udelay(1);
-		if (OK_STAT((stat = hwif->INB(IDE_STATUS_REG)), good, bad)) {
+		stat = ide_read_status(drive);
+
+		if (OK_STAT(stat, good, bad)) {
 			*rstat = stat;
 			return 0;
 		}
@@ -617,6 +620,7 @@ int ide_driveid_update(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	struct hd_driveid *id;
 	unsigned long timeout, flags;
+	u8 stat;
 
 	/*
 	 * Re-read drive->id for possible DMA mode
@@ -633,10 +637,15 @@ int ide_driveid_update(ide_drive_t *drive)
 			SELECT_MASK(drive, 0);
 			return 0;	/* drive timed-out */
 		}
+
 		msleep(50);	/* give drive a breather */
-	} while (hwif->INB(IDE_ALTSTATUS_REG) & BUSY_STAT);
+		stat = ide_read_altstatus(drive);
+	} while (stat & BUSY_STAT);
+
 	msleep(50);	/* wait for IRQ and DRQ_STAT */
-	if (!OK_STAT(hwif->INB(IDE_STATUS_REG),DRQ_STAT,BAD_R_STAT)) {
+	stat = ide_read_status(drive);
+
+	if (!OK_STAT(stat, DRQ_STAT, BAD_R_STAT)) {
 		SELECT_MASK(drive, 0);
 		printk("%s: CHECK for good STATUS\n", drive->name);
 		return 0;
@@ -649,7 +658,7 @@ int ide_driveid_update(ide_drive_t *drive)
 		return 0;
 	}
 	ata_input_data(drive, id, SECTOR_WORDS);
-	(void) hwif->INB(IDE_STATUS_REG);	/* clear drive IRQ */
+	(void)ide_read_status(drive);	/* clear drive IRQ */
 	local_irq_enable();
 	local_irq_restore(flags);
 	ide_fix_driveid(id);
@@ -850,15 +859,15 @@ static ide_startstop_t do_reset1 (ide_drive_t *, int);
 static ide_startstop_t atapi_reset_pollfunc (ide_drive_t *drive)
 {
 	ide_hwgroup_t *hwgroup	= HWGROUP(drive);
-	ide_hwif_t *hwif	= HWIF(drive);
 	u8 stat;
 
 	SELECT_DRIVE(drive);
 	udelay (10);
+	stat = ide_read_status(drive);
 
-	if (OK_STAT(stat = hwif->INB(IDE_STATUS_REG), 0, BUSY_STAT)) {
+	if (OK_STAT(stat, 0, BUSY_STAT))
 		printk("%s: ATAPI reset complete\n", drive->name);
-	} else {
+	else {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
 			ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
@@ -897,7 +906,9 @@ static ide_startstop_t reset_pollfunc (ide_drive_t *drive)
 		}
 	}
 
-	if (!OK_STAT(tmp = hwif->INB(IDE_STATUS_REG), 0, BUSY_STAT)) {
+	tmp = ide_read_status(drive);
+
+	if (!OK_STAT(tmp, 0, BUSY_STAT)) {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
 			ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 9c07bdb..fd0ef82 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -264,8 +264,7 @@ err_misc:
 static int actual_try_to_identify (ide_drive_t *drive, u8 cmd)
 {
 	ide_hwif_t *hwif = HWIF(drive);
-	int rc;
-	unsigned long hd_status;
+	int use_altstatus = 0, rc;
 	unsigned long timeout;
 	u8 s = 0, a = 0;
 
@@ -273,19 +272,17 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd)
 	msleep(50);
 
 	if (IDE_CONTROL_REG) {
-		a = hwif->INB(IDE_ALTSTATUS_REG);
-		s = hwif->INB(IDE_STATUS_REG);
-		if ((a ^ s) & ~INDEX_STAT) {
-			printk(KERN_INFO "%s: probing with STATUS(0x%02x) instead of "
-				"ALTSTATUS(0x%02x)\n", drive->name, s, a);
+		a = ide_read_altstatus(drive);
+		s = ide_read_status(drive);
+		if ((a ^ s) & ~INDEX_STAT)
 			/* ancient Seagate drives, broken interfaces */
-			hd_status = IDE_STATUS_REG;
-		} else {
+			printk(KERN_INFO "%s: probing with STATUS(0x%02x) "
+					 "instead of ALTSTATUS(0x%02x)\n",
+					 drive->name, s, a);
+		else
 			/* use non-intrusive polling */
-			hd_status = IDE_ALTSTATUS_REG;
-		}
-	} else
-		hd_status = IDE_STATUS_REG;
+			use_altstatus = 1;
+	}
 
 	/* set features register for atapi
 	 * identify command to be sure of reply
@@ -306,11 +303,15 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd)
 		}
 		/* give drive a breather */
 		msleep(50);
-	} while ((hwif->INB(hd_status)) & BUSY_STAT);
+		s = use_altstatus ? ide_read_altstatus(drive)
+				  : ide_read_status(drive);
+	} while (s & BUSY_STAT);
 
 	/* wait for IRQ and DRQ_STAT */
 	msleep(50);
-	if (OK_STAT((hwif->INB(IDE_STATUS_REG)), DRQ_STAT, BAD_R_STAT)) {
+	s = ide_read_status(drive);
+
+	if (OK_STAT(s, DRQ_STAT, BAD_R_STAT)) {
 		unsigned long flags;
 
 		/* local CPU only; some systems need this */
@@ -320,7 +321,7 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd)
 		/* drive responded with ID */
 		rc = 0;
 		/* clear drive IRQ */
-		(void) hwif->INB(IDE_STATUS_REG);
+		(void)ide_read_status(drive);
 		local_irq_restore(flags);
 	} else {
 		/* drive refused ID */
@@ -367,7 +368,7 @@ static int try_to_identify (ide_drive_t *drive, u8 cmd)
 
 		ide_set_irq(drive, 0);
 		/* clear drive IRQ */
-		(void) hwif->INB(IDE_STATUS_REG);
+		(void)ide_read_status(drive);
 		udelay(5);
 		irq = probe_irq_off(cookie);
 		if (!hwif->irq) {
@@ -455,7 +456,9 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 		return 3;
 	}
 
-	if (OK_STAT((hwif->INB(IDE_STATUS_REG)), READY_STAT, BUSY_STAT) ||
+	stat = ide_read_status(drive);
+
+	if (OK_STAT(stat, READY_STAT, BUSY_STAT) ||
 	    drive->present || cmd == WIN_PIDENTIFY) {
 		/* send cmd and wait */
 		if ((rc = try_to_identify(drive, cmd))) {
@@ -463,7 +466,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 			rc = try_to_identify(drive,cmd);
 		}
 
-		stat = hwif->INB(IDE_STATUS_REG);
+		stat = ide_read_status(drive);
 
 		if (stat == (BUSY_STAT | READY_STAT))
 			return 4;
@@ -482,7 +485,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 		}
 
 		/* ensure drive IRQ is clear */
-		stat = hwif->INB(IDE_STATUS_REG);
+		stat = ide_read_status(drive);
 
 		if (rc == 1)
 			printk(KERN_ERR "%s: no response (status = 0x%02x)\n",
@@ -496,7 +499,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd)
 		SELECT_DRIVE(&hwif->drives[0]);
 		msleep(50);
 		/* ensure drive irq is clear */
-		(void) hwif->INB(IDE_STATUS_REG);
+		(void)ide_read_status(drive);
 	}
 	return rc;
 }
@@ -521,7 +524,7 @@ static void enable_nest (ide_drive_t *drive)
 
 	msleep(50);
 
-	stat = hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
 
 	if (!OK_STAT(stat, 0, BAD_STAT))
 		printk(KERN_CONT "failed (status = 0x%02x)\n", stat);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index bf40d8c..66801c0 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1178,7 +1178,7 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 #endif /* IDETAPE_DEBUG_LOG */	
 
 	/* Clear the interrupt */
-	stat = hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
 
 	if (test_bit(PC_DMA_IN_PROGRESS, &pc->flags)) {
 		if (hwif->ide_dma_end(drive) || (stat & ERR_STAT)) {
@@ -1598,7 +1598,8 @@ static ide_startstop_t idetape_media_access_finished (ide_drive_t *drive)
 	idetape_pc_t *pc = tape->pc;
 	u8 stat;
 
-	stat = drive->hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
+
 	if (stat & SEEK_STAT) {
 		if (stat & ERR_STAT) {
 			/* Error detected */
@@ -1758,7 +1759,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	 * If the tape is still busy, postpone our request and service
 	 * the other device meanwhile.
 	 */
-	stat = drive->hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
 
 	if (!drive->dsc_overlap && !(rq->cmd[0] & REQ_IDETAPE_PC2))
 		set_bit(IDETAPE_IGNORE_DSC, &tape->flags);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4e1da1c..2545dde 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -189,12 +189,11 @@ EXPORT_SYMBOL_GPL(do_rw_taskfile);
  */
 static ide_startstop_t set_multmode_intr(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif = HWIF(drive);
-	u8 stat;
+	u8 stat = ide_read_status(drive);
 
-	if (OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) {
+	if (OK_STAT(stat, READY_STAT, BAD_STAT))
 		drive->mult_count = drive->mult_req;
-	} else {
+	else {
 		drive->mult_req = drive->mult_count = 0;
 		drive->special.b.recalibrate = 1;
 		(void) ide_dump_status(drive, "set_multmode", stat);
@@ -207,11 +206,10 @@ static ide_startstop_t set_multmode_intr(ide_drive_t *drive)
  */
 static ide_startstop_t set_geometry_intr(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif = HWIF(drive);
 	int retries = 5;
 	u8 stat;
 
-	while (((stat = hwif->INB(IDE_STATUS_REG)) & BUSY_STAT) && retries--)
+	while (((stat = ide_read_status(drive)) & BUSY_STAT) && retries--)
 		udelay(10);
 
 	if (OK_STAT(stat, READY_STAT, BAD_STAT))
@@ -230,10 +228,9 @@ static ide_startstop_t set_geometry_intr(ide_drive_t *drive)
  */
 static ide_startstop_t recal_intr(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif = HWIF(drive);
-	u8 stat;
+	u8 stat = ide_read_status(drive);
 
-	if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG), READY_STAT, BAD_STAT))
+	if (!OK_STAT(stat, READY_STAT, BAD_STAT))
 		return ide_error(drive, "recal_intr", stat);
 	return ide_stopped;
 }
@@ -248,10 +245,12 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 	u8 stat;
 
 	local_irq_enable_in_hardirq();
-	if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) {
+	stat = ide_read_status(drive);
+
+	if (!OK_STAT(stat, READY_STAT, BAD_STAT))
 		return ide_error(drive, "task_no_data_intr", stat);
 		/* calls ide_end_drive_cmd */
-	}
+
 	if (args)
 		ide_end_drive_cmd(drive, stat, hwif->INB(IDE_ERROR_REG));
 
@@ -260,7 +259,6 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 
 static u8 wait_drive_not_busy(ide_drive_t *drive)
 {
-	ide_hwif_t *hwif = HWIF(drive);
 	int retries;
 	u8 stat;
 
@@ -269,7 +267,9 @@ static u8 wait_drive_not_busy(ide_drive_t *drive)
 	 * This can take up to 10 usec, but we will wait max 1 ms.
 	 */
 	for (retries = 0; retries < 100; retries++) {
-		if ((stat = hwif->INB(IDE_STATUS_REG)) & BUSY_STAT)
+		stat = ide_read_status(drive);
+
+		if (stat & BUSY_STAT)
 			udelay(10);
 		else
 			break;
@@ -430,7 +430,7 @@ static ide_startstop_t task_in_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq = HWGROUP(drive)->rq;
-	u8 stat = hwif->INB(IDE_STATUS_REG);
+	u8 stat = ide_read_status(drive);
 
 	/* new way for dealing with premature shared PCI interrupts */
 	if (!OK_STAT(stat, DRQ_STAT, BAD_R_STAT)) {
@@ -465,7 +465,7 @@ static ide_startstop_t task_out_intr (ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq = HWGROUP(drive)->rq;
-	u8 stat = hwif->INB(IDE_STATUS_REG);
+	u8 stat = ide_read_status(drive);
 
 	if (!OK_STAT(stat, DRIVE_READY, drive->bad_wstat))
 		return task_error(drive, rq, __FUNCTION__, stat);
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 6c4f0f0..68e5c632 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -287,7 +287,7 @@ static int idescsi_end_request(ide_drive_t *, int, int);
 static ide_startstop_t
 idescsi_atapi_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err)
 {
-	if (HWIF(drive)->INB(IDE_STATUS_REG) & (BUSY_STAT|DRQ_STAT))
+	if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT))
 		/* force an abort */
 		HWIF(drive)->OUTB(WIN_IDLEIMMEDIATE,IDE_COMMAND_REG);
 
@@ -423,7 +423,7 @@ static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive)
 	}
 
 	/* Clear the interrupt */
-	stat = drive->hwif->INB(IDE_STATUS_REG);
+	stat = ide_read_status(drive);
 
 	if ((stat & DRQ_STAT) == 0) {
 		/* No more interrupts */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 2f66aaa..d212492 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1309,4 +1309,18 @@ static inline void ide_set_irq(ide_drive_t *drive, int on)
 	drive->hwif->OUTB(drive->ctl | (on ? 0 : 2), IDE_CONTROL_REG);
 }
 
+static inline u8 ide_read_status(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+
+	return hwif->INB(hwif->io_ports[IDE_STATUS_OFFSET]);
+}
+
+static inline u8 ide_read_altstatus(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+
+	return hwif->INB(hwif->io_ports[IDE_CONTROL_OFFSET]);
+}
+
 #endif /* _IDE_H */
-- 
cgit v1.1


From 64a57fe4393bae920d03c253173f59d8a7ec8e25 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 6 Feb 2008 02:57:51 +0100
Subject: ide: add ide_read_error() inline helper

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c       | 2 +-
 drivers/ide/ide-floppy.c   | 2 +-
 drivers/ide/ide-io.c       | 4 ++--
 drivers/ide/ide-iops.c     | 4 +++-
 drivers/ide/ide-lib.c      | 2 +-
 drivers/ide/ide-tape.c     | 2 +-
 drivers/ide/ide-taskfile.c | 5 ++---
 include/linux/ide.h        | 7 +++++++
 8 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 892e42e..5e42c19 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -304,7 +304,7 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret)
 		return 0;
 
 	/* Get the IDE error register. */
-	err = HWIF(drive)->INB(IDE_ERROR_REG);
+	err = ide_read_error(drive);
 	sense_key = err >> 4;
 
 	if (rq == NULL) {
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3ac79ed..faf22d7 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -465,7 +465,7 @@ static void idefloppy_retry_pc(ide_drive_t *drive)
 	idefloppy_pc_t *pc;
 	struct request *rq;
 
-	(void)drive->hwif->INB(IDE_ERROR_REG);
+	(void)ide_read_error(drive);
 	pc = idefloppy_next_pc_storage(drive);
 	rq = idefloppy_next_rq_storage(drive);
 	idefloppy_create_request_sense_cmd(pc);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 29cb043..3addbe4 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -821,8 +821,8 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 #ifdef DEBUG
  	printk("%s: DRIVE_CMD (null)\n", drive->name);
 #endif
-	ide_end_drive_cmd(drive, ide_read_status(drive),
-			hwif->INB(IDE_ERROR_REG));
+	ide_end_drive_cmd(drive, ide_read_status(drive), ide_read_error(drive));
+
  	return ide_stopped;
 }
 
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index 716244c..c32e759 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -918,7 +918,9 @@ static ide_startstop_t reset_pollfunc (ide_drive_t *drive)
 		drive->failures++;
 	} else  {
 		printk("%s: reset: ", hwif->name);
-		if ((tmp = hwif->INB(IDE_ERROR_REG)) == 1) {
+		tmp = ide_read_error(drive);
+
+		if (tmp == 1) {
 			printk("success\n");
 			drive->failures = 0;
 		} else {
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index b42940d..1ff676cc 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -578,7 +578,7 @@ u8 ide_dump_status(ide_drive_t *drive, const char *msg, u8 stat)
 	}
 	printk("}\n");
 	if ((stat & (BUSY_STAT|ERR_STAT)) == ERR_STAT) {
-		err = drive->hwif->INB(IDE_ERROR_REG);
+		err = ide_read_error(drive);
 		printk("%s: %s: error=0x%02x ", drive->name, msg, err);
 		if (drive->media == ide_disk)
 			ide_dump_ata_error(drive, err);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 66801c0..4017313 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1125,7 +1125,7 @@ static ide_startstop_t idetape_retry_pc (ide_drive_t *drive)
 	idetape_pc_t *pc;
 	struct request *rq;
 
-	(void)drive->hwif->INB(IDE_ERROR_REG);
+	(void)ide_read_error(drive);
 	pc = idetape_next_pc_storage(drive);
 	rq = idetape_next_rq_storage(drive);
 	idetape_create_request_sense_cmd(pc);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 2545dde..0518a2e 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -241,7 +241,6 @@ static ide_startstop_t recal_intr(ide_drive_t *drive)
 static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 {
 	ide_task_t *args	= HWGROUP(drive)->rq->special;
-	ide_hwif_t *hwif	= HWIF(drive);
 	u8 stat;
 
 	local_irq_enable_in_hardirq();
@@ -252,7 +251,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 		/* calls ide_end_drive_cmd */
 
 	if (args)
-		ide_end_drive_cmd(drive, stat, hwif->INB(IDE_ERROR_REG));
+		ide_end_drive_cmd(drive, stat, ide_read_error(drive));
 
 	return ide_stopped;
 }
@@ -408,7 +407,7 @@ static ide_startstop_t task_error(ide_drive_t *drive, struct request *rq,
 void task_end_request(ide_drive_t *drive, struct request *rq, u8 stat)
 {
 	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-		u8 err = drive->hwif->INB(IDE_ERROR_REG);
+		u8 err = ide_read_error(drive);
 
 		ide_end_drive_cmd(drive, stat, err);
 		return;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index d212492..acec99d 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1323,4 +1323,11 @@ static inline u8 ide_read_altstatus(ide_drive_t *drive)
 	return hwif->INB(hwif->io_ports[IDE_CONTROL_OFFSET]);
 }
 
+static inline u8 ide_read_error(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+
+	return hwif->INB(hwif->io_ports[IDE_ERROR_OFFSET]);
+}
+
 #endif /* _IDE_H */
-- 
cgit v1.1


From 8004a8c9744842a5a32b71d3a8093c652972bb23 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:51 +0100
Subject: ide-tape: refactor the debug logging facility

Teach the debug logging macro to differentiate between log levels based on the
type of debug level enabled specifically instead of a threshold-based one.
Thus, convert tape->debug_level to a bitmask that is written to over /proc.

Also,
- cleanup and simplify the debug macro thus removing a lot of code lines,
- get rid of unused debug levels,
- adjust the loglevel at several places where it was simply missing (e.g.
  idetape_chrdev_open())
- move the tape ptr initialization up in idetape_chrdev_open() so that we can
  use it in the debug_log macro earlier in the function.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 344 ++++++++++++++++++-------------------------------
 1 file changed, 122 insertions(+), 222 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4017313..64d46fe 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -45,6 +45,32 @@
 #include <asm/unaligned.h>
 #include <linux/mtio.h>
 
+enum {
+	/* output errors only */
+	DBG_ERR =		(1 << 0),
+	/* output all sense key/asc */
+	DBG_SENSE =		(1 << 1),
+	/* info regarding all chrdev-related procedures */
+	DBG_CHRDEV =		(1 << 2),
+	/* all remaining procedures */
+	DBG_PROCS =		(1 << 3),
+	/* buffer alloc info (pc_stack & rq_stack) */
+	DBG_PCRQ_STACK =	(1 << 4),
+};
+
+/* define to see debug info */
+#define IDETAPE_DEBUG_LOG		0
+
+#if IDETAPE_DEBUG_LOG
+#define debug_log(lvl, fmt, args...)			\
+{							\
+	if (tape->debug_mask & lvl)			\
+	printk(KERN_INFO "ide-tape: " fmt, ## args);	\
+}
+#else
+#define debug_log(lvl, fmt, args...) do {} while (0)
+#endif
+
 /**************************** Tunable parameters *****************************/
 
 
@@ -68,23 +94,6 @@
 #define IDETAPE_INCREASE_STAGES_RATE	 20
 
 /*
- *	The following are used to debug the driver:
- *
- *	Setting IDETAPE_DEBUG_LOG to 1 will log driver flow control.
- *
- *	Setting them to 0 will restore normal operation mode:
- *
- *		1.	Disable logging normal successful operations.
- *		2.	Disable self-sanity checks.
- *		3.	Errors will still be logged, of course.
- *
- *	All the #if DEBUG code will be removed some day, when the driver
- *	is verified to be stable enough. This will make it much more
- *	esthetic.
- */
-#define IDETAPE_DEBUG_LOG		0
-
-/*
  *	After each failed packet command we issue a request sense command
  *	and retry the packet command IDETAPE_MAX_PC_RETRIES times.
  *
@@ -451,18 +460,7 @@ typedef struct ide_tape_obj {
 	unsigned long uncontrolled_previous_head_time;
 	int restart_speed_control_req;
 
-        /*
-         * Debug_level determines amount of debugging output;
-         * can be changed using /proc/ide/hdx/settings
-         * 0 : almost no debugging output
-         * 1 : 0+output errors only
-         * 2 : 1+output all sensekey/asc
-         * 3 : 2+follow all chrdev related procedures
-         * 4 : 3+follow all procedures
-         * 5 : 4+include pc_stack rq_stack info
-         * 6 : 5+USE_COUNT updates
-         */
-         int debug_level; 
+	u32 debug_mask;
 } idetape_tape_t;
 
 static DEFINE_MUTEX(idetape_ref_mutex);
@@ -716,11 +714,8 @@ static idetape_pc_t *idetape_next_pc_storage (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 5)
-		printk(KERN_INFO "ide-tape: pc_stack_index=%d\n",
-			tape->pc_stack_index);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PCRQ_STACK, "pc_stack_index=%d\n", tape->pc_stack_index);
+
 	if (tape->pc_stack_index == IDETAPE_PC_STACK)
 		tape->pc_stack_index=0;
 	return (&tape->pc_stack[tape->pc_stack_index++]);
@@ -743,11 +738,8 @@ static struct request *idetape_next_rq_storage (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 5)
-		printk(KERN_INFO "ide-tape: rq_stack_index=%d\n",
-			tape->rq_stack_index);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PCRQ_STACK, "rq_stack_index=%d\n", tape->rq_stack_index);
+
 	if (tape->rq_stack_index == IDETAPE_PC_STACK)
 		tape->rq_stack_index=0;
 	return (&tape->rq_stack[tape->rq_stack_index++]);
@@ -780,17 +772,9 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	tape->sense_key = sense[2] & 0xF;
 	tape->asc       = sense[12];
 	tape->ascq      = sense[13];
-#if IDETAPE_DEBUG_LOG
-	/*
-	 * Without debugging, we only log an error if we decided to give up
-	 * retrying.
-	 */
-	if (tape->debug_level >= 1)
-		printk(KERN_INFO "ide-tape: pc = %x, sense key = %x, "
-			"asc = %x, ascq = %x\n",
-			pc->c[0], tape->sense_key,
-			tape->asc, tape->ascq);
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_ERR, "pc = %x, sense key = %x, asc = %x, ascq = %x\n",
+		 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
 	/* Correct pc->actually_transferred by asking the tape.	 */
 	if (test_bit(PC_DMA_ERROR, &pc->flags)) {
@@ -843,12 +827,11 @@ static void idetape_activate_next_stage(ide_drive_t *drive)
 	idetape_stage_t *stage = tape->next_stage;
 	struct request *rq = &stage->rq;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_active_next_stage\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	if (stage == NULL) {
-		printk(KERN_ERR "ide-tape: bug: Trying to activate a non existing stage\n");
+		printk(KERN_ERR "ide-tape: bug: Trying to activate a non"
+				" existing stage\n");
 		return;
 	}
 
@@ -871,11 +854,8 @@ static void idetape_increase_max_pipeline_stages (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int increase = (tape->max_pipeline - tape->min_pipeline) / 10;
-	
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk (KERN_INFO "ide-tape: Reached idetape_increase_max_pipeline_stages\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	tape->max_stages += max(increase, 1);
 	tape->max_stages = max(tape->max_stages, tape->min_pipeline);
@@ -920,17 +900,16 @@ static void idetape_remove_stage_head (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_stage_t *stage;
-	
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_remove_stage_head\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	if (tape->first_stage == NULL) {
 		printk(KERN_ERR "ide-tape: bug: tape->first_stage is NULL\n");
 		return;
 	}
 	if (tape->active_stage == tape->first_stage) {
-		printk(KERN_ERR "ide-tape: bug: Trying to free our active pipeline stage\n");
+		printk(KERN_ERR "ide-tape: bug: Trying to free our active "
+				"pipeline stage\n");
 		return;
 	}
 	stage = tape->first_stage;
@@ -957,10 +936,8 @@ static void idetape_abort_pipeline(ide_drive_t *drive,
 	idetape_stage_t *stage = new_last_stage->next;
 	idetape_stage_t *nstage;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: %s: idetape_abort_pipeline called\n", tape->name);
-#endif
+	debug_log(DBG_PROCS, "%s: Enter %s\n", tape->name, __func__);
+
 	while (stage) {
 		nstage = stage->next;
 		idetape_kfree_stage(tape, stage);
@@ -987,10 +964,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 	int remove_stage = 0;
 	idetape_stage_t *active_stage;
 
-#if IDETAPE_DEBUG_LOG
-        if (tape->debug_level >= 4)
-	printk(KERN_INFO "ide-tape: Reached idetape_end_request\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	switch (uptodate) {
 		case 0:	error = IDETAPE_ERROR_GENERAL; break;
@@ -1055,10 +1029,8 @@ static ide_startstop_t idetape_request_sense_callback (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_request_sense_callback\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	if (!tape->pc->error) {
 		idetape_analyze_error(drive, tape->pc->buffer);
 		idetape_end_request(drive, 1, 0);
@@ -1143,10 +1115,8 @@ static void idetape_postpone_request (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: idetape_postpone_request\n");
-#endif
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	tape->postponed_rq = HWGROUP(drive)->rq;
 	ide_stall_queue(drive, tape->dsc_polling_frequency);
 }
@@ -1171,11 +1141,7 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 	u16 bcount;
 	u8 stat, ireason;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_pc_intr "
-				"interrupt handler\n");
-#endif /* IDETAPE_DEBUG_LOG */	
+	debug_log(DBG_PROCS, "Enter %s - interrupt handler\n", __func__);
 
 	/* Clear the interrupt */
 	stat = ide_read_status(drive);
@@ -1208,20 +1174,16 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 			pc->actually_transferred = pc->request_transfer;
 			idetape_update_buffers(pc);
 		}
-#if IDETAPE_DEBUG_LOG
-		if (tape->debug_level >= 4)
-			printk(KERN_INFO "ide-tape: DMA finished\n");
-#endif /* IDETAPE_DEBUG_LOG */
+		debug_log(DBG_PROCS, "DMA finished\n");
+
 	}
 
 	/* No more interrupts */
 	if ((stat & DRQ_STAT) == 0) {
-#if IDETAPE_DEBUG_LOG
-		if (tape->debug_level >= 2)
-			printk(KERN_INFO "ide-tape: Packet command completed, %d bytes transferred\n", pc->actually_transferred);
-#endif /* IDETAPE_DEBUG_LOG */
-		clear_bit(PC_DMA_IN_PROGRESS, &pc->flags);
+		debug_log(DBG_SENSE, "Packet command completed, %d bytes"
+				" transferred\n", pc->actually_transferred);
 
+		clear_bit(PC_DMA_IN_PROGRESS, &pc->flags);
 		local_irq_enable();
 
 #if SIMULATE_ERRORS
@@ -1236,19 +1198,15 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 			stat &= ~ERR_STAT;
 		if ((stat & ERR_STAT) || test_bit(PC_DMA_ERROR, &pc->flags)) {
 			/* Error detected */
-#if IDETAPE_DEBUG_LOG
-			if (tape->debug_level >= 1)
-				printk(KERN_INFO "ide-tape: %s: I/O error\n",
-					tape->name);
-#endif /* IDETAPE_DEBUG_LOG */
+			debug_log(DBG_ERR, "%s: I/O error\n", tape->name);
+
 			if (pc->c[0] == REQUEST_SENSE) {
 				printk(KERN_ERR "ide-tape: I/O error in request sense command\n");
 				return ide_do_reset(drive);
 			}
-#if IDETAPE_DEBUG_LOG
-			if (tape->debug_level >= 1)
-				printk(KERN_INFO "ide-tape: [cmd %x]: check condition\n", pc->c[0]);
-#endif
+			debug_log(DBG_ERR, "[cmd %x]: check condition\n",
+					pc->c[0]);
+
 			/* Retry operation */
 			return idetape_retry_pc(drive);
 		}
@@ -1303,10 +1261,9 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 				ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL);
 				return ide_started;
 			}
-#if IDETAPE_DEBUG_LOG
-			if (tape->debug_level >= 2)
-				printk(KERN_NOTICE "ide-tape: The tape wants to send us more data than expected - allowing transfer\n");
-#endif /* IDETAPE_DEBUG_LOG */
+			debug_log(DBG_SENSE, "The tape wants to send us more "
+				"data than expected - allowing transfer\n");
+
 		}
 	}
 	if (test_bit(PC_WRITING, &pc->flags)) {
@@ -1327,11 +1284,10 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 	/* Update the current position */
 	pc->actually_transferred += bcount;
 	pc->current_position += bcount;
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 2)
-		printk(KERN_INFO "ide-tape: [cmd %x] transferred %d bytes "
-				 "on that interrupt\n", pc->c[0], bcount);
-#endif
+
+	debug_log(DBG_SENSE, "[cmd %x] transferred %d bytes on that intr.\n",
+			pc->c[0], bcount);
+
 	/* And set the interrupt handler again */
 	ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL);
 	return ide_started;
@@ -1464,10 +1420,7 @@ static ide_startstop_t idetape_issue_packet_command (ide_drive_t *drive, idetape
 		tape->failed_pc = NULL;
 		return pc->callback(drive);
 	}
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 2)
-		printk(KERN_INFO "ide-tape: Retry number - %d, cmd = %02X\n", pc->retries, pc->c[0]);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
 
 	pc->retries++;
 	/* We haven't transferred any data yet */
@@ -1505,11 +1458,8 @@ static ide_startstop_t idetape_issue_packet_command (ide_drive_t *drive, idetape
 static ide_startstop_t idetape_pc_callback (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_pc_callback\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	idetape_end_request(drive, tape->pc->error ? 0 : 1, 0);
 	return ide_stopped;
@@ -1641,11 +1591,7 @@ static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
 		tape->avg_size = 0;
 		tape->avg_time = jiffies;
 	}
-
-#if IDETAPE_DEBUG_LOG	
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_rw_callback\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	tape->first_frame_position += blocks;
 	rq->current_nr_sectors -= blocks;
@@ -1721,12 +1667,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct request *postponed_rq = tape->postponed_rq;
 	u8 stat;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 2)
-		printk(KERN_INFO "ide-tape: sector: %ld, "
-			"nr_sectors: %ld, current_nr_sectors: %d\n",
+	debug_log(DBG_SENSE, "sector: %ld, nr_sectors: %ld,"
+			" current_nr_sectors: %d\n",
 			rq->sector, rq->nr_sectors, rq->current_nr_sectors);
-#endif /* IDETAPE_DEBUG_LOG */
 
 	if (!blk_special_request(rq)) {
 		/*
@@ -1917,10 +1860,7 @@ static idetape_stage_t *idetape_kmalloc_stage (idetape_tape_t *tape)
 {
 	idetape_stage_t *cache_stage = tape->cache_stage;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_kmalloc_stage\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	if (tape->nr_stages >= tape->max_stages)
 		return NULL;
@@ -2019,11 +1959,9 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
-	
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk (KERN_INFO "ide-tape: Reached idetape_add_stage_tail\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	spin_lock_irqsave(&tape->spinlock, flags);
 	stage->next = NULL;
 	if (tape->last_stage != NULL)
@@ -2066,29 +2004,22 @@ static ide_startstop_t idetape_read_position_callback (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_read_position_result_t *result;
-	
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_read_position_callback\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	if (!tape->pc->error) {
 		result = (idetape_read_position_result_t *) tape->pc->buffer;
-#if IDETAPE_DEBUG_LOG
-		if (tape->debug_level >= 2)
-			printk(KERN_INFO "ide-tape: BOP - %s\n",result->bop ? "Yes":"No");
-		if (tape->debug_level >= 2)
-			printk(KERN_INFO "ide-tape: EOP - %s\n",result->eop ? "Yes":"No");
-#endif /* IDETAPE_DEBUG_LOG */
+		debug_log(DBG_SENSE, "BOP - %s\n", result->bop ? "Yes" : "No");
+		debug_log(DBG_SENSE, "EOP - %s\n", result->eop ? "Yes" : "No");
+
 		if (result->bpu) {
 			printk(KERN_INFO "ide-tape: Block location is unknown to the tape\n");
 			clear_bit(IDETAPE_ADDRESS_VALID, &tape->flags);
 			idetape_end_request(drive, 0, 0);
 		} else {
-#if IDETAPE_DEBUG_LOG
-			if (tape->debug_level >= 2)
-				printk(KERN_INFO "ide-tape: Block Location - %u\n", ntohl(result->first_block));
-#endif /* IDETAPE_DEBUG_LOG */
+			debug_log(DBG_SENSE, "Block Location - %u\n",
+					ntohl(result->first_block));
+
 			tape->partition = result->partition;
 			tape->first_frame_position = ntohl(result->first_block);
 			tape->last_frame_position = ntohl(result->last_block);
@@ -2228,10 +2159,7 @@ static int idetape_read_position (ide_drive_t *drive)
 	idetape_pc_t pc;
 	int position;
 
-#if IDETAPE_DEBUG_LOG
-        if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_read_position\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	idetape_create_read_position_cmd(&pc);
 	if (idetape_queue_pc_tail(drive, &pc))
@@ -2365,12 +2293,11 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct
 	idetape_tape_t *tape = drive->driver_data;
 	struct request rq;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 2)
-		printk(KERN_INFO "ide-tape: idetape_queue_rw_tail: cmd=%d\n",cmd);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+
 	if (idetape_pipeline_active(tape)) {
-		printk(KERN_ERR "ide-tape: bug: the pipeline is active in idetape_queue_rw_tail\n");
+		printk(KERN_ERR "ide-tape: bug: the pipeline is active in %s\n",
+				__func__);
 		return (0);
 	}
 
@@ -2474,10 +2401,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	unsigned long flags;
 	struct request *rq;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 3)
-		printk(KERN_INFO "ide-tape: Reached idetape_add_chrdev_write_request\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
      	/*
      	 *	Attempt to allocate a new stage.
@@ -2715,10 +2639,7 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 	struct request *rq_ptr;
 	int bytes_read;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_add_chrdev_read_request, %d blocks\n", blocks);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
 	/*
 	 * If we are at a filemark, return a read length of 0
@@ -2813,12 +2734,11 @@ static int idetape_rewind_tape (ide_drive_t *drive)
 {
 	int retval;
 	idetape_pc_t pc;
-#if IDETAPE_DEBUG_LOG
-	idetape_tape_t *tape = drive->driver_data;
-	if (tape->debug_level >= 2)
-		printk(KERN_INFO "ide-tape: Reached idetape_rewind_tape\n");
-#endif /* IDETAPE_DEBUG_LOG */	
-	
+	idetape_tape_t *tape;
+	tape = drive->driver_data;
+
+	debug_log(DBG_SENSE, "Enter %s\n", __func__);
+
 	idetape_create_rewind_cmd(drive, &pc);
 	retval = idetape_queue_pc_tail(drive, &pc);
 	if (retval)
@@ -2849,10 +2769,8 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned l
 		int nr_stages;
 	} config;
 
-#if IDETAPE_DEBUG_LOG	
-	if (tape->debug_level >= 4)
-		printk(KERN_INFO "ide-tape: Reached idetape_blkdev_ioctl\n");
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_PROCS, "Enter %s\n", __func__);
+
 	switch (cmd) {
 		case 0x0340:
 			if (copy_from_user(&config, argp, sizeof(config)))
@@ -2985,10 +2903,7 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 3)
-		printk(KERN_INFO "ide-tape: Reached idetape_chrdev_read, count %Zd\n", count);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	if (tape->chrdev_direction != idetape_direction_read) {
 		if (test_bit(IDETAPE_DETECT_BS, &tape->flags))
@@ -3030,10 +2945,8 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 	}
 finish:
 	if (!actually_read && test_bit(IDETAPE_FILEMARK, &tape->flags)) {
-#if IDETAPE_DEBUG_LOG
-		if (tape->debug_level >= 2)
-			printk(KERN_INFO "ide-tape: %s: spacing over filemark\n", tape->name);
-#endif
+		debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
+
 		idetape_space_over_filemarks(drive, MTFSF, 1);
 		return 0;
 	}
@@ -3054,11 +2967,7 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 	if (tape->write_prot)
 		return -EACCES;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 3)
-		printk(KERN_INFO "ide-tape: Reached idetape_chrdev_write, "
-			"count %Zd\n", count);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
 	if (tape->chrdev_direction != idetape_direction_write) {
@@ -3168,11 +3077,8 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
 	idetape_pc_t pc;
 	int i,retval;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 1)
-		printk(KERN_INFO "ide-tape: Handling MTIOCTOP ioctl: "
-			"mt_op=%d, mt_count=%d\n", mt_op, mt_count);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_ERR, "Handling MTIOCTOP ioctl: mt_op=%d, mt_count=%d\n",
+			mt_op, mt_count);
 	/*
 	 *	Commands which need our pipelined read-ahead stages.
 	 */
@@ -3292,11 +3198,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 	int block_offset = 0, position = tape->first_frame_position;
 	void __user *argp = (void __user *)arg;
 
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 3)
-		printk(KERN_INFO "ide-tape: Reached idetape_chrdev_ioctl, "
-			"cmd=%u\n", cmd);
-#endif /* IDETAPE_DEBUG_LOG */
+	debug_log(DBG_CHRDEV, "Enter %s, cmd=%u\n", __func__, cmd);
 
 	tape->restart_speed_control_req = 1;
 	if (tape->chrdev_direction == idetape_direction_write) {
@@ -3372,6 +3274,15 @@ static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 	idetape_pc_t pc;
 	int retval;
 
+	if (i >= MAX_HWIFS * MAX_DRIVES)
+		return -ENXIO;
+
+	tape = ide_tape_chrdev_get(i);
+	if (!tape)
+		return -ENXIO;
+
+	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
+
 	/*
 	 * We really want to do nonseekable_open(inode, filp); here, but some
 	 * versions of tar incorrectly call lseek on tapes and bail out if that
@@ -3379,16 +3290,6 @@ static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 	 */
 	filp->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
 
-#if IDETAPE_DEBUG_LOG
-	printk(KERN_INFO "ide-tape: Reached idetape_chrdev_open\n");
-#endif /* IDETAPE_DEBUG_LOG */
-	
-	if (i >= MAX_HWIFS * MAX_DRIVES)
-		return -ENXIO;
-
-	if (!(tape = ide_tape_chrdev_get(i)))
-		return -ENXIO;
-
 	drive = tape->drive;
 
 	filp->private_data = tape;
@@ -3479,10 +3380,8 @@ static int idetape_chrdev_release (struct inode *inode, struct file *filp)
 
 	lock_kernel();
 	tape = drive->driver_data;
-#if IDETAPE_DEBUG_LOG
-	if (tape->debug_level >= 3)
-		printk(KERN_INFO "ide-tape: Reached idetape_chrdev_release\n");
-#endif /* IDETAPE_DEBUG_LOG */
+
+	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
 	if (tape->chrdev_direction == idetape_direction_write)
 		idetape_write_release(drive, minor);
@@ -3650,7 +3549,8 @@ static void idetape_add_settings (ide_drive_t *drive)
 	ide_add_setting(drive,	"pipeline_head_speed_c",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->controlled_pipeline_head_speed,	NULL);
 	ide_add_setting(drive,	"pipeline_head_speed_u",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->uncontrolled_pipeline_head_speed,NULL);
 	ide_add_setting(drive,	"avg_speed",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->avg_speed,			NULL);
-	ide_add_setting(drive,	"debug_level",		SETTING_RW,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->debug_level,			NULL);
+	ide_add_setting(drive, "debug_mask", SETTING_RW, TYPE_INT, 0, 0xffff, 1,
+			1, &tape->debug_mask, NULL);
 }
 #else
 static inline void idetape_add_settings(ide_drive_t *drive) { ; }
-- 
cgit v1.1


From a2f5b7f42a73e99518a719189570da43c6b66657 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:51 +0100
Subject: ide-tape: remove struct idetape_read_position_result_t

There should be no functional changes resulting from this patch.

Bart:
- remove needless "!!"

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 50 ++++++++++++++++++--------------------------------
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 64d46fe..3c516ff 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -571,24 +571,6 @@ struct idetape_id_gcw {
 	unsigned protocol		:2;	/* Protocol type */
 };
 
-/*
- *	READ POSITION packet command - Data Format (From Table 6-57)
- */
-typedef struct {
-	unsigned	reserved0_10	:2;	/* Reserved */
-	unsigned	bpu		:1;	/* Block Position Unknown */	
-	unsigned	reserved0_543	:3;	/* Reserved */
-	unsigned	eop		:1;	/* End Of Partition */
-	unsigned	bop		:1;	/* Beginning Of Partition */
-	u8		partition;		/* Partition Number */
-	u8		reserved2, reserved3;	/* Reserved */
-	u32		first_block;		/* First Block Location */
-	u32		last_block;		/* Last Block Location (Optional) */
-	u8		reserved12;		/* Reserved */
-	u8		blocks_in_buffer[3];	/* Blocks In Buffer - (Optional) */
-	u32		bytes_in_buffer;	/* Bytes In Buffer (Optional) */
-} idetape_read_position_result_t;
-
 /* Structures related to the SELECT SENSE / MODE SENSE packet commands. */
 #define IDETAPE_BLOCK_DESCRIPTOR	0
 #define	IDETAPE_CAPABILITIES_PAGE	0x2a
@@ -2000,30 +1982,34 @@ static void idetape_wait_for_request (ide_drive_t *drive, struct request *rq)
 	spin_lock_irq(&tape->spinlock);
 }
 
-static ide_startstop_t idetape_read_position_callback (ide_drive_t *drive)
+static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	idetape_read_position_result_t *result;
+	u8 *readpos = tape->pc->buffer;
 
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	if (!tape->pc->error) {
-		result = (idetape_read_position_result_t *) tape->pc->buffer;
-		debug_log(DBG_SENSE, "BOP - %s\n", result->bop ? "Yes" : "No");
-		debug_log(DBG_SENSE, "EOP - %s\n", result->eop ? "Yes" : "No");
-
-		if (result->bpu) {
-			printk(KERN_INFO "ide-tape: Block location is unknown to the tape\n");
+		debug_log(DBG_SENSE, "BOP - %s\n",
+				(readpos[0] & 0x80) ? "Yes" : "No");
+		debug_log(DBG_SENSE, "EOP - %s\n",
+				(readpos[0] & 0x40) ? "Yes" : "No");
+
+		if (readpos[0] & 0x4) {
+			printk(KERN_INFO "ide-tape: Block location is unknown"
+					 "to the tape\n");
 			clear_bit(IDETAPE_ADDRESS_VALID, &tape->flags);
 			idetape_end_request(drive, 0, 0);
 		} else {
 			debug_log(DBG_SENSE, "Block Location - %u\n",
-					ntohl(result->first_block));
-
-			tape->partition = result->partition;
-			tape->first_frame_position = ntohl(result->first_block);
-			tape->last_frame_position = ntohl(result->last_block);
-			tape->blocks_in_buffer = result->blocks_in_buffer[2];
+					be32_to_cpu(*(u32 *)&readpos[4]));
+
+			tape->partition = readpos[1];
+			tape->first_frame_position =
+				be32_to_cpu(*(u32 *)&readpos[4]);
+			tape->last_frame_position =
+				be32_to_cpu(*(u32 *)&readpos[8]);
+			tape->blocks_in_buffer = readpos[15];
 			set_bit(IDETAPE_ADDRESS_VALID, &tape->flags);
 			idetape_end_request(drive, 1, 0);
 		}
-- 
cgit v1.1


From 37016bab601c2fecfe833d2feda42e6c6f9b08c8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: remove unreachable code chunk

tape->speed_control is set to 1 in idetape_setup(), but, in calculate_speeds()
its value is tested for being 0, 1, or 2. Remove the if-branches where
tape->speed_control != 1 since they are never executed. Also, rename
calculate_speeds() by adding driver's prefix as is with the other function
names.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3c516ff..1dc4a9e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -87,7 +87,8 @@ enum {
  *	the optimum value or until we reach MAX.
  *
  *	Setting the following parameter to 0 is illegal: the pipelined mode
- *	cannot be disabled (calculate_speeds() divides by tape->max_stages.)
+ *	cannot be disabled (idetape_calculate_speeds() divides by
+ *	tape->max_stages.)
  */
 #define IDETAPE_MIN_PIPELINE_STAGES	  1
 #define IDETAPE_MAX_PIPELINE_STAGES	400
@@ -1475,10 +1476,9 @@ static void idetape_create_mode_sense_cmd (idetape_pc_t *pc, u8 page_code)
 	pc->callback = &idetape_pc_callback;
 }
 
-static void calculate_speeds(ide_drive_t *drive)
+static void idetape_calculate_speeds(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int full = 125, empty = 75;
 
 	if (time_after(jiffies, tape->controlled_pipeline_head_time + 120 * HZ)) {
 		tape->controlled_previous_pipeline_head = tape->controlled_last_pipeline_head;
@@ -1505,22 +1505,20 @@ static void calculate_speeds(ide_drive_t *drive)
 		}
 	}
 	tape->pipeline_head_speed = max(tape->uncontrolled_pipeline_head_speed, tape->controlled_pipeline_head_speed);
-	if (tape->speed_control == 0) {
-		tape->max_insert_speed = 5000;
-	} else if (tape->speed_control == 1) {
+
+	if (tape->speed_control == 1) {
 		if (tape->nr_pending_stages >= tape->max_stages / 2)
 			tape->max_insert_speed = tape->pipeline_head_speed +
 				(1100 - tape->pipeline_head_speed) * 2 * (tape->nr_pending_stages - tape->max_stages / 2) / tape->max_stages;
 		else
 			tape->max_insert_speed = 500 +
 				(tape->pipeline_head_speed - 500) * 2 * tape->nr_pending_stages / tape->max_stages;
+
 		if (tape->nr_pending_stages >= tape->max_stages * 99 / 100)
 			tape->max_insert_speed = 5000;
-	} else if (tape->speed_control == 2) {
-		tape->max_insert_speed = tape->pipeline_head_speed * empty / 100 +
-			(tape->pipeline_head_speed * full / 100 - tape->pipeline_head_speed * empty / 100) * tape->nr_pending_stages / tape->max_stages;
 	} else
 		tape->max_insert_speed = tape->speed_control;
+
 	tape->max_insert_speed = max(tape->max_insert_speed, 500);
 }
 
@@ -1698,7 +1696,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		tape->measure_insert_time = 1;
 	if (time_after(jiffies, tape->insert_time))
 		tape->insert_speed = tape->insert_size / 1024 * HZ / (jiffies - tape->insert_time);
-	calculate_speeds(drive);
+	idetape_calculate_speeds(drive);
 	if (!test_and_clear_bit(IDETAPE_IGNORE_DSC, &tape->flags) &&
 	    (stat & SEEK_STAT) == 0) {
 		if (postponed_rq == NULL) {
@@ -2419,7 +2417,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	idetape_switch_buffers(tape, new_stage);
 	idetape_add_stage_tail(drive, new_stage);
 	tape->pipeline_head++;
-	calculate_speeds(drive);
+	idetape_calculate_speeds(drive);
 
 	/*
 	 *	Estimate whether the tape has stopped writing by checking
@@ -2659,7 +2657,7 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 		idetape_remove_stage_head(drive);
 		spin_unlock_irqrestore(&tape->spinlock, flags);
 		tape->pipeline_head++;
-		calculate_speeds(drive);
+		idetape_calculate_speeds(drive);
 	}
 	if (bytes_read > blocks * tape->tape_block_size) {
 		printk(KERN_ERR "ide-tape: bug: trying to return more bytes than requested\n");
-- 
cgit v1.1


From a1efc85f0b4d48627ef0b2aeb766a39fb4a00561 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: simplify code branching in the interrupt handler

... by adding a new typedef function pointer idetape_io_buf in order to call
the proper buffer i/o handler depending on the data direction.

Bart:
- move idetape_io_buf before idetape_pc_intr() comment

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 54 ++++++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 1dc4a9e..8b6af1e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1104,19 +1104,22 @@ static void idetape_postpone_request (ide_drive_t *drive)
 	ide_stall_queue(drive, tape->dsc_polling_frequency);
 }
 
+typedef void idetape_io_buf(ide_drive_t *, idetape_pc_t *, unsigned int);
+
 /*
- *	idetape_pc_intr is the usual interrupt handler which will be called
- *	during a packet command. We will transfer some of the data (as
- *	requested by the drive) and will re-point interrupt handler to us.
- *	When data transfer is finished, we will act according to the
- *	algorithm described before idetape_issue_packet_command.
- *
+ * This is the usual interrupt handler which will be called during a packet
+ * command. We will transfer some of the data (as requested by the drive) and
+ * will re-point interrupt handler to us. When data transfer is finished, we
+ * will act according to the algorithm described before
+ * idetape_issue_packet_command.
  */
-static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
+static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t *pc = tape->pc;
+	xfer_func_t *xferfunc;
+	idetape_io_buf *iobuf;
 	unsigned int temp;
 #if SIMULATE_ERRORS
 	static int error_sim_count = 0;
@@ -1184,7 +1187,8 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 			debug_log(DBG_ERR, "%s: I/O error\n", tape->name);
 
 			if (pc->c[0] == REQUEST_SENSE) {
-				printk(KERN_ERR "ide-tape: I/O error in request sense command\n");
+				printk(KERN_ERR "ide-tape: I/O error in request"
+						" sense command\n");
 				return ide_do_reset(drive);
 			}
 			debug_log(DBG_ERR, "[cmd %x]: check condition\n",
@@ -1223,7 +1227,7 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 	ireason = hwif->INB(IDE_IREASON_REG);
 
 	if (ireason & CD) {
-		printk(KERN_ERR "ide-tape: CoD != 0 in idetape_pc_intr\n");
+		printk(KERN_ERR "ide-tape: CoD != 0 in %s\n", __func__);
 		return ide_do_reset(drive);
 	}
 	if (((ireason & IO) == IO) == test_bit(PC_WRITING, &pc->flags)) {
@@ -1239,31 +1243,29 @@ static ide_startstop_t idetape_pc_intr (ide_drive_t *drive)
 		temp = pc->actually_transferred + bcount;
 		if (temp > pc->request_transfer) {
 			if (temp > pc->buffer_size) {
-				printk(KERN_ERR "ide-tape: The tape wants to send us more data than expected - discarding data\n");
+				printk(KERN_ERR "ide-tape: The tape wants to "
+					"send us more data than expected "
+					"- discarding data\n");
 				idetape_discard_data(drive, bcount);
-				ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL);
+				ide_set_handler(drive, &idetape_pc_intr,
+						IDETAPE_WAIT_CMD, NULL);
 				return ide_started;
 			}
 			debug_log(DBG_SENSE, "The tape wants to send us more "
 				"data than expected - allowing transfer\n");
-
 		}
-	}
-	if (test_bit(PC_WRITING, &pc->flags)) {
-		if (pc->bh != NULL)
-			idetape_output_buffers(drive, pc, bcount);
-		else
-			/* Write the current buffer */
-			hwif->atapi_output_bytes(drive, pc->current_position,
-						 bcount);
+		iobuf = &idetape_input_buffers;
+		xferfunc = hwif->atapi_input_bytes;
 	} else {
-		if (pc->bh != NULL)
-			idetape_input_buffers(drive, pc, bcount);
-		else
-			/* Read the current buffer */
-			hwif->atapi_input_bytes(drive, pc->current_position,
-						bcount);
+		iobuf = &idetape_output_buffers;
+		xferfunc = hwif->atapi_output_bytes;
 	}
+
+	if (pc->bh)
+		iobuf(drive, pc, bcount);
+	else
+		xferfunc(drive, pc->current_position, bcount);
+
 	/* Update the current position */
 	pc->actually_transferred += bcount;
 	pc->current_position += bcount;
-- 
cgit v1.1


From 54abf37e4236288687ee44fef2060092b42f5cec Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: remove typedef idetape_chrdev_direction_t

.. and replace it with plain enums.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 62 ++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8b6af1e..73f06c8 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -184,11 +184,13 @@ enum {
 /*
  *	For general magnetic tape device compatibility.
  */
-typedef enum {
-	idetape_direction_none,
-	idetape_direction_read,
-	idetape_direction_write
-} idetape_chrdev_direction_t;
+
+/* tape directions */
+enum {
+	IDETAPE_DIR_NONE  = (1 << 0),
+	IDETAPE_DIR_READ  = (1 << 1),
+	IDETAPE_DIR_WRITE = (1 << 2),
+};
 
 struct idetape_bh {
 	u32 b_size;
@@ -324,7 +326,7 @@ typedef struct ide_tape_obj {
 	/* device name */
 	char name[4];
 	/* Current character device data transfer direction */
-	idetape_chrdev_direction_t chrdev_direction;
+	u8 chrdev_dir;
 
 	/*
 	 *	Device information
@@ -1916,7 +1918,7 @@ static void idetape_init_merge_stage (idetape_tape_t *tape)
 	struct idetape_bh *bh = tape->merge_stage->bh;
 	
 	tape->bh = bh;
-	if (tape->chrdev_direction == idetape_direction_write)
+	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
 		atomic_set(&bh->b_count, 0);
 	else {
 		tape->b_data = bh->b_data;
@@ -2186,7 +2188,7 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 	unsigned long flags;
 	int cnt;
 
-	if (tape->chrdev_direction != idetape_direction_read)
+	if (tape->chrdev_dir != IDETAPE_DIR_READ)
 		return 0;
 
 	/* Remove merge stage. */
@@ -2201,7 +2203,7 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 
 	/* Clear pipeline flags. */
 	clear_bit(IDETAPE_PIPELINE_ERROR, &tape->flags);
-	tape->chrdev_direction = idetape_direction_none;
+	tape->chrdev_dir = IDETAPE_DIR_NONE;
 
 	/* Remove pipeline stages. */
 	if (tape->first_stage == NULL)
@@ -2241,7 +2243,7 @@ static int idetape_position_tape (ide_drive_t *drive, unsigned int block, u8 par
 	int retval;
 	idetape_pc_t pc;
 
-	if (tape->chrdev_direction == idetape_direction_read)
+	if (tape->chrdev_dir == IDETAPE_DIR_READ)
 		__idetape_discard_read_pipeline(drive);
 	idetape_wait_ready(drive, 60 * 5 * HZ);
 	idetape_create_locate_cmd(drive, &pc, block, partition, skip);
@@ -2468,7 +2470,7 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 	int blocks, min;
 	struct idetape_bh *bh;
 
-	if (tape->chrdev_direction != idetape_direction_write) {
+	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty write pipeline, but we are not writing.\n");
 		return;
 	}
@@ -2511,7 +2513,7 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 		tape->merge_stage = NULL;
 	}
 	clear_bit(IDETAPE_PIPELINE_ERROR, &tape->flags);
-	tape->chrdev_direction = idetape_direction_none;
+	tape->chrdev_dir = IDETAPE_DIR_NONE;
 
 	/*
 	 *	On the next backup, perform the feedback loop again.
@@ -2555,8 +2557,8 @@ static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
 	u16 blocks = *(u16 *)&tape->caps[12];
 
 	/* Initialize read operation */
-	if (tape->chrdev_direction != idetape_direction_read) {
-		if (tape->chrdev_direction == idetape_direction_write) {
+	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
+		if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
 			idetape_empty_write_pipeline(drive);
 			idetape_flush_tape_buffers(drive);
 		}
@@ -2566,7 +2568,7 @@ static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
 		}
 		if ((tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0)) == NULL)
 			return -ENOMEM;
-		tape->chrdev_direction = idetape_direction_read;
+		tape->chrdev_dir = IDETAPE_DIR_READ;
 
 		/*
 		 *	Issue a read 0 command to ensure that DSC handshake
@@ -2580,7 +2582,7 @@ static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
 			if (bytes_read < 0) {
 				__idetape_kfree_stage(tape->merge_stage);
 				tape->merge_stage = NULL;
-				tape->chrdev_direction = idetape_direction_none;
+				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return bytes_read;
 			}
 		}
@@ -2801,7 +2803,7 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 		mt_count = - mt_count;
 	}
 
-	if (tape->chrdev_direction == idetape_direction_read) {
+	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
 		/*
 		 *	We have a read-ahead buffer. Scan it for crossed
 		 *	filemarks.
@@ -2891,7 +2893,7 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
-	if (tape->chrdev_direction != idetape_direction_read) {
+	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
 		if (test_bit(IDETAPE_DETECT_BS, &tape->flags))
 			if (count > tape->tape_block_size &&
 			    (count % tape->tape_block_size) == 0)
@@ -2956,8 +2958,8 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
-	if (tape->chrdev_direction != idetape_direction_write) {
-		if (tape->chrdev_direction == idetape_direction_read)
+	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
+		if (tape->chrdev_dir == IDETAPE_DIR_READ)
 			idetape_discard_read_pipeline(drive, 1);
 		if (tape->merge_stage || tape->merge_stage_size) {
 			printk(KERN_ERR "ide-tape: merge_stage_size "
@@ -2966,7 +2968,7 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 		}
 		if ((tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0)) == NULL)
 			return -ENOMEM;
-		tape->chrdev_direction = idetape_direction_write;
+		tape->chrdev_dir = IDETAPE_DIR_WRITE;
 		idetape_init_merge_stage(tape);
 
 		/*
@@ -2981,7 +2983,7 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 			if (retval < 0) {
 				__idetape_kfree_stage(tape->merge_stage);
 				tape->merge_stage = NULL;
-				tape->chrdev_direction = idetape_direction_none;
+				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return retval;
 			}
 		}
@@ -3187,7 +3189,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 	debug_log(DBG_CHRDEV, "Enter %s, cmd=%u\n", __func__, cmd);
 
 	tape->restart_speed_control_req = 1;
-	if (tape->chrdev_direction == idetape_direction_write) {
+	if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
 		idetape_empty_write_pipeline(drive);
 		idetape_flush_tape_buffers(drive);
 	}
@@ -3218,7 +3220,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 				return -EFAULT;
 			return 0;
 		default:
-			if (tape->chrdev_direction == idetape_direction_read)
+			if (tape->chrdev_dir == IDETAPE_DIR_READ)
 				idetape_discard_read_pipeline(drive, 1);
 			return idetape_blkdev_ioctl(drive, cmd, arg);
 	}
@@ -3296,7 +3298,7 @@ static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 	if (!test_bit(IDETAPE_ADDRESS_VALID, &tape->flags))
 		(void)idetape_rewind_tape(drive);
 
-	if (tape->chrdev_direction != idetape_direction_read)
+	if (tape->chrdev_dir != IDETAPE_DIR_READ)
 		clear_bit(IDETAPE_PIPELINE_ERROR, &tape->flags);
 
 	/* Read block size and write protect status from drive. */
@@ -3321,7 +3323,7 @@ static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 	/*
 	 * Lock the tape drive door so user can't eject.
 	 */
-	if (tape->chrdev_direction == idetape_direction_none) {
+	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
 		if (idetape_create_prevent_cmd(drive, &pc, 1)) {
 			if (!idetape_queue_pc_tail(drive, &pc)) {
 				if (tape->door_locked != DOOR_EXPLICITLY_LOCKED)
@@ -3369,9 +3371,9 @@ static int idetape_chrdev_release (struct inode *inode, struct file *filp)
 
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	if (tape->chrdev_direction == idetape_direction_write)
+	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
 		idetape_write_release(drive, minor);
-	if (tape->chrdev_direction == idetape_direction_read) {
+	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
 		if (minor < 128)
 			idetape_discard_read_pipeline(drive, 1);
 		else
@@ -3383,7 +3385,7 @@ static int idetape_chrdev_release (struct inode *inode, struct file *filp)
 	}
 	if (minor < 128 && test_bit(IDETAPE_MEDIUM_PRESENT, &tape->flags))
 		(void) idetape_rewind_tape(drive);
-	if (tape->chrdev_direction == idetape_direction_none) {
+	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
 		if (tape->door_locked == DOOR_LOCKED) {
 			if (idetape_create_prevent_cmd(drive, &pc, 0)) {
 				if (!idetape_queue_pc_tail(drive, &pc))
@@ -3577,7 +3579,7 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	tape->name[0] = 'h';
 	tape->name[1] = 't';
 	tape->name[2] = '0' + minor;
-	tape->chrdev_direction = idetape_direction_none;
+	tape->chrdev_dir = IDETAPE_DIR_NONE;
 	tape->pc = tape->pc_stack;
 	tape->max_insert_speed = 10000;
 	tape->speed_control = 1;
-- 
cgit v1.1


From 41f81d545b6b1f585a02d1d8545978714f710e91 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: struct idetape_tape_t: remove unused members

- last_frame_position: only being written to once
- firmware_revision, product_id, vendor_id: used once, remove from struct
  idetape_tape_t and deal with them locally
- firmware_revision_num: only written to once
- tape_still_time_begin: completely unused
- tape_still_time: never written to; remove corresponding code chunk
- uncontrolled_last_pipeline_head: only once written to
- blocks_in_buffer: only written to

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 45 +++++++++++----------------------------------
 1 file changed, 11 insertions(+), 34 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 73f06c8..ad13527 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -311,8 +311,6 @@ typedef struct ide_tape_obj {
 	u8 partition;
 	/* Current block */
 	unsigned int first_frame_position;
-	unsigned int last_frame_position;
-	unsigned int blocks_in_buffer;
 
 	/*
 	 *	Last error information
@@ -399,11 +397,6 @@ typedef struct ide_tape_obj {
 	int avg_size;
 	int avg_speed;
 
-	char vendor_id[10];
-	char product_id[18];
-	char firmware_revision[6];
-	int firmware_revision_num;
-
 	/* the door is currently locked */
 	int door_locked;
 	/* the tape hardware is write protected */
@@ -441,12 +434,6 @@ typedef struct ide_tape_obj {
 	int measure_insert_time;
 
 	/*
-	 * Measure tape still time, in milliseconds
-	 */
-	unsigned long tape_still_time_begin;
-	int tape_still_time;
-
-	/*
 	 * Speed regulation negative feedback loop
 	 */
 	int speed_control;
@@ -454,7 +441,6 @@ typedef struct ide_tape_obj {
 	int controlled_pipeline_head_speed;
 	int uncontrolled_pipeline_head_speed;
 	int controlled_last_pipeline_head;
-	int uncontrolled_last_pipeline_head;
 	unsigned long uncontrolled_pipeline_head_time;
 	unsigned long controlled_pipeline_head_time;
 	int controlled_previous_pipeline_head;
@@ -1696,8 +1682,6 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		drive->post_reset = 0;
 	}
 
-	if (tape->tape_still_time > 100 && tape->tape_still_time < 200)
-		tape->measure_insert_time = 1;
 	if (time_after(jiffies, tape->insert_time))
 		tape->insert_speed = tape->insert_size / 1024 * HZ / (jiffies - tape->insert_time);
 	idetape_calculate_speeds(drive);
@@ -2009,9 +1993,6 @@ static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
 			tape->partition = readpos[1];
 			tape->first_frame_position =
 				be32_to_cpu(*(u32 *)&readpos[4]);
-			tape->last_frame_position =
-				be32_to_cpu(*(u32 *)&readpos[8]);
-			tape->blocks_in_buffer = readpos[15];
 			set_bit(IDETAPE_ADDRESS_VALID, &tape->flags);
 			idetape_end_request(drive, 1, 0);
 		}
@@ -2540,7 +2521,7 @@ static void idetape_restart_speed_control (ide_drive_t *drive)
 
 	tape->restart_speed_control_req = 0;
 	tape->pipeline_head = 0;
-	tape->controlled_last_pipeline_head = tape->uncontrolled_last_pipeline_head = 0;
+	tape->controlled_last_pipeline_head = 0;
 	tape->controlled_previous_pipeline_head = tape->uncontrolled_previous_pipeline_head = 0;
 	tape->pipeline_head_speed = tape->controlled_pipeline_head_speed = 5000;
 	tape->uncontrolled_pipeline_head_speed = 0;
@@ -3438,9 +3419,9 @@ static int idetape_identify_device (ide_drive_t *drive)
 
 static void idetape_get_inquiry_results(ide_drive_t *drive)
 {
-	char *r;
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t pc;
+	char fw_rev[6], vendor_id[10], product_id[18];
 
 	idetape_create_inquiry_cmd(&pc);
 	if (idetape_queue_pc_tail(drive, &pc)) {
@@ -3448,20 +3429,16 @@ static void idetape_get_inquiry_results(ide_drive_t *drive)
 				tape->name);
 		return;
 	}
-	memcpy(tape->vendor_id, &pc.buffer[8], 8);
-	memcpy(tape->product_id, &pc.buffer[16], 16);
-	memcpy(tape->firmware_revision, &pc.buffer[32], 4);
-
-	ide_fixstring(tape->vendor_id, 10, 0);
-	ide_fixstring(tape->product_id, 18, 0);
-	ide_fixstring(tape->firmware_revision, 6, 0);
-	r = tape->firmware_revision;
-	if (*(r + 1) == '.')
-		tape->firmware_revision_num = (*r - '0') * 100 +
-			(*(r + 2) - '0') * 10 +	*(r + 3) - '0';
+	memcpy(vendor_id, &pc.buffer[8], 8);
+	memcpy(product_id, &pc.buffer[16], 16);
+	memcpy(fw_rev, &pc.buffer[32], 4);
+
+	ide_fixstring(vendor_id, 10, 0);
+	ide_fixstring(product_id, 18, 0);
+	ide_fixstring(fw_rev, 6, 0);
+
 	printk(KERN_INFO "ide-tape: %s <-> %s: %s %s rev %s\n",
-			drive->name, tape->name, tape->vendor_id,
-			tape->product_id, tape->firmware_revision);
+			drive->name, tape->name, vendor_id, product_id, fw_rev);
 }
 
 /*
-- 
cgit v1.1


From 54bb2074ce52fc8fce0d898b3c9921f4a951eb80 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: struct idetape_tape_t: shorten member names v2

Shorten some member names not too aggressively since this driver might be gone
anyway soon.

Bart:
- minor fixes

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 210 ++++++++++++++++++++++++++-----------------------
 1 file changed, 113 insertions(+), 97 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index ad13527..2fe4e8f 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -299,10 +299,8 @@ typedef struct ide_tape_obj {
 	/* Timer used to poll for dsc */
 	struct timer_list dsc_timer;
 	/* Read/Write dsc polling frequency */
-	unsigned long best_dsc_rw_frequency;
-	/* The current polling frequency */
-	unsigned long dsc_polling_frequency;
-	/* Maximum waiting time */
+	unsigned long best_dsc_rw_freq;
+	unsigned long dsc_poll_freq;
 	unsigned long dsc_timeout;
 
 	/*
@@ -310,7 +308,7 @@ typedef struct ide_tape_obj {
 	 */
 	u8 partition;
 	/* Current block */
-	unsigned int first_frame_position;
+	unsigned int first_frame;
 
 	/*
 	 *	Last error information
@@ -326,11 +324,8 @@ typedef struct ide_tape_obj {
 	/* Current character device data transfer direction */
 	u8 chrdev_dir;
 
-	/*
-	 *	Device information
-	 */
-	/* Usually 512 or 1024 bytes */
-	unsigned short tape_block_size;
+	/* tape block size, usually 512 or 1024 bytes */
+	unsigned short blk_size;
 	int user_bs_factor;
 
 	/* Copy of the tape's Capabilities and Mechanical Page */
@@ -349,8 +344,8 @@ typedef struct ide_tape_obj {
 	 *	The data buffer size is chosen based on the tape's
 	 *	recommendation.
 	 */
-	/* Pointer to the request which is waiting in the device request queue */
-	struct request *active_data_request;
+	/* Ptr to the request which is waiting in the device request queue */
+	struct request *active_data_rq;
 	/* Data buffer size (chosen based on the tape's recommendation */
 	int stage_size;
 	idetape_stage_t *merge_stage;
@@ -388,7 +383,7 @@ typedef struct ide_tape_obj {
 	/* Status/Action flags: long for set_bit */
 	unsigned long flags;
 	/* protects the ide-tape queue */
-	spinlock_t spinlock;
+	spinlock_t lock;
 
 	/*
 	 * Measures average tape speed
@@ -750,7 +745,7 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	/* Correct pc->actually_transferred by asking the tape.	 */
 	if (test_bit(PC_DMA_ERROR, &pc->flags)) {
 		pc->actually_transferred = pc->request_transfer -
-			tape->tape_block_size *
+			tape->blk_size *
 			be32_to_cpu(get_unaligned((u32 *)&sense[3]));
 		idetape_update_buffers(pc);
 	}
@@ -809,7 +804,7 @@ static void idetape_activate_next_stage(ide_drive_t *drive)
 	rq->rq_disk = tape->disk;
 	rq->buffer = NULL;
 	rq->special = (void *)stage->bh;
-	tape->active_data_request = rq;
+	tape->active_data_rq = rq;
 	tape->active_stage = stage;
 	tape->next_stage = stage->next;
 }
@@ -951,13 +946,13 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 		return 0;
 	}
 
-	spin_lock_irqsave(&tape->spinlock, flags);
+	spin_lock_irqsave(&tape->lock, flags);
 
 	/* The request was a pipelined data transfer request */
-	if (tape->active_data_request == rq) {
+	if (tape->active_data_rq == rq) {
 		active_stage = tape->active_stage;
 		tape->active_stage = NULL;
-		tape->active_data_request = NULL;
+		tape->active_data_rq = NULL;
 		tape->nr_pending_stages--;
 		if (rq->cmd[0] & REQ_IDETAPE_WRITE) {
 			remove_stage = 1;
@@ -978,7 +973,8 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 			/*
 			 * Insert the next request into the request queue.
 			 */
-			(void) ide_do_drive_cmd(drive, tape->active_data_request, ide_end);
+			(void)ide_do_drive_cmd(drive, tape->active_data_rq,
+						ide_end);
 		} else if (!error) {
 				idetape_increase_max_pipeline_stages(drive);
 		}
@@ -990,9 +986,9 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 
 	if (remove_stage)
 		idetape_remove_stage_head(drive);
-	if (tape->active_data_request == NULL)
+	if (tape->active_data_rq == NULL)
 		clear_bit(IDETAPE_PIPELINE_ACTIVE, &tape->flags);
-	spin_unlock_irqrestore(&tape->spinlock, flags);
+	spin_unlock_irqrestore(&tape->lock, flags);
 	return 0;
 }
 
@@ -1089,7 +1085,7 @@ static void idetape_postpone_request (ide_drive_t *drive)
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	tape->postponed_rq = HWGROUP(drive)->rq;
-	ide_stall_queue(drive, tape->dsc_polling_frequency);
+	ide_stall_queue(drive, tape->dsc_poll_freq);
 }
 
 typedef void idetape_io_buf(ide_drive_t *, idetape_pc_t *, unsigned int);
@@ -1190,7 +1186,7 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
 		    (stat & SEEK_STAT) == 0) {
 			/* Media access command */
 			tape->dsc_polling_start = jiffies;
-			tape->dsc_polling_frequency = IDETAPE_DSC_MA_FAST;
+			tape->dsc_poll_freq = IDETAPE_DSC_MA_FAST;
 			tape->dsc_timeout = jiffies + IDETAPE_DSC_MA_TIMEOUT;
 			/* Allow ide.c to handle other requests */
 			idetape_postpone_request(drive);
@@ -1543,10 +1539,10 @@ static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	struct request *rq = HWGROUP(drive)->rq;
-	int blocks = tape->pc->actually_transferred / tape->tape_block_size;
+	int blocks = tape->pc->actually_transferred / tape->blk_size;
 
-	tape->avg_size += blocks * tape->tape_block_size;
-	tape->insert_size += blocks * tape->tape_block_size;
+	tape->avg_size += blocks * tape->blk_size;
+	tape->insert_size += blocks * tape->blk_size;
 	if (tape->insert_size > 1024 * 1024)
 		tape->measure_insert_time = 1;
 	if (tape->measure_insert_time) {
@@ -1563,7 +1559,7 @@ static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
 	}
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
-	tape->first_frame_position += blocks;
+	tape->first_frame += blocks;
 	rq->current_nr_sectors -= blocks;
 
 	if (!tape->pc->error)
@@ -1583,7 +1579,8 @@ static void idetape_create_read_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsi
 	pc->bh = bh;
 	atomic_set(&bh->b_count, 0);
 	pc->buffer = NULL;
-	pc->request_transfer = pc->buffer_size = length * tape->tape_block_size;
+	pc->buffer_size = length * tape->blk_size;
+	pc->request_transfer = pc->buffer_size;
 	if (pc->request_transfer == tape->stage_size)
 		set_bit(PC_DMA_RECOMMENDED, &pc->flags);
 }
@@ -1621,7 +1618,8 @@ static void idetape_create_write_cmd(idetape_tape_t *tape, idetape_pc_t *pc, uns
 	pc->b_data = bh->b_data;
 	pc->b_count = atomic_read(&bh->b_count);
 	pc->buffer = NULL;
-	pc->request_transfer = pc->buffer_size = length * tape->tape_block_size;
+	pc->buffer_size = length * tape->blk_size;
+	pc->request_transfer = pc->buffer_size;
 	if (pc->request_transfer == tape->stage_size)
 		set_bit(PC_DMA_RECOMMENDED, &pc->flags);
 }
@@ -1689,7 +1687,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	    (stat & SEEK_STAT) == 0) {
 		if (postponed_rq == NULL) {
 			tape->dsc_polling_start = jiffies;
-			tape->dsc_polling_frequency = tape->best_dsc_rw_frequency;
+			tape->dsc_poll_freq = tape->best_dsc_rw_freq;
 			tape->dsc_timeout = jiffies + IDETAPE_DSC_RW_TIMEOUT;
 		} else if (time_after(jiffies, tape->dsc_timeout)) {
 			printk(KERN_ERR "ide-tape: %s: DSC timeout\n",
@@ -1701,7 +1699,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 				return ide_do_reset(drive);
 			}
 		} else if (time_after(jiffies, tape->dsc_polling_start + IDETAPE_DSC_MA_THRESHOLD))
-			tape->dsc_polling_frequency = IDETAPE_DSC_MA_SLOW;
+			tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW;
 		idetape_postpone_request(drive);
 		return ide_stopped;
 	}
@@ -1748,7 +1746,7 @@ static inline int idetape_pipeline_active (idetape_tape_t *tape)
 	int rc1, rc2;
 
 	rc1 = test_bit(IDETAPE_PIPELINE_ACTIVE, &tape->flags);
-	rc2 = (tape->active_data_request != NULL);
+	rc2 = (tape->active_data_rq != NULL);
 	return rc1;
 }
 
@@ -1930,7 +1928,7 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
-	spin_lock_irqsave(&tape->spinlock, flags);
+	spin_lock_irqsave(&tape->lock, flags);
 	stage->next = NULL;
 	if (tape->last_stage != NULL)
 		tape->last_stage->next=stage;
@@ -1941,7 +1939,7 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 		tape->next_stage = tape->last_stage;
 	tape->nr_stages++;
 	tape->nr_pending_stages++;
-	spin_unlock_irqrestore(&tape->spinlock, flags);
+	spin_unlock_irqrestore(&tape->lock, flags);
 }
 
 /*
@@ -1962,10 +1960,10 @@ static void idetape_wait_for_request (ide_drive_t *drive, struct request *rq)
 	}
 	rq->end_io_data = &wait;
 	rq->end_io = blk_end_sync_rq;
-	spin_unlock_irq(&tape->spinlock);
+	spin_unlock_irq(&tape->lock);
 	wait_for_completion(&wait);
 	/* The stage and its struct request have been deallocated */
-	spin_lock_irq(&tape->spinlock);
+	spin_lock_irq(&tape->lock);
 }
 
 static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
@@ -1991,7 +1989,7 @@ static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
 					be32_to_cpu(*(u32 *)&readpos[4]));
 
 			tape->partition = readpos[1];
-			tape->first_frame_position =
+			tape->first_frame =
 				be32_to_cpu(*(u32 *)&readpos[4]);
 			set_bit(IDETAPE_ADDRESS_VALID, &tape->flags);
 			idetape_end_request(drive, 1, 0);
@@ -2133,7 +2131,7 @@ static int idetape_read_position (ide_drive_t *drive)
 	idetape_create_read_position_cmd(&pc);
 	if (idetape_queue_pc_tail(drive, &pc))
 		return -1;
-	position = tape->first_frame_position;
+	position = tape->first_frame;
 	return position;
 }
 
@@ -2173,7 +2171,7 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 		return 0;
 
 	/* Remove merge stage. */
-	cnt = tape->merge_stage_size / tape->tape_block_size;
+	cnt = tape->merge_stage_size / tape->blk_size;
 	if (test_and_clear_bit(IDETAPE_FILEMARK, &tape->flags))
 		++cnt;		/* Filemarks count as 1 sector */
 	tape->merge_stage_size = 0;
@@ -2190,11 +2188,11 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 	if (tape->first_stage == NULL)
 		return 0;
 
-	spin_lock_irqsave(&tape->spinlock, flags);
+	spin_lock_irqsave(&tape->lock, flags);
 	tape->next_stage = NULL;
 	if (idetape_pipeline_active(tape))
-		idetape_wait_for_request(drive, tape->active_data_request);
-	spin_unlock_irqrestore(&tape->spinlock, flags);
+		idetape_wait_for_request(drive, tape->active_data_rq);
+	spin_unlock_irqrestore(&tape->lock, flags);
 
 	while (tape->first_stage != NULL) {
 		struct request *rq_ptr = &tape->first_stage->rq;
@@ -2273,7 +2271,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct
 	idetape_init_rq(&rq, cmd);
 	rq.rq_disk = tape->disk;
 	rq.special = (void *)bh;
-	rq.sector = tape->first_frame_position;
+	rq.sector = tape->first_frame;
 	rq.nr_sectors = rq.current_nr_sectors = blocks;
 	(void) ide_do_drive_cmd(drive, &rq, ide_wait);
 
@@ -2284,7 +2282,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct
 		idetape_init_merge_stage(tape);
 	if (rq.errors == IDETAPE_ERROR_GENERAL)
 		return -EIO;
-	return (tape->tape_block_size * (blocks-rq.current_nr_sectors));
+	return (tape->blk_size * (blocks-rq.current_nr_sectors));
 }
 
 /*
@@ -2300,7 +2298,7 @@ static void idetape_insert_pipeline_into_queue (ide_drive_t *drive)
 	if (!idetape_pipeline_active(tape)) {
 		set_bit(IDETAPE_PIPELINE_ACTIVE, &tape->flags);
 		idetape_activate_next_stage(drive);
-		(void) ide_do_drive_cmd(drive, tape->active_data_request, ide_end);
+		(void) ide_do_drive_cmd(drive, tape->active_data_rq, ide_end);
 	}
 }
 
@@ -2346,10 +2344,10 @@ static void idetape_wait_first_stage (ide_drive_t *drive)
 
 	if (tape->first_stage == NULL)
 		return;
-	spin_lock_irqsave(&tape->spinlock, flags);
+	spin_lock_irqsave(&tape->lock, flags);
 	if (tape->active_stage == tape->first_stage)
-		idetape_wait_for_request(drive, tape->active_data_request);
-	spin_unlock_irqrestore(&tape->spinlock, flags);
+		idetape_wait_for_request(drive, tape->active_data_rq);
+	spin_unlock_irqrestore(&tape->lock, flags);
 }
 
 /*
@@ -2377,12 +2375,12 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	 *	Pay special attention to possible race conditions.
 	 */
 	while ((new_stage = idetape_kmalloc_stage(tape)) == NULL) {
-		spin_lock_irqsave(&tape->spinlock, flags);
+		spin_lock_irqsave(&tape->lock, flags);
 		if (idetape_pipeline_active(tape)) {
-			idetape_wait_for_request(drive, tape->active_data_request);
-			spin_unlock_irqrestore(&tape->spinlock, flags);
+			idetape_wait_for_request(drive, tape->active_data_rq);
+			spin_unlock_irqrestore(&tape->lock, flags);
 		} else {
-			spin_unlock_irqrestore(&tape->spinlock, flags);
+			spin_unlock_irqrestore(&tape->lock, flags);
 			idetape_insert_pipeline_into_queue(drive);
 			if (idetape_pipeline_active(tape))
 				continue;
@@ -2396,7 +2394,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	rq = &new_stage->rq;
 	idetape_init_rq(rq, REQ_IDETAPE_WRITE);
 	/* Doesn't actually matter - We always assume sequential access */
-	rq->sector = tape->first_frame_position;
+	rq->sector = tape->first_frame;
 	rq->nr_sectors = rq->current_nr_sectors = blocks;
 
 	idetape_switch_buffers(tape, new_stage);
@@ -2413,7 +2411,9 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	 */
 	if (!idetape_pipeline_active(tape)) {
 		if (tape->nr_stages >= tape->max_stages * 9 / 10 ||
-		    tape->nr_stages >= tape->max_stages - tape->uncontrolled_pipeline_head_speed * 3 * 1024 / tape->tape_block_size) {
+			tape->nr_stages >= tape->max_stages -
+			tape->uncontrolled_pipeline_head_speed * 3 * 1024 /
+			tape->blk_size) {
 			tape->measure_insert_time = 1;
 			tape->insert_time = jiffies;
 			tape->insert_size = 0;
@@ -2438,10 +2438,10 @@ static void idetape_wait_for_pipeline (ide_drive_t *drive)
 
 	while (tape->next_stage || idetape_pipeline_active(tape)) {
 		idetape_insert_pipeline_into_queue(drive);
-		spin_lock_irqsave(&tape->spinlock, flags);
+		spin_lock_irqsave(&tape->lock, flags);
 		if (idetape_pipeline_active(tape))
-			idetape_wait_for_request(drive, tape->active_data_request);
-		spin_unlock_irqrestore(&tape->spinlock, flags);
+			idetape_wait_for_request(drive, tape->active_data_rq);
+		spin_unlock_irqrestore(&tape->lock, flags);
 	}
 }
 
@@ -2460,12 +2460,13 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 		tape->merge_stage_size = tape->stage_size;
 	}
 	if (tape->merge_stage_size) {
-		blocks = tape->merge_stage_size / tape->tape_block_size;
-		if (tape->merge_stage_size % tape->tape_block_size) {
+		blocks = tape->merge_stage_size / tape->blk_size;
+		if (tape->merge_stage_size % tape->blk_size) {
 			unsigned int i;
 
 			blocks++;
-			i = tape->tape_block_size - tape->merge_stage_size % tape->tape_block_size;
+			i = tape->blk_size - tape->merge_stage_size %
+				tape->blk_size;
 			bh = tape->bh->b_reqnext;
 			while (bh) {
 				atomic_set(&bh->b_count, 0);
@@ -2571,7 +2572,7 @@ static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
 	if (tape->restart_speed_control_req)
 		idetape_restart_speed_control(drive);
 	idetape_init_rq(&rq, REQ_IDETAPE_READ);
-	rq.sector = tape->first_frame_position;
+	rq.sector = tape->first_frame;
 	rq.nr_sectors = rq.current_nr_sectors = blocks;
 	if (!test_bit(IDETAPE_PIPELINE_ERROR, &tape->flags) &&
 	    tape->nr_stages < max_stages) {
@@ -2624,11 +2625,13 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 	if (tape->first_stage == NULL) {
 		if (test_bit(IDETAPE_PIPELINE_ERROR, &tape->flags))
 			return 0;
-		return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks, tape->merge_stage->bh);
+		return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
+					tape->merge_stage->bh);
 	}
 	idetape_wait_first_stage(drive);
 	rq_ptr = &tape->first_stage->rq;
-	bytes_read = tape->tape_block_size * (rq_ptr->nr_sectors - rq_ptr->current_nr_sectors);
+	bytes_read = tape->blk_size * (rq_ptr->nr_sectors -
+					rq_ptr->current_nr_sectors);
 	rq_ptr->nr_sectors = rq_ptr->current_nr_sectors = 0;
 
 
@@ -2638,15 +2641,15 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 		idetape_switch_buffers(tape, tape->first_stage);
 		if (rq_ptr->errors == IDETAPE_ERROR_FILEMARK)
 			set_bit(IDETAPE_FILEMARK, &tape->flags);
-		spin_lock_irqsave(&tape->spinlock, flags);
+		spin_lock_irqsave(&tape->lock, flags);
 		idetape_remove_stage_head(drive);
-		spin_unlock_irqrestore(&tape->spinlock, flags);
+		spin_unlock_irqrestore(&tape->lock, flags);
 		tape->pipeline_head++;
 		idetape_calculate_speeds(drive);
 	}
-	if (bytes_read > blocks * tape->tape_block_size) {
+	if (bytes_read > blocks * tape->blk_size) {
 		printk(KERN_ERR "ide-tape: bug: trying to return more bytes than requested\n");
-		bytes_read = blocks * tape->tape_block_size;
+		bytes_read = blocks * tape->blk_size;
 	}
 	return (bytes_read);
 }
@@ -2663,7 +2666,7 @@ static void idetape_pad_zeros (ide_drive_t *drive, int bcount)
 		bh = tape->merge_stage->bh;
 		count = min(tape->stage_size, bcount);
 		bcount -= count;
-		blocks = count / tape->tape_block_size;
+		blocks = count / tape->blk_size;
 		while (count) {
 			atomic_set(&bh->b_count, min(count, (unsigned int)bh->b_size));
 			memset(bh->b_data, 0, atomic_read(&bh->b_count));
@@ -2685,9 +2688,10 @@ static int idetape_pipeline_size (ide_drive_t *drive)
 	stage = tape->first_stage;
 	while (stage != NULL) {
 		rq = &stage->rq;
-		size += tape->tape_block_size * (rq->nr_sectors-rq->current_nr_sectors);
+		size += tape->blk_size * (rq->nr_sectors -
+				rq->current_nr_sectors);
 		if (rq->errors == IDETAPE_ERROR_FILEMARK)
-			size += tape->tape_block_size;
+			size += tape->blk_size;
 		stage = stage->next;
 	}
 	size += tape->merge_stage_size;
@@ -2744,11 +2748,11 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned l
 		case 0x0340:
 			if (copy_from_user(&config, argp, sizeof(config)))
 				return -EFAULT;
-			tape->best_dsc_rw_frequency = config.dsc_rw_frequency;
+			tape->best_dsc_rw_freq = config.dsc_rw_frequency;
 			tape->max_stages = config.nr_stages;
 			break;
 		case 0x0350:
-			config.dsc_rw_frequency = (int) tape->best_dsc_rw_frequency;
+			config.dsc_rw_frequency = (int) tape->best_dsc_rw_freq;
 			config.nr_stages = tape->max_stages; 
 			if (copy_to_user(argp, &config, sizeof(config)))
 				return -EFAULT;
@@ -2798,7 +2802,7 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 					set_bit(IDETAPE_FILEMARK, &tape->flags);
 				return 0;
 			}
-			spin_lock_irqsave(&tape->spinlock, flags);
+			spin_lock_irqsave(&tape->lock, flags);
 			if (tape->first_stage == tape->active_stage) {
 				/*
 				 *	We have reached the active stage in the read pipeline.
@@ -2810,11 +2814,11 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 				 *	__idetape_discard_read_pipeline(), for example.
 				 */
 				tape->next_stage = NULL;
-				spin_unlock_irqrestore(&tape->spinlock, flags);
+				spin_unlock_irqrestore(&tape->lock, flags);
 				idetape_wait_first_stage(drive);
 				tape->next_stage = tape->first_stage->next;
 			} else
-				spin_unlock_irqrestore(&tape->spinlock, flags);
+				spin_unlock_irqrestore(&tape->lock, flags);
 			if (tape->first_stage->rq.errors == IDETAPE_ERROR_FILEMARK)
 				++count;
 			idetape_remove_stage_head(drive);
@@ -2876,9 +2880,9 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 
 	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
 		if (test_bit(IDETAPE_DETECT_BS, &tape->flags))
-			if (count > tape->tape_block_size &&
-			    (count % tape->tape_block_size) == 0)
-				tape->user_bs_factor = count / tape->tape_block_size;
+			if (count > tape->blk_size &&
+			    (count % tape->blk_size) == 0)
+				tape->user_bs_factor = count / tape->blk_size;
 	}
 	if ((rc = idetape_initiate_read(drive, tape->max_stages)) < 0)
 		return rc;
@@ -3115,9 +3119,11 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
 			return (idetape_queue_pc_tail(drive, &pc));
 		case MTSETBLK:
 			if (mt_count) {
-				if (mt_count < tape->tape_block_size || mt_count % tape->tape_block_size)
+				if (mt_count < tape->blk_size ||
+				    mt_count % tape->blk_size)
 					return -EIO;
-				tape->user_bs_factor = mt_count / tape->tape_block_size;
+				tape->user_bs_factor = mt_count /
+							tape->blk_size;
 				clear_bit(IDETAPE_DETECT_BS, &tape->flags);
 			} else
 				set_bit(IDETAPE_DETECT_BS, &tape->flags);
@@ -3164,7 +3170,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 	struct mtop mtop;
 	struct mtget mtget;
 	struct mtpos mtpos;
-	int block_offset = 0, position = tape->first_frame_position;
+	int block_offset = 0, position = tape->first_frame;
 	void __user *argp = (void __user *)arg;
 
 	debug_log(DBG_CHRDEV, "Enter %s, cmd=%u\n", __func__, cmd);
@@ -3175,7 +3181,8 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 		idetape_flush_tape_buffers(drive);
 	}
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = idetape_pipeline_size(drive) / (tape->tape_block_size * tape->user_bs_factor);
+		block_offset = idetape_pipeline_size(drive) /
+			(tape->blk_size * tape->user_bs_factor);
 		if ((position = idetape_read_position(drive)) < 0)
 			return -EIO;
 	}
@@ -3188,7 +3195,10 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 			memset(&mtget, 0, sizeof (struct mtget));
 			mtget.mt_type = MT_ISSCSI2;
 			mtget.mt_blkno = position / tape->user_bs_factor - block_offset;
-			mtget.mt_dsreg = ((tape->tape_block_size * tape->user_bs_factor) << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK;
+			mtget.mt_dsreg =
+				((tape->blk_size * tape->user_bs_factor)
+				 << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK;
+
 			if (tape->drv_write_prot) {
 				mtget.mt_gstat |= GMT_WR_PROT(0xffffffff);
 			}
@@ -3219,14 +3229,14 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive)
 	idetape_create_mode_sense_cmd(&pc, IDETAPE_BLOCK_DESCRIPTOR);
 	if (idetape_queue_pc_tail(drive, &pc)) {
 		printk(KERN_ERR "ide-tape: Can't get block descriptor\n");
-		if (tape->tape_block_size == 0) {
+		if (tape->blk_size == 0) {
 			printk(KERN_WARNING "ide-tape: Cannot deal with zero "
 					    "block size, assuming 32k\n");
-			tape->tape_block_size = 32768;
+			tape->blk_size = 32768;
 		}
 		return;
 	}
-	tape->tape_block_size = (pc.buffer[4 + 5] << 16) +
+	tape->blk_size = (pc.buffer[4 + 5] << 16) +
 				(pc.buffer[4 + 6] << 8)  +
 				 pc.buffer[4 + 7];
 	tape->drv_write_prot = (pc.buffer[2] & 0x80) >> 7;
@@ -3328,7 +3338,8 @@ static void idetape_write_release (ide_drive_t *drive, unsigned int minor)
 	idetape_empty_write_pipeline(drive);
 	tape->merge_stage = __idetape_kmalloc_stage(tape, 1, 0);
 	if (tape->merge_stage != NULL) {
-		idetape_pad_zeros(drive, tape->tape_block_size * (tape->user_bs_factor - 1));
+		idetape_pad_zeros(drive, tape->blk_size *
+				(tape->user_bs_factor - 1));
 		__idetape_kfree_stage(tape->merge_stage);
 		tape->merge_stage = NULL;
 	}
@@ -3456,7 +3467,7 @@ static void idetape_get_mode_sense_results (ide_drive_t *drive)
 	if (idetape_queue_pc_tail(drive, &pc)) {
 		printk(KERN_ERR "ide-tape: Can't get tape parameters - assuming"
 				" some default values\n");
-		tape->tape_block_size = 512;
+		tape->blk_size = 512;
 		put_unaligned(52,   (u16 *)&tape->caps[12]);
 		put_unaligned(540,  (u16 *)&tape->caps[14]);
 		put_unaligned(6*52, (u16 *)&tape->caps[16]);
@@ -3486,9 +3497,9 @@ static void idetape_get_mode_sense_results (ide_drive_t *drive)
 
 	memcpy(&tape->caps, caps, 20);
 	if (caps[7] & 0x02)
-		tape->tape_block_size = 512;
+		tape->blk_size = 512;
 	else if (caps[7] & 0x04)
-		tape->tape_block_size = 1024;
+		tape->blk_size = 1024;
 }
 
 #ifdef CONFIG_IDE_PROC_FS
@@ -3508,8 +3519,11 @@ static void idetape_add_settings (ide_drive_t *drive)
 	ide_add_setting(drive,	"pipeline_pending",	SETTING_READ,	TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,		&tape->nr_pending_stages,		NULL);
 	ide_add_setting(drive, "speed", SETTING_READ, TYPE_SHORT, 0, 0xffff,
 			1, 1, (u16 *)&tape->caps[14], NULL);
-	ide_add_setting(drive,	"stage",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1024,		&tape->stage_size,			NULL);
-	ide_add_setting(drive,	"tdsc",			SETTING_RW,	TYPE_INT,	IDETAPE_DSC_RW_MIN,	IDETAPE_DSC_RW_MAX,	1000,				HZ,		&tape->best_dsc_rw_frequency,		NULL);
+	ide_add_setting(drive, "stage", SETTING_READ, TYPE_INT,	0, 0xffff, 1,
+			1024, &tape->stage_size, NULL);
+	ide_add_setting(drive, "tdsc", SETTING_RW, TYPE_INT, IDETAPE_DSC_RW_MIN,
+			IDETAPE_DSC_RW_MAX, 1000, HZ, &tape->best_dsc_rw_freq,
+			NULL);
 	ide_add_setting(drive,	"dsc_overlap",		SETTING_RW,	TYPE_BYTE,	0,			1,			1,				1,		&drive->dsc_overlap,			NULL);
 	ide_add_setting(drive,	"pipeline_head_speed_c",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->controlled_pipeline_head_speed,	NULL);
 	ide_add_setting(drive,	"pipeline_head_speed_u",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->uncontrolled_pipeline_head_speed,NULL);
@@ -3542,7 +3556,7 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	struct sysinfo si;
 	u16 *ctl = (u16 *)&tape->caps[12];
 
-	spin_lock_init(&tape->spinlock);
+	spin_lock_init(&tape->lock);
 	drive->dsc_overlap = 1;
 	if (drive->hwif->host_flags & IDE_HFLAG_NO_DSC) {
 		printk(KERN_INFO "ide-tape: %s: disabling DSC overlap\n",
@@ -3570,11 +3584,11 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	idetape_get_mode_sense_results(drive);
 	ide_tape_get_bsize_from_bdesc(drive);
 	tape->user_bs_factor = 1;
-	tape->stage_size = *ctl * tape->tape_block_size;
+	tape->stage_size = *ctl * tape->blk_size;
 	while (tape->stage_size > 0xffff) {
 		printk(KERN_NOTICE "ide-tape: decreasing stage size\n");
 		*ctl /= 2;
-		tape->stage_size = *ctl * tape->tape_block_size;
+		tape->stage_size = *ctl * tape->blk_size;
 	}
 	stage_size = tape->stage_size;
 	tape->pages_per_stage = stage_size / PAGE_SIZE;
@@ -3613,14 +3627,16 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	 *	Ensure that the number we got makes sense; limit
 	 *	it within IDETAPE_DSC_RW_MIN and IDETAPE_DSC_RW_MAX.
 	 */
-	tape->best_dsc_rw_frequency = max_t(unsigned long, min_t(unsigned long, t, IDETAPE_DSC_RW_MAX), IDETAPE_DSC_RW_MIN);
+	tape->best_dsc_rw_freq = max_t(unsigned long,
+				min_t(unsigned long, t, IDETAPE_DSC_RW_MAX),
+				IDETAPE_DSC_RW_MIN);
 	printk(KERN_INFO "ide-tape: %s <-> %s: %dKBps, %d*%dkB buffer, "
 		"%dkB pipeline, %lums tDSC%s\n",
 		drive->name, tape->name, *(u16 *)&tape->caps[14],
 		(*(u16 *)&tape->caps[16] * 512) / tape->stage_size,
 		tape->stage_size / 1024,
 		tape->max_stages * tape->stage_size / 1024,
-		tape->best_dsc_rw_frequency * 1000 / HZ,
+		tape->best_dsc_rw_freq * 1000 / HZ,
 		drive->using_dma ? ", DMA":"");
 
 	idetape_add_settings(drive);
-- 
cgit v1.1


From 97219851b92fd083539003bca48c379d415566ac Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:52 +0100
Subject: ide-tape: remove idetape_increase_max_pipeline_stages()

This function was being used only at one place so fold it in there.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2fe4e8f..27fd33d 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -810,25 +810,6 @@ static void idetape_activate_next_stage(ide_drive_t *drive)
 }
 
 /*
- *	idetape_increase_max_pipeline_stages is a part of the feedback
- *	loop which tries to find the optimum number of stages. In the
- *	feedback loop, we are starting from a minimum maximum number of
- *	stages, and if we sense that the pipeline is empty, we try to
- *	increase it, until we reach the user compile time memory limit.
- */
-static void idetape_increase_max_pipeline_stages (ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	int increase = (tape->max_pipeline - tape->min_pipeline) / 10;
-
-	debug_log(DBG_PROCS, "Enter %s\n", __func__);
-
-	tape->max_stages += max(increase, 1);
-	tape->max_stages = max(tape->max_stages, tape->min_pipeline);
-	tape->max_stages = min(tape->max_stages, tape->max_pipeline);
-}
-
-/*
  *	idetape_kfree_stage calls kfree to completely free a stage, along with
  *	its related buffers.
  */
@@ -976,7 +957,21 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 			(void)ide_do_drive_cmd(drive, tape->active_data_rq,
 						ide_end);
 		} else if (!error) {
-				idetape_increase_max_pipeline_stages(drive);
+			/*
+			 * This is a part of the feedback loop which tries to
+			 * find the optimum number of stages. We are starting
+			 * from a minimum maximum number of stages, and if we
+			 * sense that the pipeline is empty, we try to increase
+			 * it, until we reach the user compile time memory
+			 * limit.
+			 */
+			int i = (tape->max_pipeline - tape->min_pipeline) / 10;
+
+			tape->max_stages += max(i, 1);
+			tape->max_stages = max(tape->max_stages,
+						tape->min_pipeline);
+			tape->max_stages = min(tape->max_stages,
+						tape->max_pipeline);
 		}
 	}
 	ide_end_drive_cmd(drive, 0, 0);
-- 
cgit v1.1


From 8d06bfadb44bfec067603fbc8ee2faced3b13ad9 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:53 +0100
Subject: ide-tape: shorten some function names

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 27fd33d..bdf02fc 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1090,7 +1090,7 @@ typedef void idetape_io_buf(ide_drive_t *, idetape_pc_t *, unsigned int);
  * command. We will transfer some of the data (as requested by the drive) and
  * will re-point interrupt handler to us. When data transfer is finished, we
  * will act according to the algorithm described before
- * idetape_issue_packet_command.
+ * idetape_issue_pc.
  */
 static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
 {
@@ -1267,7 +1267,7 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
  *
  *	The handling will be done in three stages:
  *
- *	1.	idetape_issue_packet_command will send the packet command to the
+ *	1.	idetape_issue_pc will send the packet command to the
  *		drive, and will set the interrupt handler to idetape_pc_intr.
  *
  *	2.	On each interrupt, idetape_pc_intr will be called. This step
@@ -1342,7 +1342,7 @@ static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive)
 	return ide_started;
 }
 
-static ide_startstop_t idetape_issue_packet_command (ide_drive_t *drive, idetape_pc_t *pc)
+static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, idetape_pc_t *pc)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	idetape_tape_t *tape = drive->driver_data;
@@ -1649,7 +1649,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	 */
 	if (tape->failed_pc != NULL &&
 	    tape->pc->c[0] == REQUEST_SENSE) {
-		return idetape_issue_packet_command(drive, tape->failed_pc);
+		return idetape_issue_pc(drive, tape->failed_pc);
 	}
 	if (postponed_rq != NULL)
 		if (rq != postponed_rq) {
@@ -1730,7 +1730,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	}
 	BUG();
 out:
-	return idetape_issue_packet_command(drive, pc);
+	return idetape_issue_pc(drive, pc);
 }
 
 /*
@@ -2280,11 +2280,8 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct
 	return (tape->blk_size * (blocks-rq.current_nr_sectors));
 }
 
-/*
- *	idetape_insert_pipeline_into_queue is used to start servicing the
- *	pipeline stages, starting from tape->next_stage.
- */
-static void idetape_insert_pipeline_into_queue (ide_drive_t *drive)
+/* start servicing the pipeline stages, starting from tape->next_stage. */
+static void idetape_plug_pipeline(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -2376,7 +2373,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 			spin_unlock_irqrestore(&tape->lock, flags);
 		} else {
 			spin_unlock_irqrestore(&tape->lock, flags);
-			idetape_insert_pipeline_into_queue(drive);
+			idetape_plug_pipeline(drive);
 			if (idetape_pipeline_active(tape))
 				continue;
 			/*
@@ -2413,7 +2410,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 			tape->insert_time = jiffies;
 			tape->insert_size = 0;
 			tape->insert_speed = 0;
-			idetape_insert_pipeline_into_queue(drive);
+			idetape_plug_pipeline(drive);
 		}
 	}
 	if (test_and_clear_bit(IDETAPE_PIPELINE_ERROR, &tape->flags))
@@ -2432,7 +2429,7 @@ static void idetape_wait_for_pipeline (ide_drive_t *drive)
 	unsigned long flags;
 
 	while (tape->next_stage || idetape_pipeline_active(tape)) {
-		idetape_insert_pipeline_into_queue(drive);
+		idetape_plug_pipeline(drive);
 		spin_lock_irqsave(&tape->lock, flags);
 		if (idetape_pipeline_active(tape))
 			idetape_wait_for_request(drive, tape->active_data_rq);
@@ -2525,7 +2522,7 @@ static void idetape_restart_speed_control (ide_drive_t *drive)
 	tape->controlled_previous_head_time = tape->uncontrolled_previous_head_time = jiffies;
 }
 
-static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
+static int idetape_init_read(ide_drive_t *drive, int max_stages)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_stage_t *new_stage;
@@ -2586,7 +2583,7 @@ static int idetape_initiate_read (ide_drive_t *drive, int max_stages)
 			tape->insert_time = jiffies;
 			tape->insert_size = 0;
 			tape->insert_speed = 0;
-			idetape_insert_pipeline_into_queue(drive);
+			idetape_plug_pipeline(drive);
 		}
 	}
 	return 0;
@@ -2616,7 +2613,7 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 	 * Wait for the next block to be available at the head
 	 * of the pipeline
 	 */
-	idetape_initiate_read(drive, tape->max_stages);
+	idetape_init_read(drive, tape->max_stages);
 	if (tape->first_stage == NULL) {
 		if (test_bit(IDETAPE_PIPELINE_ERROR, &tape->flags))
 			return 0;
@@ -2879,7 +2876,8 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
-	if ((rc = idetape_initiate_read(drive, tape->max_stages)) < 0)
+	rc = idetape_init_read(drive, tape->max_stages);
+	if (rc < 0)
 		return rc;
 	if (count == 0)
 		return (0);
-- 
cgit v1.1


From 3c98bf347d95cf9c43104db2fda848d0c7decebd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:53 +0100
Subject: ide-tape: cleanup and fix comments

Also, remove redundant ones and cleanup whitespace.

Bart:
- minor fixups

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 681 ++++++++++++++++++++-----------------------------
 1 file changed, 271 insertions(+), 410 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index bdf02fc..d8b02fb 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -75,37 +75,34 @@ enum {
 
 
 /*
- *	Pipelined mode parameters.
+ * Pipelined mode parameters.
  *
- *	We try to use the minimum number of stages which is enough to
- *	keep the tape constantly streaming. To accomplish that, we implement
- *	a feedback loop around the maximum number of stages:
+ * We try to use the minimum number of stages which is enough to keep the tape
+ * constantly streaming. To accomplish that, we implement a feedback loop around
+ * the maximum number of stages:
  *
- *	We start from MIN maximum stages (we will not even use MIN stages
- *      if we don't need them), increment it by RATE*(MAX-MIN)
- *	whenever we sense that the pipeline is empty, until we reach
- *	the optimum value or until we reach MAX.
+ * We start from MIN maximum stages (we will not even use MIN stages if we don't
+ * need them), increment it by RATE*(MAX-MIN) whenever we sense that the
+ * pipeline is empty, until we reach the optimum value or until we reach MAX.
  *
- *	Setting the following parameter to 0 is illegal: the pipelined mode
- *	cannot be disabled (idetape_calculate_speeds() divides by
- *	tape->max_stages.)
+ * Setting the following parameter to 0 is illegal: the pipelined mode cannot be
+ * disabled (idetape_calculate_speeds() divides by tape->max_stages.)
  */
 #define IDETAPE_MIN_PIPELINE_STAGES	  1
 #define IDETAPE_MAX_PIPELINE_STAGES	400
 #define IDETAPE_INCREASE_STAGES_RATE	 20
 
 /*
- *	After each failed packet command we issue a request sense command
- *	and retry the packet command IDETAPE_MAX_PC_RETRIES times.
+ * After each failed packet command we issue a request sense command and retry
+ * the packet command IDETAPE_MAX_PC_RETRIES times.
  *
- *	Setting IDETAPE_MAX_PC_RETRIES to 0 will disable retries.
+ * Setting IDETAPE_MAX_PC_RETRIES to 0 will disable retries.
  */
 #define IDETAPE_MAX_PC_RETRIES		3
 
 /*
- *	With each packet command, we allocate a buffer of
- *	IDETAPE_PC_BUFFER_SIZE bytes. This is used for several packet
- *	commands (Not for READ/WRITE commands).
+ * With each packet command, we allocate a buffer of IDETAPE_PC_BUFFER_SIZE
+ * bytes. This is used for several packet commands (Not for READ/WRITE commands)
  */
 #define IDETAPE_PC_BUFFER_SIZE		256
 
@@ -124,48 +121,39 @@ enum {
 #define IDETAPE_WAIT_CMD		(900*HZ)
 
 /*
- *	The following parameter is used to select the point in the internal
- *	tape fifo in which we will start to refill the buffer. Decreasing
- *	the following parameter will improve the system's latency and
- *	interactive response, while using a high value might improve system
- *	throughput.
+ * The following parameter is used to select the point in the internal tape fifo
+ * in which we will start to refill the buffer. Decreasing the following
+ * parameter will improve the system's latency and interactive response, while
+ * using a high value might improve system throughput.
  */
-#define IDETAPE_FIFO_THRESHOLD 		2
+#define IDETAPE_FIFO_THRESHOLD		2
 
 /*
- *	DSC polling parameters.
+ * DSC polling parameters.
  *
- *	Polling for DSC (a single bit in the status register) is a very
- *	important function in ide-tape. There are two cases in which we
- *	poll for DSC:
+ * Polling for DSC (a single bit in the status register) is a very important
+ * function in ide-tape. There are two cases in which we poll for DSC:
  *
- *	1.	Before a read/write packet command, to ensure that we
- *		can transfer data from/to the tape's data buffers, without
- *		causing an actual media access. In case the tape is not
- *		ready yet, we take out our request from the device
- *		request queue, so that ide.c will service requests from
- *		the other device on the same interface meanwhile.
+ * 1. Before a read/write packet command, to ensure that we can transfer data
+ * from/to the tape's data buffers, without causing an actual media access.
+ * In case the tape is not ready yet, we take out our request from the device
+ * request queue, so that ide.c could service requests from the other device
+ * on the same interface in the meantime.
  *
- *	2.	After the successful initialization of a "media access
- *		packet command", which is a command which can take a long
- *		time to complete (it can be several seconds or even an hour).
+ * 2. After the successful initialization of a "media access packet command",
+ * which is a command that can take a long time to complete (the interval can
+ * range from several seconds to even an hour). Again, we postpone our request
+ * in the middle to free the bus for the other device. The polling frequency
+ * here should be lower than the read/write frequency since those media access
+ * commands are slow. We start from a "fast" frequency - IDETAPE_DSC_MA_FAST
+ * (1 second), and if we don't receive DSC after IDETAPE_DSC_MA_THRESHOLD
+ * (5 min), we switch it to a lower frequency - IDETAPE_DSC_MA_SLOW (1 min).
  *
- *		Again, we postpone our request in the middle to free the bus
- *		for the other device. The polling frequency here should be
- *		lower than the read/write frequency since those media access
- *		commands are slow. We start from a "fast" frequency -
- *		IDETAPE_DSC_MA_FAST (one second), and if we don't receive DSC
- *		after IDETAPE_DSC_MA_THRESHOLD (5 minutes), we switch it to a
- *		lower frequency - IDETAPE_DSC_MA_SLOW (1 minute).
- *
- *	We also set a timeout for the timer, in case something goes wrong.
- *	The timeout should be longer then the maximum execution time of a
- *	tape operation.
- */
- 
-/*
- *	DSC timings.
+ * We also set a timeout for the timer, in case something goes wrong. The
+ * timeout should be longer then the maximum execution time of a tape operation.
  */
+
+/* DSC timings. */
 #define IDETAPE_DSC_RW_MIN		5*HZ/100	/* 50 msec */
 #define IDETAPE_DSC_RW_MAX		40*HZ/100	/* 400 msec */
 #define IDETAPE_DSC_RW_TIMEOUT		2*60*HZ		/* 2 minutes */
@@ -176,15 +164,9 @@ enum {
 
 /*************************** End of tunable parameters ***********************/
 
-/*
- *	Read/Write error simulation
- */
+/* Read/Write error simulation */
 #define SIMULATE_ERRORS			0
 
-/*
- *	For general magnetic tape device compatibility.
- */
-
 /* tape directions */
 enum {
 	IDETAPE_DIR_NONE  = (1 << 0),
@@ -199,24 +181,32 @@ struct idetape_bh {
 	char *b_data;
 };
 
-/*
- *	Our view of a packet command.
- */
 typedef struct idetape_packet_command_s {
-	u8 c[12];				/* Actual packet bytes */
-	int retries;				/* On each retry, we increment retries */
-	int error;				/* Error code */
-	int request_transfer;			/* Bytes to transfer */
-	int actually_transferred;		/* Bytes actually transferred */
-	int buffer_size;			/* Size of our data buffer */
+	/* Actual packet bytes */
+	u8 c[12];
+	/* On each retry, we increment retries */
+	int retries;
+	/* Error code */
+	int error;
+	/* Bytes to transfer */
+	int request_transfer;
+	/* Bytes actually transferred */
+	int actually_transferred;
+	/* Size of our data buffer */
+	int buffer_size;
 	struct idetape_bh *bh;
 	char *b_data;
 	int b_count;
-	u8 *buffer;				/* Data buffer */
-	u8 *current_position;			/* Pointer into the above buffer */
-	ide_startstop_t (*callback) (ide_drive_t *);	/* Called when this packet command is completed */
-	u8 pc_buffer[IDETAPE_PC_BUFFER_SIZE];	/* Temporary buffer */
-	unsigned long flags;			/* Status/Action bit flags: long for set_bit */
+	/* Data buffer */
+	u8 *buffer;
+	/* Pointer into the above buffer */
+	u8 *current_position;
+	/* Called when this packet command is completed */
+	ide_startstop_t (*callback) (ide_drive_t *);
+	/* Temporary buffer */
+	u8 pc_buffer[IDETAPE_PC_BUFFER_SIZE];
+	/* Status/Action bit flags: long for set_bit */
+	unsigned long flags;
 } idetape_pc_t;
 
 /*
@@ -235,9 +225,7 @@ typedef struct idetape_packet_command_s {
 /* Data direction */
 #define	PC_WRITING			5
 
-/*
- *	A pipeline stage.
- */
+/* A pipeline stage. */
 typedef struct idetape_stage_s {
 	struct request rq;			/* The corresponding request */
 	struct idetape_bh *bh;			/* The data buffers */
@@ -245,9 +233,8 @@ typedef struct idetape_stage_s {
 } idetape_stage_t;
 
 /*
- *	Most of our global data which we need to save even as we leave the
- *	driver due to an interrupt or a timer event is stored in a variable
- *	of type idetape_tape_t, defined below.
+ * Most of our global data which we need to save even as we leave the driver due
+ * to an interrupt or a timer event is stored in the struct defined below.
  */
 typedef struct ide_tape_obj {
 	ide_drive_t	*drive;
@@ -283,15 +270,14 @@ typedef struct ide_tape_obj {
 	int rq_stack_index;
 
 	/*
-	 *	DSC polling variables.
+	 * DSC polling variables.
 	 *
-	 *	While polling for DSC we use postponed_rq to postpone the
-	 *	current request so that ide.c will be able to service
-	 *	pending requests on the other device. Note that at most
-	 *	we will have only one DSC (usually data transfer) request
-	 *	in the device request queue. Additional requests can be
-	 *	queued in our internal pipeline, but they will be visible
-	 *	to ide.c only one at a time.
+	 * While polling for DSC we use postponed_rq to postpone the current
+	 * request so that ide.c will be able to service pending requests on the
+	 * other device. Note that at most we will have only one DSC (usually
+	 * data transfer) request in the device request queue. Additional
+	 * requests can be queued in our internal pipeline, but they will be
+	 * visible to ide.c only one at a time.
 	 */
 	struct request *postponed_rq;
 	/* The time in which we started polling for DSC */
@@ -303,21 +289,15 @@ typedef struct ide_tape_obj {
 	unsigned long dsc_poll_freq;
 	unsigned long dsc_timeout;
 
-	/*
-	 *	Read position information
-	 */
+	/* Read position information */
 	u8 partition;
 	/* Current block */
 	unsigned int first_frame;
 
-	/*
-	 *	Last error information
-	 */
+	/* Last error information */
 	u8 sense_key, asc, ascq;
 
-	/*
-	 *	Character device operation
-	 */
+	/* Character device operation */
 	unsigned int minor;
 	/* device name */
 	char name[4];
@@ -332,33 +312,30 @@ typedef struct ide_tape_obj {
 	u8 caps[20];
 
 	/*
-	 *	Active data transfer request parameters.
-	 *
-	 *	At most, there is only one ide-tape originated data transfer
-	 *	request in the device request queue. This allows ide.c to
-	 *	easily service requests from the other device when we
-	 *	postpone our active request. In the pipelined operation
-	 *	mode, we use our internal pipeline structure to hold
-	 *	more data requests.
+	 * Active data transfer request parameters.
 	 *
-	 *	The data buffer size is chosen based on the tape's
-	 *	recommendation.
+	 * At most, there is only one ide-tape originated data transfer request
+	 * in the device request queue. This allows ide.c to easily service
+	 * requests from the other device when we postpone our active request.
+	 * In the pipelined operation mode, we use our internal pipeline
+	 * structure to hold more data requests. The data buffer size is chosen
+	 * based on the tape's recommendation.
 	 */
-	/* Ptr to the request which is waiting in the device request queue */
+	/* ptr to the request which is waiting in the device request queue */
 	struct request *active_data_rq;
-	/* Data buffer size (chosen based on the tape's recommendation */
+	/* Data buffer size chosen based on the tape's recommendation */
 	int stage_size;
 	idetape_stage_t *merge_stage;
 	int merge_stage_size;
 	struct idetape_bh *bh;
 	char *b_data;
 	int b_count;
-	
+
 	/*
-	 *	Pipeline parameters.
+	 * Pipeline parameters.
 	 *
-	 *	To accomplish non-pipelined mode, we simply set the following
-	 *	variables to zero (or NULL, where appropriate).
+	 * To accomplish non-pipelined mode, we simply set the following
+	 * variables to zero (or NULL, where appropriate).
 	 */
 	/* Number of currently used stages */
 	int nr_stages;
@@ -385,9 +362,7 @@ typedef struct ide_tape_obj {
 	/* protects the ide-tape queue */
 	spinlock_t lock;
 
-	/*
-	 * Measures average tape speed
-	 */
+	/* Measures average tape speed */
 	unsigned long avg_time;
 	int avg_size;
 	int avg_speed;
@@ -400,11 +375,9 @@ typedef struct ide_tape_obj {
 	char write_prot;
 
 	/*
-	 * Limit the number of times a request can
-	 * be postponed, to avoid an infinite postpone
-	 * deadlock.
+	 * Limit the number of times a request can be postponed, to avoid an
+	 * infinite postpone deadlock.
 	 */
-	/* request postpone count limit */
 	int postpone_cnt;
 
 	/*
@@ -419,18 +392,14 @@ typedef struct ide_tape_obj {
 	int tape_head;
 	int last_tape_head;
 
-	/*
-	 * Speed control at the tape buffers input/output
-	 */
+	/* Speed control at the tape buffers input/output */
 	unsigned long insert_time;
 	int insert_size;
 	int insert_speed;
 	int max_insert_speed;
 	int measure_insert_time;
 
-	/*
-	 * Speed regulation negative feedback loop
-	 */
+	/* Speed regulation negative feedback loop */
 	int speed_control;
 	int pipeline_head_speed;
 	int controlled_pipeline_head_speed;
@@ -477,9 +446,7 @@ static void ide_tape_put(struct ide_tape_obj *tape)
 	mutex_unlock(&idetape_ref_mutex);
 }
 
-/*
- *	Tape door status
- */
+/* Tape door status */
 #define DOOR_UNLOCKED			0
 #define DOOR_LOCKED			1
 #define DOOR_EXPLICITLY_LOCKED		2
@@ -499,30 +466,23 @@ static void ide_tape_put(struct ide_tape_obj *tape)
 /* 0 = no tape is loaded, so we don't rewind after ejecting */
 #define IDETAPE_MEDIUM_PRESENT		9
 
-/*
- *	Some defines for the READ BUFFER command
- */
+/* A define for the READ BUFFER command */
 #define IDETAPE_RETRIEVE_FAULTY_BLOCK	6
 
-/*
- *	Some defines for the SPACE command
- */
+/* Some defines for the SPACE command */
 #define IDETAPE_SPACE_OVER_FILEMARK	1
 #define IDETAPE_SPACE_TO_EOD		3
 
-/*
- *	Some defines for the LOAD UNLOAD command
- */
+/* Some defines for the LOAD UNLOAD command */
 #define IDETAPE_LU_LOAD_MASK		1
 #define IDETAPE_LU_RETENSION_MASK	2
 #define IDETAPE_LU_EOT_MASK		4
 
 /*
- *	Special requests for our block device strategy routine.
+ * Special requests for our block device strategy routine.
  *
- *	In order to service a character device command, we add special
- *	requests to the tail of our block device request queue and wait
- *	for their completion.
+ * In order to service a character device command, we add special requests to
+ * the tail of our block device request queue and wait for their completion.
  */
 
 enum {
@@ -533,10 +493,7 @@ enum {
 	REQ_IDETAPE_READ_BUFFER	= (1 << 4),
 };
 
-/*
- *	Error codes which are returned in rq->errors to the higher part
- *	of the driver.
- */
+/* Error codes returned in rq->errors to the higher part of the driver. */
 #define	IDETAPE_ERROR_GENERAL		101
 #define	IDETAPE_ERROR_FILEMARK		102
 #define	IDETAPE_ERROR_EOD		103
@@ -560,8 +517,8 @@ struct idetape_id_gcw {
 #define	IDETAPE_CAPABILITIES_PAGE	0x2a
 
 /*
- *	The variables below are used for the character device interface.
- *	Additional state variables are defined in our ide_drive_t structure.
+ * The variables below are used for the character device interface. Additional
+ * state variables are defined in our ide_drive_t structure.
  */
 static struct ide_tape_obj * idetape_devs[MAX_HWIFS * MAX_DRIVES];
 
@@ -579,10 +536,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-/*
- *      Function declarations
- *
- */
 static int idetape_chrdev_release (struct inode *inode, struct file *filp);
 static void idetape_write_release (ide_drive_t *drive, unsigned int minor);
 
@@ -711,9 +664,6 @@ static struct request *idetape_next_rq_storage (ide_drive_t *drive)
 	return (&tape->rq_stack[tape->rq_stack_index++]);
 }
 
-/*
- *	idetape_init_pc initializes a packet command.
- */
 static void idetape_init_pc (idetape_pc_t *pc)
 {
 	memset(pc->c, 0, 12);
@@ -809,10 +759,7 @@ static void idetape_activate_next_stage(ide_drive_t *drive)
 	tape->next_stage = stage->next;
 }
 
-/*
- *	idetape_kfree_stage calls kfree to completely free a stage, along with
- *	its related buffers.
- */
+/* Free a stage along with its related buffers completely. */
 static void __idetape_kfree_stage (idetape_stage_t *stage)
 {
 	struct idetape_bh *prev_bh, *bh = stage->bh;
@@ -840,8 +787,8 @@ static void idetape_kfree_stage (idetape_tape_t *tape, idetape_stage_t *stage)
 }
 
 /*
- *	idetape_remove_stage_head removes tape->first_stage from the pipeline.
- *	The caller should avoid race conditions.
+ * Remove tape->first_stage from the pipeline. The caller should avoid race
+ * conditions.
  */
 static void idetape_remove_stage_head (ide_drive_t *drive)
 {
@@ -899,8 +846,8 @@ static void idetape_abort_pipeline(ide_drive_t *drive,
 }
 
 /*
- *	idetape_end_request is used to finish servicing a request, and to
- *	insert a pending pipeline request into the main device queue.
+ * Finish servicing a request and insert a pending pipeline request into the
+ * main device queue.
  */
 static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 {
@@ -951,9 +898,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 		if (tape->next_stage != NULL) {
 			idetape_activate_next_stage(drive);
 
-			/*
-			 * Insert the next request into the request queue.
-			 */
+			/* Insert the next request into the request queue. */
 			(void)ide_do_drive_cmd(drive, tape->active_data_rq,
 						ide_end);
 		} else if (!error) {
@@ -1020,23 +965,19 @@ static void idetape_init_rq(struct request *rq, u8 cmd)
 }
 
 /*
- *	idetape_queue_pc_head generates a new packet command request in front
- *	of the request queue, before the current request, so that it will be
- *	processed immediately, on the next pass through the driver.
- *
- *	idetape_queue_pc_head is called from the request handling part of
- *	the driver (the "bottom" part). Safe storage for the request should
- *	be allocated with idetape_next_pc_storage and idetape_next_rq_storage
- *	before calling idetape_queue_pc_head.
+ * Generate a new packet command request in front of the request queue, before
+ * the current request, so that it will be processed immediately, on the next
+ * pass through the driver. The function below is called from the request
+ * handling part of the driver (the "bottom" part). Safe storage for the request
+ * should be allocated with ide_tape_next_{pc,rq}_storage() prior to that.
  *
- *	Memory for those requests is pre-allocated at initialization time, and
- *	is limited to IDETAPE_PC_STACK requests. We assume that we have enough
- *	space for the maximum possible number of inter-dependent packet commands.
+ * Memory for those requests is pre-allocated at initialization time, and is
+ * limited to IDETAPE_PC_STACK requests. We assume that we have enough space for
+ * the maximum possible number of inter-dependent packet commands.
  *
- *	The higher level of the driver - The ioctl handler and the character
- *	device handling functions should queue request to the lower level part
- *	and wait for their completion using idetape_queue_pc_tail or
- *	idetape_queue_rw_tail.
+ * The higher level of the driver - The ioctl handler and the character device
+ * handling functions should queue request to the lower level part and wait for
+ * their completion using idetape_queue_pc_tail or idetape_queue_rw_tail.
  */
 static void idetape_queue_pc_head (ide_drive_t *drive, idetape_pc_t *pc,struct request *rq)
 {
@@ -1069,9 +1010,8 @@ static ide_startstop_t idetape_retry_pc (ide_drive_t *drive)
 }
 
 /*
- *	idetape_postpone_request postpones the current request so that
- *	ide.c will be able to service requests from another device on
- *	the same hwgroup while we are polling for DSC.
+ * Postpone the current request so that ide.c will be able to service requests
+ * from another device on the same hwgroup while we are polling for DSC.
  */
 static void idetape_postpone_request (ide_drive_t *drive)
 {
@@ -1258,46 +1198,40 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
 }
 
 /*
- *	Packet Command Interface
+ * Packet Command Interface
  *
- *	The current Packet Command is available in tape->pc, and will not
- *	change until we finish handling it. Each packet command is associated
- *	with a callback function that will be called when the command is
- *	finished.
+ * The current Packet Command is available in tape->pc, and will not change
+ * until we finish handling it. Each packet command is associated with a
+ * callback function that will be called when the command is finished.
  *
- *	The handling will be done in three stages:
+ * The handling will be done in three stages:
  *
- *	1.	idetape_issue_pc will send the packet command to the
- *		drive, and will set the interrupt handler to idetape_pc_intr.
+ * 1. idetape_issue_pc will send the packet command to the drive, and will set
+ * the interrupt handler to idetape_pc_intr.
  *
- *	2.	On each interrupt, idetape_pc_intr will be called. This step
- *		will be repeated until the device signals us that no more
- *		interrupts will be issued.
+ * 2. On each interrupt, idetape_pc_intr will be called. This step will be
+ * repeated until the device signals us that no more interrupts will be issued.
  *
- *	3.	ATAPI Tape media access commands have immediate status with a
- *		delayed process. In case of a successful initiation of a
- *		media access packet command, the DSC bit will be set when the
- *		actual execution of the command is finished. 
- *		Since the tape drive will not issue an interrupt, we have to
- *		poll for this event. In this case, we define the request as
- *		"low priority request" by setting rq_status to
- *		IDETAPE_RQ_POSTPONED, 	set a timer to poll for DSC and exit
- *		the driver.
+ * 3. ATAPI Tape media access commands have immediate status with a delayed
+ * process. In case of a successful initiation of a media access packet command,
+ * the DSC bit will be set when the actual execution of the command is finished.
+ * Since the tape drive will not issue an interrupt, we have to poll for this
+ * event. In this case, we define the request as "low priority request" by
+ * setting rq_status to IDETAPE_RQ_POSTPONED, set a timer to poll for DSC and
+ * exit the driver.
  *
- *		ide.c will then give higher priority to requests which
- *		originate from the other device, until will change rq_status
- *		to RQ_ACTIVE.
+ * ide.c will then give higher priority to requests which originate from the
+ * other device, until will change rq_status to RQ_ACTIVE.
  *
- *	4.	When the packet command is finished, it will be checked for errors.
+ * 4. When the packet command is finished, it will be checked for errors.
  *
- *	5.	In case an error was found, we queue a request sense packet
- *		command in front of the request queue and retry the operation
- *		up to IDETAPE_MAX_PC_RETRIES times.
- *
- *	6.	In case no error was found, or we decided to give up and not
- *		to retry again, the callback function will be called and then
- *		we will handle the next request.
+ * 5. In case an error was found, we queue a request sense packet command in
+ * front of the request queue and retry the operation up to
+ * IDETAPE_MAX_PC_RETRIES times.
  *
+ * 6. In case no error was found, or we decided to give up and not to retry
+ * again, the callback function will be called and then we will handle the next
+ * request.
  */
 static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive)
 {
@@ -1363,9 +1297,9 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, idetape_pc_t *pc)
 	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
 	    test_bit(PC_ABORT, &pc->flags)) {
 		/*
-		 *	We will "abort" retrying a packet command in case
-		 *	a legitimate error code was received (crossing a
-		 *	filemark, or end of the media, for example).
+		 * We will "abort" retrying a packet command in case legitimate
+		 * error code was received (crossing a filemark, or end of the
+		 * media, for example).
 		 */
 		if (!test_bit(PC_ABORT, &pc->flags)) {
 			if (!(pc->c[0] == TEST_UNIT_READY &&
@@ -1416,9 +1350,6 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, idetape_pc_t *pc)
 	}
 }
 
-/*
- *	General packet command callback function.
- */
 static ide_startstop_t idetape_pc_callback (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1429,15 +1360,14 @@ static ide_startstop_t idetape_pc_callback (ide_drive_t *drive)
 	return ide_stopped;
 }
 
-/*
- *	A mode sense command is used to "sense" tape parameters.
- */
+/* A mode sense command is used to "sense" tape parameters. */
 static void idetape_create_mode_sense_cmd (idetape_pc_t *pc, u8 page_code)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = MODE_SENSE;
 	if (page_code != IDETAPE_BLOCK_DESCRIPTOR)
-		pc->c[1] = 8;	/* DBD = 1 - Don't return block descriptors */
+		/* DBD = 1 - Don't return block descriptors */
+		pc->c[1] = 8;
 	pc->c[2] = page_code;
 	/*
 	 * Changed pc->c[3] to 0 (255 will at best return unused info).
@@ -1447,7 +1377,8 @@ static void idetape_create_mode_sense_cmd (idetape_pc_t *pc, u8 page_code)
 	 * and return an error when 255 is used.
 	 */
 	pc->c[3] = 0;
-	pc->c[4] = 255;		/* (We will just discard data in that case) */
+	/* We will just discard data in that case */
+	pc->c[4] = 255;
 	if (page_code == IDETAPE_BLOCK_DESCRIPTOR)
 		pc->request_transfer = 12;
 	else if (page_code == IDETAPE_CAPABILITIES_PAGE)
@@ -1619,9 +1550,6 @@ static void idetape_create_write_cmd(idetape_tape_t *tape, idetape_pc_t *pc, uns
 		set_bit(PC_DMA_RECOMMENDED, &pc->flags);
 }
 
-/*
- * idetape_do_request is our request handling function.	
- */
 static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 					  struct request *rq, sector_t block)
 {
@@ -1635,18 +1563,14 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			rq->sector, rq->nr_sectors, rq->current_nr_sectors);
 
 	if (!blk_special_request(rq)) {
-		/*
-		 * We do not support buffer cache originated requests.
-		 */
+		/* We do not support buffer cache originated requests. */
 		printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
 			"request queue (%d)\n", drive->name, rq->cmd_type);
 		ide_end_request(drive, 0, 0);
 		return ide_stopped;
 	}
 
-	/*
-	 *	Retry a failed packet command
-	 */
+	/* Retry a failed packet command */
 	if (tape->failed_pc != NULL &&
 	    tape->pc->c[0] == REQUEST_SENSE) {
 		return idetape_issue_pc(drive, tape->failed_pc);
@@ -1733,9 +1657,7 @@ out:
 	return idetape_issue_pc(drive, pc);
 }
 
-/*
- *	Pipeline related functions
- */
+/* Pipeline related functions */
 static inline int idetape_pipeline_active (idetape_tape_t *tape)
 {
 	int rc1, rc2;
@@ -1746,16 +1668,16 @@ static inline int idetape_pipeline_active (idetape_tape_t *tape)
 }
 
 /*
- *	idetape_kmalloc_stage uses __get_free_page to allocate a pipeline
- *	stage, along with all the necessary small buffers which together make
- *	a buffer of size tape->stage_size (or a bit more). We attempt to
- *	combine sequential pages as much as possible.
+ * The function below uses __get_free_page to allocate a pipeline stage, along
+ * with all the necessary small buffers which together make a buffer of size
+ * tape->stage_size (or a bit more). We attempt to combine sequential pages as
+ * much as possible.
  *
- *	Returns a pointer to the new allocated stage, or NULL if we
- *	can't (or don't want to) allocate a stage.
+ * It returns a pointer to the new allocated stage, or NULL if we can't (or
+ * don't want to) allocate a stage.
  *
- *	Pipeline stages are optional and are used to increase performance.
- *	If we can't allocate them, we'll manage without them.
+ * Pipeline stages are optional and are used to increase performance. If we
+ * can't allocate them, we'll manage without them.
  */
 static idetape_stage_t *__idetape_kmalloc_stage (idetape_tape_t *tape, int full, int clear)
 {
@@ -1913,9 +1835,7 @@ static void idetape_switch_buffers (idetape_tape_t *tape, idetape_stage_t *stage
 	idetape_init_merge_stage(tape);
 }
 
-/*
- *	idetape_add_stage_tail adds a new stage at the end of the pipeline.
- */
+/* Add a new stage at the end of the pipeline. */
 static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1937,12 +1857,9 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 	spin_unlock_irqrestore(&tape->lock, flags);
 }
 
-/*
- *	idetape_wait_for_request installs a completion in a pending request
- *	and sleeps until it is serviced.
- *
- *	The caller should ensure that the request will not be serviced
- *	before we install the completion (usually by disabling interrupts).
+/* Install a completion in a pending request and sleep until it is serviced. The
+ * caller should ensure that the request will not be serviced before we install
+ * the completion (usually by disabling interrupts).
  */
 static void idetape_wait_for_request (ide_drive_t *drive, struct request *rq)
 {
@@ -1996,12 +1913,8 @@ static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
 }
 
 /*
- *	idetape_create_write_filemark_cmd will:
- *
- *		1.	Write a filemark if write_filemark=1.
- *		2.	Flush the device buffers without writing a filemark
- *			if write_filemark=0.
- *
+ * Write a filemark if write_filemark=1. Flush the device buffers without
+ * writing a filemark otherwise.
  */
 static void idetape_create_write_filemark_cmd (ide_drive_t *drive, idetape_pc_t *pc,int write_filemark)
 {
@@ -2020,24 +1933,17 @@ static void idetape_create_test_unit_ready_cmd(idetape_pc_t *pc)
 }
 
 /*
- *	idetape_queue_pc_tail is based on the following functions:
- *
- *	ide_do_drive_cmd from ide.c
- *	cdrom_queue_request and cdrom_queue_packet_command from ide-cd.c
- *
- *	We add a special packet command request to the tail of the request
- *	queue, and wait for it to be serviced.
+ * We add a special packet command request to the tail of the request queue, and
+ * wait for it to be serviced. This is not to be called from within the request
+ * handling part of the driver! We allocate here data on the stack and it is
+ * valid until the request is finished. This is not the case for the bottom part
+ * of the driver, where we are always leaving the functions to wait for an
+ * interrupt or a timer event.
  *
- *	This is not to be called from within the request handling part
- *	of the driver ! We allocate here data in the stack, and it is valid
- *	until the request is finished. This is not the case for the bottom
- *	part of the driver, where we are always leaving the functions to wait
- *	for an interrupt or a timer event.
- *
- *	From the bottom part of the driver, we should allocate safe memory
- *	using idetape_next_pc_storage and idetape_next_rq_storage, and add
- *	the request to the request list without waiting for it to be serviced !
- *	In that case, we usually use idetape_queue_pc_head.
+ * From the bottom part of the driver, we should allocate safe memory using
+ * idetape_next_pc_storage() and ide_tape_next_rq_storage(), and add the request
+ * to the request list without waiting for it to be serviced! In that case, we
+ * usually use idetape_queue_pc_head().
  */
 static int __idetape_queue_pc_tail (ide_drive_t *drive, idetape_pc_t *pc)
 {
@@ -2065,9 +1971,7 @@ static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
 	idetape_pc_t pc;
 	int load_attempted = 0;
 
-	/*
-	 * Wait for the tape to become ready
-	 */
+	/* Wait for the tape to become ready */
 	set_bit(IDETAPE_MEDIUM_PRESENT, &tape->flags);
 	timeout += jiffies;
 	while (time_before(jiffies, timeout)) {
@@ -2075,7 +1979,8 @@ static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
 		if (!__idetape_queue_pc_tail(drive, &pc))
 			return 0;
 		if ((tape->sense_key == 2 && tape->asc == 4 && tape->ascq == 2)
-		    || (tape->asc == 0x3A)) {	/* no media */
+		    || (tape->asc == 0x3A)) {
+			/* no media */
 			if (load_attempted)
 				return -ENOMEDIUM;
 			idetape_create_load_unload_cmd(drive, &pc, IDETAPE_LU_LOAD_MASK);
@@ -2203,13 +2108,10 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 }
 
 /*
- *	idetape_position_tape positions the tape to the requested block
- *	using the LOCATE packet command. A READ POSITION command is then
- *	issued to check where we are positioned.
- *
- *	Like all higher level operations, we queue the commands at the tail
- *	of the request queue and wait for their completion.
- *	
+ * Position the tape to the requested block using the LOCATE packet command.
+ * A READ POSITION command is then issued to check where we are positioned. Like
+ * all higher level operations, we queue the commands at the tail of the request
+ * queue and wait for their completion.
  */
 static int idetape_position_tape (ide_drive_t *drive, unsigned int block, u8 partition, int skip)
 {
@@ -2247,8 +2149,8 @@ static void idetape_discard_read_pipeline (ide_drive_t *drive, int restore_posit
 }
 
 /*
- * idetape_queue_rw_tail generates a read/write request for the block
- * device interface and wait for it to be serviced.
+ * Generate a read/write request for the block device interface and wait for it
+ * to be serviced.
  */
 static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct idetape_bh *bh)
 {
@@ -2343,15 +2245,15 @@ static void idetape_wait_first_stage (ide_drive_t *drive)
 }
 
 /*
- *	idetape_add_chrdev_write_request tries to add a character device
- *	originated write request to our pipeline. In case we don't succeed,
- *	we revert to non-pipelined operation mode for this request.
+ * Try to add a character device originated write request to our pipeline. In
+ * case we don't succeed, we revert to non-pipelined operation mode for this
+ * request. In order to accomplish that, we
  *
- *	1.	Try to allocate a new pipeline stage.
- *	2.	If we can't, wait for more and more requests to be serviced
- *		and try again each time.
- *	3.	If we still can't allocate a stage, fallback to
- *		non-pipelined operation mode for this request.
+ * 1. Try to allocate a new pipeline stage.
+ * 2. If we can't, wait for more and more requests to be serviced and try again
+ * each time.
+ * 3. If we still can't allocate a stage, fallback to non-pipelined operation
+ * mode for this request.
  */
 static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 {
@@ -2362,10 +2264,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-     	/*
-     	 *	Attempt to allocate a new stage.
-	 *	Pay special attention to possible race conditions.
-	 */
+	/* Attempt to allocate a new stage. Beware possible race conditions. */
 	while ((new_stage = idetape_kmalloc_stage(tape)) == NULL) {
 		spin_lock_irqsave(&tape->lock, flags);
 		if (idetape_pipeline_active(tape)) {
@@ -2377,8 +2276,8 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 			if (idetape_pipeline_active(tape))
 				continue;
 			/*
-			 *	Linux is short on memory. Fallback to
-			 *	non-pipelined operation mode for this request.
+			 * The machine is short on memory. Fallback to non-
+			 * pipelined operation mode for this request.
 			 */
 			return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, tape->merge_stage->bh);
 		}
@@ -2395,11 +2294,11 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 	idetape_calculate_speeds(drive);
 
 	/*
-	 *	Estimate whether the tape has stopped writing by checking
-	 *	if our write pipeline is currently empty. If we are not
-	 *	writing anymore, wait for the pipeline to be full enough
-	 *	(90%) before starting to service requests, so that we will
-	 *	be able to keep up with the higher speeds of the tape.
+	 * Estimate whether the tape has stopped writing by checking if our
+	 * write pipeline is currently empty. If we are not writing anymore,
+	 * wait for the pipeline to be almost completely full (90%) before
+	 * starting to service requests, so that we will be able to keep up with
+	 * the higher speeds of the tape.
 	 */
 	if (!idetape_pipeline_active(tape)) {
 		if (tape->nr_stages >= tape->max_stages * 9 / 10 ||
@@ -2420,8 +2319,8 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 }
 
 /*
- *	idetape_wait_for_pipeline will wait until all pending pipeline
- *	requests are serviced. Typically called on device close.
+ * Wait until all pending pipeline requests are serviced. Typically called on
+ * device close.
  */
 static void idetape_wait_for_pipeline (ide_drive_t *drive)
 {
@@ -2490,10 +2389,10 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 
 	/*
-	 *	On the next backup, perform the feedback loop again.
-	 *	(I don't want to keep sense information between backups,
-	 *	 as some systems are constantly on, and the system load
-	 *	 can be totally different on the next backup).
+	 * On the next backup, perform the feedback loop again. (I don't want to
+	 * keep sense information between backups, as some systems are
+	 * constantly on, and the system load can be totally different on the
+	 * next backup).
 	 */
 	tape->max_stages = tape->min_pipeline;
 	if (tape->first_stage != NULL ||
@@ -2545,11 +2444,10 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
 		tape->chrdev_dir = IDETAPE_DIR_READ;
 
 		/*
-		 *	Issue a read 0 command to ensure that DSC handshake
-		 *	is switched from completion mode to buffer available
-		 *	mode.
-		 *	No point in issuing this if DSC overlap isn't supported,
-		 *	some drives (Seagate STT3401A) will return an error.
+		 * Issue a read 0 command to ensure that DSC handshake is
+		 * switched from completion mode to buffer available mode.
+		 * No point in issuing this if DSC overlap isn't supported, some
+		 * drives (Seagate STT3401A) will return an error.
 		 */
 		if (drive->dsc_overlap) {
 			bytes_read = idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, 0, tape->merge_stage->bh);
@@ -2590,9 +2488,8 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
 }
 
 /*
- *	idetape_add_chrdev_read_request is called from idetape_chrdev_read
- *	to service a character device read request and add read-ahead
- *	requests to our pipeline.
+ * Called from idetape_chrdev_read() to service a character device read request
+ * and add read-ahead requests to our pipeline.
  */
 static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 {
@@ -2603,16 +2500,11 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 
 	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
-	/*
-	 * If we are at a filemark, return a read length of 0
-	 */
+	/* If we are at a filemark, return a read length of 0 */
 	if (test_bit(IDETAPE_FILEMARK, &tape->flags))
 		return 0;
 
-	/*
-	 * Wait for the next block to be available at the head
-	 * of the pipeline
-	 */
+	/* Wait for the next block to reach the head of the pipeline. */
 	idetape_init_read(drive, tape->max_stages);
 	if (tape->first_stage == NULL) {
 		if (test_bit(IDETAPE_PIPELINE_ERROR, &tape->flags))
@@ -2651,7 +2543,7 @@ static void idetape_pad_zeros (ide_drive_t *drive, int bcount)
 	idetape_tape_t *tape = drive->driver_data;
 	struct idetape_bh *bh;
 	int blocks;
-	
+
 	while (bcount) {
 		unsigned int count;
 
@@ -2691,10 +2583,9 @@ static int idetape_pipeline_size (ide_drive_t *drive)
 }
 
 /*
- *	Rewinds the tape to the Beginning Of the current Partition (BOP).
- *
- *	We currently support only one partition.
- */ 
+ * Rewinds the tape to the Beginning Of the current Partition (BOP). We
+ * currently support only one partition.
+ */
 static int idetape_rewind_tape (ide_drive_t *drive)
 {
 	int retval;
@@ -2716,13 +2607,7 @@ static int idetape_rewind_tape (ide_drive_t *drive)
 	return 0;
 }
 
-/*
- *	Our special ide-tape ioctl's.
- *
- *	Currently there aren't any ioctl's.
- *	mtio.h compatible commands should be issued to the character device
- *	interface.
- */
+/* mtio.h compatible commands should be issued to the chrdev interface. */
 static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned long arg)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -2756,13 +2641,11 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned l
 }
 
 /*
- *	idetape_space_over_filemarks is now a bit more complicated than just
- *	passing the command to the tape since we may have crossed some
- *	filemarks during our pipelined read-ahead mode.
- *
- *	As a minor side effect, the pipeline enables us to support MTFSFM when
- *	the filemark is in our internal pipeline even if the tape doesn't
- *	support spacing over filemarks in the reverse direction.
+ * The function below is now a bit more complicated than just passing the
+ * command to the tape since we may have crossed some filemarks during our
+ * pipelined read-ahead mode. As a minor side effect, the pipeline enables us to
+ * support MTFSFM when the filemark is in our internal pipeline even if the tape
+ * doesn't support spacing over filemarks in the reverse direction.
  */
 static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_count)
 {
@@ -2781,10 +2664,7 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		/*
-		 *	We have a read-ahead buffer. Scan it for crossed
-		 *	filemarks.
-		 */
+		/* its a read-ahead buffer, scan it for crossed filemarks. */
 		tape->merge_stage_size = 0;
 		if (test_and_clear_bit(IDETAPE_FILEMARK, &tape->flags))
 			++count;
@@ -2797,13 +2677,15 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 			spin_lock_irqsave(&tape->lock, flags);
 			if (tape->first_stage == tape->active_stage) {
 				/*
-				 *	We have reached the active stage in the read pipeline.
-				 *	There is no point in allowing the drive to continue
-				 *	reading any farther, so we stop the pipeline.
+				 * We have reached the active stage in the read
+				 * pipeline. There is no point in allowing the
+				 * drive to continue reading any farther, so we
+				 * stop the pipeline.
 				 *
-				 *	This section should be moved to a separate subroutine,
-				 *	because a similar function is performed in
-				 *	__idetape_discard_read_pipeline(), for example.
+				 * This section should be moved to a separate
+				 * subroutine because similar operations are
+				 * done in __idetape_discard_read_pipeline(),
+				 * for example.
 				 */
 				tape->next_stage = NULL;
 				spin_unlock_irqrestore(&tape->lock, flags);
@@ -2819,8 +2701,8 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 	}
 
 	/*
-	 *	The filemark was not found in our internal pipeline.
-	 *	Now we can issue the space command.
+	 * The filemark was not found in our internal pipeline;	now we can issue
+	 * the space command.
 	 */
 	switch (mt_op) {
 		case MTFSF:
@@ -2843,21 +2725,19 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 
 
 /*
- *	Our character device read / write functions.
+ * Our character device read / write functions.
  *
- *	The tape is optimized to maximize throughput when it is transferring
- *	an integral number of the "continuous transfer limit", which is
- *	a parameter of the specific tape (26 KB on my particular tape).
- *      (32 kB for Onstream)
+ * The tape is optimized to maximize throughput when it is transferring an
+ * integral number of the "continuous transfer limit", which is a parameter of
+ * the specific tape (26kB on my particular tape, 32kB for Onstream).
  *
- *	As of version 1.3 of the driver, the character device provides an
- *	abstract continuous view of the media - any mix of block sizes (even 1
- *	byte) on the same backup/restore procedure is supported. The driver
- *	will internally convert the requests to the recommended transfer unit,
- *	so that an unmatch between the user's block size to the recommended
- *	size will only result in a (slightly) increased driver overhead, but
- *	will no longer hit performance.
- *      This is not applicable to Onstream.
+ * As of version 1.3 of the driver, the character device provides an abstract
+ * continuous view of the media - any mix of block sizes (even 1 byte) on the
+ * same backup/restore procedure is supported. The driver will internally
+ * convert the requests to the recommended transfer unit, so that an unmatch
+ * between the user's block size to the recommended size will only result in a
+ * (slightly) increased driver overhead, but will no longer hit performance.
+ * This is not applicable to Onstream.
  */
 static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 				    size_t count, loff_t *ppos)
@@ -2950,11 +2830,10 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 		idetape_init_merge_stage(tape);
 
 		/*
-		 *	Issue a write 0 command to ensure that DSC handshake
-		 *	is switched from completion mode to buffer available
-		 *	mode.
-		 *	No point in issuing this if DSC overlap isn't supported,
-		 *	some drives (Seagate STT3401A) will return an error.
+		 * Issue a write 0 command to ensure that DSC handshake is
+		 * switched from completion mode to buffer available mode. No
+		 * point in issuing this if DSC overlap isn't supported, some
+		 * drives (Seagate STT3401A) will return an error.
 		 */
 		if (drive->dsc_overlap) {
 			ssize_t retval = idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, 0, tape->merge_stage->bh);
@@ -3045,9 +2924,7 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
 
 	debug_log(DBG_ERR, "Handling MTIOCTOP ioctl: mt_op=%d, mt_count=%d\n",
 			mt_op, mt_count);
-	/*
-	 *	Commands which need our pipelined read-ahead stages.
-	 */
+	/* Commands which need our pipelined read-ahead stages. */
 	switch (mt_op) {
 		case MTFSF:
 		case MTFSFM:
@@ -3235,9 +3112,6 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive)
 	tape->drv_write_prot = (pc.buffer[2] & 0x80) >> 7;
 }
 
-/*
- *	Our character device open function.
- */
 static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 {
 	unsigned int minor = iminor(inode), i = minor & ~0xc0;
@@ -3304,9 +3178,7 @@ static int idetape_chrdev_open (struct inode *inode, struct file *filp)
 		}
 	}
 
-	/*
-	 * Lock the tape drive door so user can't eject.
-	 */
+	/* Lock the tape drive door so user can't eject. */
 	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
 		if (idetape_create_prevent_cmd(drive, &pc, 1)) {
 			if (!idetape_queue_pc_tail(drive, &pc)) {
@@ -3341,9 +3213,6 @@ static void idetape_write_release (ide_drive_t *drive, unsigned int minor)
 	idetape_flush_tape_buffers(drive);
 }
 
-/*
- *	Our character device release function.
- */
 static int idetape_chrdev_release (struct inode *inode, struct file *filp)
 {
 	struct ide_tape_obj *tape = ide_tape_f(filp);
@@ -3500,9 +3369,6 @@ static void idetape_add_settings (ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-/*
- *			drive	setting name		read/write	data type	min			max			mul_factor			div_factor	data pointer				set function
- */
 	ide_add_setting(drive, "buffer", SETTING_READ, TYPE_SHORT, 0, 0xffff,
 			1, 2, (u16 *)&tape->caps[16], NULL);
 	ide_add_setting(drive,	"pipeline_min",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->min_pipeline,			NULL);
@@ -3529,16 +3395,15 @@ static inline void idetape_add_settings(ide_drive_t *drive) { ; }
 #endif
 
 /*
- *	ide_setup is called to:
+ * The function below is called to:
  *
- *		1.	Initialize our various state variables.
- *		2.	Ask the tape for its capabilities.
- *		3.	Allocate a buffer which will be used for data
- *			transfer. The buffer size is chosen based on
- *			the recommendation which we received in step (2).
+ * 1. Initialize our various state variables.
+ * 2. Ask the tape for its capabilities.
+ * 3. Allocate a buffer which will be used for data transfer. The buffer size
+ * is chosen based on the recommendation which we received in step 2.
  *
- *	Note that at this point ide.c already assigned us an irq, so that
- *	we can queue requests here and wait for their completion.
+ * Note that at this point ide.c already assigned us an irq, so that we can
+ * queue requests here and wait for their completion.
  */
 static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 {
@@ -3572,7 +3437,7 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		set_bit(IDETAPE_DRQ_INTERRUPT, &tape->flags);
 
 	tape->min_pipeline = tape->max_pipeline = tape->max_stages = 10;
-	
+
 	idetape_get_inquiry_results(drive);
 	idetape_get_mode_sense_results(drive);
 	ide_tape_get_bsize_from_bdesc(drive);
@@ -3595,9 +3460,7 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 
 	tape->max_stages = speed * 1000 * 10 / tape->stage_size;
 
-	/*
-	 * 	Limit memory use for pipeline to 10% of physical memory
-	 */
+	/* Limit memory use for pipeline to 10% of physical memory */
 	si_meminfo(&si);
 	if (tape->max_stages * tape->stage_size > si.totalram * si.mem_unit / 10)
 		tape->max_stages = si.totalram * si.mem_unit / (10 * tape->stage_size);
@@ -3617,8 +3480,8 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		t = t1;
 
 	/*
-	 *	Ensure that the number we got makes sense; limit
-	 *	it within IDETAPE_DSC_RW_MIN and IDETAPE_DSC_RW_MAX.
+	 * Ensure that the number we got makes sense; limit it within
+	 * IDETAPE_DSC_RW_MIN and IDETAPE_DSC_RW_MAX.
 	 */
 	tape->best_dsc_rw_freq = max_t(unsigned long,
 				min_t(unsigned long, t, IDETAPE_DSC_RW_MAX),
@@ -3706,9 +3569,7 @@ static ide_driver_t idetape_driver = {
 #endif
 };
 
-/*
- *	Our character device supporting functions, passed to register_chrdev.
- */
+/* Our character device supporting functions, passed to register_chrdev. */
 static const struct file_operations idetape_fops = {
 	.owner		= THIS_MODULE,
 	.read		= idetape_chrdev_read,
-- 
cgit v1.1


From 71071b8e60d6dab130e428a016b872e2623eddaa Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:53 +0100
Subject: ide-tape: remove struct idetape_id_gcw

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 56 +++++++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d8b02fb..aed2559 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -498,20 +498,6 @@ enum {
 #define	IDETAPE_ERROR_FILEMARK		102
 #define	IDETAPE_ERROR_EOD		103
 
-/*
- *	The following is used to format the general configuration word of
- *	the ATAPI IDENTIFY DEVICE command.
- */
-struct idetape_id_gcw {	
-	unsigned packet_size		:2;	/* Packet Size */
-	unsigned reserved234		:3;	/* Reserved */
-	unsigned drq_type		:2;	/* Command packet DRQ type */
-	unsigned removable		:1;	/* Removable media */
-	unsigned device_type		:5;	/* Device type */
-	unsigned reserved13		:1;	/* Reserved */
-	unsigned protocol		:2;	/* Protocol type */
-};
-
 /* Structures related to the SELECT SENSE / MODE SENSE packet commands. */
 #define IDETAPE_BLOCK_DESCRIPTOR	0
 #define	IDETAPE_CAPABILITIES_PAGE	0x2a
@@ -3254,37 +3240,39 @@ static int idetape_chrdev_release (struct inode *inode, struct file *filp)
 }
 
 /*
- *	idetape_identify_device is called to check the contents of the
- *	ATAPI IDENTIFY command results. We return:
+ * check the contents of the ATAPI IDENTIFY command results. We return:
  *
- *	1	If the tape can be supported by us, based on the information
- *		we have so far.
+ * 1 - If the tape can be supported by us, based on the information we have so
+ * far.
  *
- *	0 	If this tape driver is not currently supported by us.
+ * 0 - If this tape driver is not currently supported by us.
  */
-static int idetape_identify_device (ide_drive_t *drive)
+static int idetape_identify_device(ide_drive_t *drive)
 {
-	struct idetape_id_gcw gcw;
-	struct hd_driveid *id = drive->id;
+	u8 gcw[2], protocol, device_type, removable, packet_size;
 
 	if (drive->id_read == 0)
 		return 1;
 
-	*((unsigned short *) &gcw) = id->config;
+	*((unsigned short *) &gcw) = drive->id->config;
 
-	/* Check that we can support this device */
+	protocol	=   (gcw[1] & 0xC0) >> 6;
+	device_type	=    gcw[1] & 0x1F;
+	removable	= !!(gcw[0] & 0x80);
+	packet_size	=    gcw[0] & 0x3;
 
-	if (gcw.protocol != 2)
+	/* Check that we can support this device */
+	if (protocol != 2)
 		printk(KERN_ERR "ide-tape: Protocol (0x%02x) is not ATAPI\n",
-				gcw.protocol);
-	else if (gcw.device_type != 1)
+				protocol);
+	else if (device_type != 1)
 		printk(KERN_ERR "ide-tape: Device type (0x%02x) is not set "
-				"to tape\n", gcw.device_type);
-	else if (!gcw.removable)
+				"to tape\n", device_type);
+	else if (!removable)
 		printk(KERN_ERR "ide-tape: The removable flag is not set\n");
-	else if (gcw.packet_size != 0) {
+	else if (packet_size != 0) {
 		printk(KERN_ERR "ide-tape: Packet size (0x%02x) is not 12 "
-				"bytes long\n", gcw.packet_size);
+				"bytes long\n", packet_size);
 	} else
 		return 1;
 	return 0;
@@ -3409,8 +3397,8 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 {
 	unsigned long t1, tmid, tn, t;
 	int speed;
-	struct idetape_id_gcw gcw;
 	int stage_size;
+	u8 gcw[2];
 	struct sysinfo si;
 	u16 *ctl = (u16 *)&tape->caps[12];
 
@@ -3433,7 +3421,9 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	tape->max_insert_speed = 10000;
 	tape->speed_control = 1;
 	*((unsigned short *) &gcw) = drive->id->config;
-	if (gcw.drq_type == 1)
+
+	/* Command packet DRQ type */
+	if (((gcw[0] & 0x60) >> 5) == 1)
 		set_bit(IDETAPE_DRQ_INTERRUPT, &tape->flags);
 
 	tape->min_pipeline = tape->max_pipeline = tape->max_stages = 10;
-- 
cgit v1.1


From 1f27e38dd312867295670c29a301fce3f5b5d3b3 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: remove unused "length" arg from
 idetape_create_read_buffer_cmd()

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index aed2559..fc57cd0 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -1497,7 +1497,8 @@ static void idetape_create_read_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsi
 		set_bit(PC_DMA_RECOMMENDED, &pc->flags);
 }
 
-static void idetape_create_read_buffer_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsigned int length, struct idetape_bh *bh)
+static void idetape_create_read_buffer_cmd(idetape_tape_t *tape,
+		idetape_pc_t *pc, struct idetape_bh *bh)
 {
 	int size = 32768;
 	struct idetape_bh *p = bh;
@@ -1515,7 +1516,8 @@ static void idetape_create_read_buffer_cmd(idetape_tape_t *tape, idetape_pc_t *p
 		atomic_set(&p->b_count, 0);
 		p = p->b_reqnext;
 	}
-	pc->request_transfer = pc->buffer_size = size;
+	pc->request_transfer = size;
+	pc->buffer_size = size;
 }
 
 static void idetape_create_write_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsigned int length, struct idetape_bh *bh)
@@ -1625,7 +1627,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	if (rq->cmd[0] & REQ_IDETAPE_READ_BUFFER) {
 		tape->postpone_cnt = 0;
 		pc = idetape_next_pc_storage(drive);
-		idetape_create_read_buffer_cmd(tape, pc, rq->current_nr_sectors, (struct idetape_bh *)rq->special);
+		idetape_create_read_buffer_cmd(tape, pc,
+				(struct idetape_bh *)rq->special);
 		goto out;
 	}
 	if (rq->cmd[0] & REQ_IDETAPE_PC1) {
-- 
cgit v1.1


From c837cfa5b61f0ef92cf2c01f3f48808751f68897 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: include proper headers

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fc57cd0..2d14841 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -39,9 +39,9 @@
 #include <scsi/scsi.h>
 
 #include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
+#include <linux/irq.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
 #include <asm/unaligned.h>
 #include <linux/mtio.h>
 
-- 
cgit v1.1


From 9c14576886bb4e3cfe624c9ec95d980d58a109de Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: collect module-related macro calls at the end

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2d14841..916b422 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -3684,9 +3684,6 @@ failed:
 	return -ENODEV;
 }
 
-MODULE_DESCRIPTION("ATAPI Streaming TAPE Driver");
-MODULE_LICENSE("GPL");
-
 static void __exit idetape_exit (void)
 {
 	driver_unregister(&idetape_driver.gen_driver);
@@ -3729,3 +3726,5 @@ MODULE_ALIAS("ide:*m-tape*");
 module_init(idetape_init);
 module_exit(idetape_exit);
 MODULE_ALIAS_CHARDEV_MAJOR(IDETAPE_MAJOR);
+MODULE_DESCRIPTION("ATAPI Streaming TAPE Driver");
+MODULE_LICENSE("GPL");
-- 
cgit v1.1


From bf6296b68848219f585c597de422621e236afc3c Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: remove leftover OnStream support warning

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 916b422..42f1b65 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -3632,10 +3632,6 @@ static int ide_tape_probe(ide_drive_t *drive)
 		printk("ide-tape: passing drive %s to ide-scsi emulation.\n", drive->name);
 		goto failed;
 	}
-	if (strstr(drive->id->model, "OnStream DI-")) {
-		printk(KERN_WARNING "ide-tape: Use drive %s with ide-scsi emulation and osst.\n", drive->name);
-		printk(KERN_WARNING "ide-tape: OnStream support will be removed soon from ide-tape!\n");
-	}
 	tape = kzalloc(sizeof (idetape_tape_t), GFP_KERNEL);
 	if (tape == NULL) {
 		printk(KERN_ERR "ide-tape: %s: Can't allocate a tape structure\n", drive->name);
-- 
cgit v1.1


From 24d57f8b2880755b3704c110cd431b4dd6b75580 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: fix syntax error in idetape_identify_device()

Spotted by Sergei Shtylyov.

CC: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 42f1b65..0c56abe4 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -3274,8 +3274,8 @@ static int idetape_identify_device(ide_drive_t *drive)
 	else if (!removable)
 		printk(KERN_ERR "ide-tape: The removable flag is not set\n");
 	else if (packet_size != 0) {
-		printk(KERN_ERR "ide-tape: Packet size (0x%02x) is not 12 "
-				"bytes long\n", packet_size);
+		printk(KERN_ERR "ide-tape: Packet size (0x%02x) is not 12"
+				" bytes\n", packet_size);
 	} else
 		return 1;
 	return 0;
-- 
cgit v1.1


From 5a04cfa911f9c3c648240bd95002479d83619260 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:54 +0100
Subject: ide-tape: cleanup the remaining codestyle issues

... thus decreasing checkpatch.pl errors to 0.

Bart:
- remove needless function prototypes while at it
- remove needless parentheses while at it
- add missing KERN_ level to ide_tape_probe()
- other minor fixups

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 832 ++++++++++++++++++++++++++++---------------------
 1 file changed, 480 insertions(+), 352 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 0c56abe4..11e1383 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -506,7 +506,7 @@ enum {
  * The variables below are used for the character device interface. Additional
  * state variables are defined in our ide_drive_t structure.
  */
-static struct ide_tape_obj * idetape_devs[MAX_HWIFS * MAX_DRIVES];
+static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES];
 
 #define ide_tape_f(file) ((file)->private_data)
 
@@ -522,20 +522,18 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static int idetape_chrdev_release (struct inode *inode, struct file *filp);
-static void idetape_write_release (ide_drive_t *drive, unsigned int minor);
-
 /*
  * Too bad. The drive wants to send us data which we are not ready to accept.
  * Just throw it away.
  */
-static void idetape_discard_data (ide_drive_t *drive, unsigned int bcount)
+static void idetape_discard_data(ide_drive_t *drive, unsigned int bcount)
 {
 	while (bcount--)
 		(void) HWIF(drive)->INB(IDE_DATA_REG);
 }
 
-static void idetape_input_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsigned int bcount)
+static void idetape_input_buffers(ide_drive_t *drive, idetape_pc_t *pc,
+				  unsigned int bcount)
 {
 	struct idetape_bh *bh = pc->bh;
 	int count;
@@ -547,8 +545,11 @@ static void idetape_input_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsigne
 			idetape_discard_data(drive, bcount);
 			return;
 		}
-		count = min((unsigned int)(bh->b_size - atomic_read(&bh->b_count)), bcount);
-		HWIF(drive)->atapi_input_bytes(drive, bh->b_data + atomic_read(&bh->b_count), count);
+		count = min(
+			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
+			bcount);
+		HWIF(drive)->atapi_input_bytes(drive, bh->b_data +
+					atomic_read(&bh->b_count), count);
 		bcount -= count;
 		atomic_add(count, &bh->b_count);
 		if (atomic_read(&bh->b_count) == bh->b_size) {
@@ -560,15 +561,16 @@ static void idetape_input_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsigne
 	pc->bh = bh;
 }
 
-static void idetape_output_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsigned int bcount)
+static void idetape_output_buffers(ide_drive_t *drive, idetape_pc_t *pc,
+				   unsigned int bcount)
 {
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
 	while (bcount) {
 		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in "
-				"idetape_output_buffers\n");
+			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
+					__func__);
 			return;
 		}
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
@@ -577,7 +579,8 @@ static void idetape_output_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsign
 		pc->b_data += count;
 		pc->b_count -= count;
 		if (!pc->b_count) {
-			pc->bh = bh = bh->b_reqnext;
+			bh = bh->b_reqnext;
+			pc->bh = bh;
 			if (bh) {
 				pc->b_data = bh->b_data;
 				pc->b_count = atomic_read(&bh->b_count);
@@ -586,7 +589,7 @@ static void idetape_output_buffers (ide_drive_t *drive, idetape_pc_t *pc, unsign
 	}
 }
 
-static void idetape_update_buffers (idetape_pc_t *pc)
+static void idetape_update_buffers(idetape_pc_t *pc)
 {
 	struct idetape_bh *bh = pc->bh;
 	int count;
@@ -596,8 +599,8 @@ static void idetape_update_buffers (idetape_pc_t *pc)
 		return;
 	while (bcount) {
 		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in "
-				"idetape_update_buffers\n");
+			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
+					__func__);
 			return;
 		}
 		count = min((unsigned int)bh->b_size, (unsigned int)bcount);
@@ -615,14 +618,14 @@ static void idetape_update_buffers (idetape_pc_t *pc)
  *	driver. A storage space for a maximum of IDETAPE_PC_STACK packet
  *	commands is allocated at initialization time.
  */
-static idetape_pc_t *idetape_next_pc_storage (ide_drive_t *drive)
+static idetape_pc_t *idetape_next_pc_storage(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
 	debug_log(DBG_PCRQ_STACK, "pc_stack_index=%d\n", tape->pc_stack_index);
 
 	if (tape->pc_stack_index == IDETAPE_PC_STACK)
-		tape->pc_stack_index=0;
+		tape->pc_stack_index = 0;
 	return (&tape->pc_stack[tape->pc_stack_index++]);
 }
 
@@ -631,26 +634,26 @@ static idetape_pc_t *idetape_next_pc_storage (ide_drive_t *drive)
  *	Since we queue packet commands in the request queue, we need to
  *	allocate a request, along with the allocation of a packet command.
  */
- 
+
 /**************************************************************
  *                                                            *
  *  This should get fixed to use kmalloc(.., GFP_ATOMIC)      *
  *  followed later on by kfree().   -ml                       *
  *                                                            *
  **************************************************************/
- 
-static struct request *idetape_next_rq_storage (ide_drive_t *drive)
+
+static struct request *idetape_next_rq_storage(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
 	debug_log(DBG_PCRQ_STACK, "rq_stack_index=%d\n", tape->rq_stack_index);
 
 	if (tape->rq_stack_index == IDETAPE_PC_STACK)
-		tape->rq_stack_index=0;
+		tape->rq_stack_index = 0;
 	return (&tape->rq_stack[tape->rq_stack_index++]);
 }
 
-static void idetape_init_pc (idetape_pc_t *pc)
+static void idetape_init_pc(idetape_pc_t *pc)
 {
 	memset(pc->c, 0, 12);
 	pc->retries = 0;
@@ -746,7 +749,7 @@ static void idetape_activate_next_stage(ide_drive_t *drive)
 }
 
 /* Free a stage along with its related buffers completely. */
-static void __idetape_kfree_stage (idetape_stage_t *stage)
+static void __idetape_kfree_stage(idetape_stage_t *stage)
 {
 	struct idetape_bh *prev_bh, *bh = stage->bh;
 	int size;
@@ -767,7 +770,7 @@ static void __idetape_kfree_stage (idetape_stage_t *stage)
 	kfree(stage);
 }
 
-static void idetape_kfree_stage (idetape_tape_t *tape, idetape_stage_t *stage)
+static void idetape_kfree_stage(idetape_tape_t *tape, idetape_stage_t *stage)
 {
 	__idetape_kfree_stage(stage);
 }
@@ -776,7 +779,7 @@ static void idetape_kfree_stage (idetape_tape_t *tape, idetape_stage_t *stage)
  * Remove tape->first_stage from the pipeline. The caller should avoid race
  * conditions.
  */
-static void idetape_remove_stage_head (ide_drive_t *drive)
+static void idetape_remove_stage_head(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_stage_t *stage;
@@ -799,9 +802,11 @@ static void idetape_remove_stage_head (ide_drive_t *drive)
 	if (tape->first_stage == NULL) {
 		tape->last_stage = NULL;
 		if (tape->next_stage != NULL)
-			printk(KERN_ERR "ide-tape: bug: tape->next_stage != NULL\n");
+			printk(KERN_ERR "ide-tape: bug: tape->next_stage !="
+					" NULL\n");
 		if (tape->nr_stages)
-			printk(KERN_ERR "ide-tape: bug: nr_stages should be 0 now\n");
+			printk(KERN_ERR "ide-tape: bug: nr_stages should be 0 "
+					"now\n");
 	}
 }
 
@@ -847,9 +852,9 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	switch (uptodate) {
-		case 0:	error = IDETAPE_ERROR_GENERAL; break;
-		case 1: error = 0; break;
-		default: error = uptodate;
+	case 0:	error = IDETAPE_ERROR_GENERAL; break;
+	case 1: error = 0; break;
+	default: error = uptodate;
 	}
 	rq->errors = error;
 	if (error)
@@ -873,7 +878,8 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 			if (error) {
 				set_bit(IDETAPE_PIPELINE_ERROR, &tape->flags);
 				if (error == IDETAPE_ERROR_EOD)
-					idetape_abort_pipeline(drive, active_stage);
+					idetape_abort_pipeline(drive,
+								active_stage);
 			}
 		} else if (rq->cmd[0] & REQ_IDETAPE_READ) {
 			if (error == IDETAPE_ERROR_EOD) {
@@ -906,9 +912,6 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 		}
 	}
 	ide_end_drive_cmd(drive, 0, 0);
-//	blkdev_dequeue_request(rq);
-//	drive->rq = NULL;
-//	end_that_request_last(rq);
 
 	if (remove_stage)
 		idetape_remove_stage_head(drive);
@@ -918,7 +921,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects)
 	return 0;
 }
 
-static ide_startstop_t idetape_request_sense_callback (ide_drive_t *drive)
+static ide_startstop_t idetape_request_sense_callback(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -928,15 +931,16 @@ static ide_startstop_t idetape_request_sense_callback (ide_drive_t *drive)
 		idetape_analyze_error(drive, tape->pc->buffer);
 		idetape_end_request(drive, 1, 0);
 	} else {
-		printk(KERN_ERR "ide-tape: Error in REQUEST SENSE itself - Aborting request!\n");
+		printk(KERN_ERR "ide-tape: Error in REQUEST SENSE itself - "
+				"Aborting request!\n");
 		idetape_end_request(drive, 0, 0);
 	}
 	return ide_stopped;
 }
 
-static void idetape_create_request_sense_cmd (idetape_pc_t *pc)
+static void idetape_create_request_sense_cmd(idetape_pc_t *pc)
 {
-	idetape_init_pc(pc);	
+	idetape_init_pc(pc);
 	pc->c[0] = REQUEST_SENSE;
 	pc->c[4] = 20;
 	pc->request_transfer = 20;
@@ -965,7 +969,8 @@ static void idetape_init_rq(struct request *rq, u8 cmd)
  * handling functions should queue request to the lower level part and wait for
  * their completion using idetape_queue_pc_tail or idetape_queue_rw_tail.
  */
-static void idetape_queue_pc_head (ide_drive_t *drive, idetape_pc_t *pc,struct request *rq)
+static void idetape_queue_pc_head(ide_drive_t *drive, idetape_pc_t *pc,
+				  struct request *rq)
 {
 	struct ide_tape_obj *tape = drive->driver_data;
 
@@ -999,7 +1004,7 @@ static ide_startstop_t idetape_retry_pc (ide_drive_t *drive)
  * Postpone the current request so that ide.c will be able to service requests
  * from another device on the same hwgroup while we are polling for DSC.
  */
-static void idetape_postpone_request (ide_drive_t *drive)
+static void idetape_postpone_request(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -1027,7 +1032,7 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive)
 	idetape_io_buf *iobuf;
 	unsigned int temp;
 #if SIMULATE_ERRORS
-	static int error_sim_count = 0;
+	static int error_sim_count;
 #endif
 	u16 bcount;
 	u8 stat, ireason;
@@ -1228,8 +1233,9 @@ static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive)
 	ide_startstop_t startstop;
 	u8 ireason;
 
-	if (ide_wait_stat(&startstop,drive,DRQ_STAT,BUSY_STAT,WAIT_READY)) {
-		printk(KERN_ERR "ide-tape: Strange, packet command initiated yet DRQ isn't asserted\n");
+	if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) {
+		printk(KERN_ERR "ide-tape: Strange, packet command initiated "
+				"yet DRQ isn't asserted\n");
 		return startstop;
 	}
 	ireason = hwif->INB(IDE_IREASON_REG);
@@ -1336,7 +1342,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, idetape_pc_t *pc)
 	}
 }
 
-static ide_startstop_t idetape_pc_callback (ide_drive_t *drive)
+static ide_startstop_t idetape_pc_callback(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -1347,7 +1353,7 @@ static ide_startstop_t idetape_pc_callback (ide_drive_t *drive)
 }
 
 /* A mode sense command is used to "sense" tape parameters. */
-static void idetape_create_mode_sense_cmd (idetape_pc_t *pc, u8 page_code)
+static void idetape_create_mode_sense_cmd(idetape_pc_t *pc, u8 page_code)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = MODE_SENSE;
@@ -1378,39 +1384,56 @@ static void idetape_calculate_speeds(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	if (time_after(jiffies, tape->controlled_pipeline_head_time + 120 * HZ)) {
-		tape->controlled_previous_pipeline_head = tape->controlled_last_pipeline_head;
-		tape->controlled_previous_head_time = tape->controlled_pipeline_head_time;
+	if (time_after(jiffies,
+			tape->controlled_pipeline_head_time + 120 * HZ)) {
+		tape->controlled_previous_pipeline_head =
+			tape->controlled_last_pipeline_head;
+		tape->controlled_previous_head_time =
+			tape->controlled_pipeline_head_time;
 		tape->controlled_last_pipeline_head = tape->pipeline_head;
 		tape->controlled_pipeline_head_time = jiffies;
 	}
 	if (time_after(jiffies, tape->controlled_pipeline_head_time + 60 * HZ))
-		tape->controlled_pipeline_head_speed = (tape->pipeline_head - tape->controlled_last_pipeline_head) * 32 * HZ / (jiffies - tape->controlled_pipeline_head_time);
+		tape->controlled_pipeline_head_speed = (tape->pipeline_head -
+				tape->controlled_last_pipeline_head) * 32 * HZ /
+				(jiffies - tape->controlled_pipeline_head_time);
 	else if (time_after(jiffies, tape->controlled_previous_head_time))
-		tape->controlled_pipeline_head_speed = (tape->pipeline_head - tape->controlled_previous_pipeline_head) * 32 * HZ / (jiffies - tape->controlled_previous_head_time);
+		tape->controlled_pipeline_head_speed = (tape->pipeline_head -
+				tape->controlled_previous_pipeline_head) * 32 *
+			HZ / (jiffies - tape->controlled_previous_head_time);
 
-	if (tape->nr_pending_stages < tape->max_stages /*- 1 */) {
+	if (tape->nr_pending_stages < tape->max_stages/*- 1 */) {
 		/* -1 for read mode error recovery */
-		if (time_after(jiffies, tape->uncontrolled_previous_head_time + 10 * HZ)) {
+		if (time_after(jiffies, tape->uncontrolled_previous_head_time +
+					10 * HZ)) {
 			tape->uncontrolled_pipeline_head_time = jiffies;
-			tape->uncontrolled_pipeline_head_speed = (tape->pipeline_head - tape->uncontrolled_previous_pipeline_head) * 32 * HZ / (jiffies - tape->uncontrolled_previous_head_time);
+			tape->uncontrolled_pipeline_head_speed =
+				(tape->pipeline_head -
+				 tape->uncontrolled_previous_pipeline_head) *
+				32 * HZ / (jiffies -
+					tape->uncontrolled_previous_head_time);
 		}
 	} else {
 		tape->uncontrolled_previous_head_time = jiffies;
 		tape->uncontrolled_previous_pipeline_head = tape->pipeline_head;
-		if (time_after(jiffies, tape->uncontrolled_pipeline_head_time + 30 * HZ)) {
+		if (time_after(jiffies, tape->uncontrolled_pipeline_head_time +
+					30 * HZ))
 			tape->uncontrolled_pipeline_head_time = jiffies;
-		}
+
 	}
-	tape->pipeline_head_speed = max(tape->uncontrolled_pipeline_head_speed, tape->controlled_pipeline_head_speed);
+	tape->pipeline_head_speed = max(tape->uncontrolled_pipeline_head_speed,
+					tape->controlled_pipeline_head_speed);
 
 	if (tape->speed_control == 1) {
 		if (tape->nr_pending_stages >= tape->max_stages / 2)
 			tape->max_insert_speed = tape->pipeline_head_speed +
-				(1100 - tape->pipeline_head_speed) * 2 * (tape->nr_pending_stages - tape->max_stages / 2) / tape->max_stages;
+				(1100 - tape->pipeline_head_speed) * 2 *
+				(tape->nr_pending_stages - tape->max_stages / 2)
+				/ tape->max_stages;
 		else
 			tape->max_insert_speed = 500 +
-				(tape->pipeline_head_speed - 500) * 2 * tape->nr_pending_stages / tape->max_stages;
+				(tape->pipeline_head_speed - 500) * 2 *
+				tape->nr_pending_stages / tape->max_stages;
 
 		if (tape->nr_pending_stages >= tape->max_stages * 99 / 100)
 			tape->max_insert_speed = 5000;
@@ -1420,7 +1443,7 @@ static void idetape_calculate_speeds(ide_drive_t *drive)
 	tape->max_insert_speed = max(tape->max_insert_speed, 500);
 }
 
-static ide_startstop_t idetape_media_access_finished (ide_drive_t *drive)
+static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t *pc = tape->pc;
@@ -1447,7 +1470,7 @@ static ide_startstop_t idetape_media_access_finished (ide_drive_t *drive)
 	return pc->callback(drive);
 }
 
-static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
+static ide_startstop_t idetape_rw_callback(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	struct request *rq = HWGROUP(drive)->rq;
@@ -1463,9 +1486,11 @@ static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
 		tape->insert_size = 0;
 	}
 	if (time_after(jiffies, tape->insert_time))
-		tape->insert_speed = tape->insert_size / 1024 * HZ / (jiffies - tape->insert_time);
+		tape->insert_speed = tape->insert_size / 1024 * HZ /
+					(jiffies - tape->insert_time);
 	if (time_after_eq(jiffies, tape->avg_time + HZ)) {
-		tape->avg_speed = tape->avg_size * HZ / (jiffies - tape->avg_time) / 1024;
+		tape->avg_speed = tape->avg_size * HZ /
+				(jiffies - tape->avg_time) / 1024;
 		tape->avg_size = 0;
 		tape->avg_time = jiffies;
 	}
@@ -1481,7 +1506,8 @@ static ide_startstop_t idetape_rw_callback (ide_drive_t *drive)
 	return ide_stopped;
 }
 
-static void idetape_create_read_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsigned int length, struct idetape_bh *bh)
+static void idetape_create_read_cmd(idetape_tape_t *tape, idetape_pc_t *pc,
+		unsigned int length, struct idetape_bh *bh)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = READ_6;
@@ -1520,7 +1546,8 @@ static void idetape_create_read_buffer_cmd(idetape_tape_t *tape,
 	pc->buffer_size = size;
 }
 
-static void idetape_create_write_cmd(idetape_tape_t *tape, idetape_pc_t *pc, unsigned int length, struct idetape_bh *bh)
+static void idetape_create_write_cmd(idetape_tape_t *tape, idetape_pc_t *pc,
+		unsigned int length, struct idetape_bh *bh)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = WRITE_6;
@@ -1559,10 +1586,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	}
 
 	/* Retry a failed packet command */
-	if (tape->failed_pc != NULL &&
-	    tape->pc->c[0] == REQUEST_SENSE) {
+	if (tape->failed_pc && tape->pc->c[0] == REQUEST_SENSE)
 		return idetape_issue_pc(drive, tape->failed_pc);
-	}
+
 	if (postponed_rq != NULL)
 		if (rq != postponed_rq) {
 			printk(KERN_ERR "ide-tape: ide-tape.c bug - "
@@ -1588,7 +1614,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	}
 
 	if (time_after(jiffies, tape->insert_time))
-		tape->insert_speed = tape->insert_size / 1024 * HZ / (jiffies - tape->insert_time);
+		tape->insert_speed = tape->insert_size / 1024 * HZ /
+					(jiffies - tape->insert_time);
 	idetape_calculate_speeds(drive);
 	if (!test_and_clear_bit(IDETAPE_IGNORE_DSC, &tape->flags) &&
 	    (stat & SEEK_STAT) == 0) {
@@ -1605,7 +1632,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			} else {
 				return ide_do_reset(drive);
 			}
-		} else if (time_after(jiffies, tape->dsc_polling_start + IDETAPE_DSC_MA_THRESHOLD))
+		} else if (time_after(jiffies,
+					tape->dsc_polling_start +
+					IDETAPE_DSC_MA_THRESHOLD))
 			tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW;
 		idetape_postpone_request(drive);
 		return ide_stopped;
@@ -1614,14 +1643,16 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		tape->buffer_head++;
 		tape->postpone_cnt = 0;
 		pc = idetape_next_pc_storage(drive);
-		idetape_create_read_cmd(tape, pc, rq->current_nr_sectors, (struct idetape_bh *)rq->special);
+		idetape_create_read_cmd(tape, pc, rq->current_nr_sectors,
+					(struct idetape_bh *)rq->special);
 		goto out;
 	}
 	if (rq->cmd[0] & REQ_IDETAPE_WRITE) {
 		tape->buffer_head++;
 		tape->postpone_cnt = 0;
 		pc = idetape_next_pc_storage(drive);
-		idetape_create_write_cmd(tape, pc, rq->current_nr_sectors, (struct idetape_bh *)rq->special);
+		idetape_create_write_cmd(tape, pc, rq->current_nr_sectors,
+					 (struct idetape_bh *)rq->special);
 		goto out;
 	}
 	if (rq->cmd[0] & REQ_IDETAPE_READ_BUFFER) {
@@ -1647,7 +1678,7 @@ out:
 }
 
 /* Pipeline related functions */
-static inline int idetape_pipeline_active (idetape_tape_t *tape)
+static inline int idetape_pipeline_active(idetape_tape_t *tape)
 {
 	int rc1, rc2;
 
@@ -1668,22 +1699,26 @@ static inline int idetape_pipeline_active (idetape_tape_t *tape)
  * Pipeline stages are optional and are used to increase performance. If we
  * can't allocate them, we'll manage without them.
  */
-static idetape_stage_t *__idetape_kmalloc_stage (idetape_tape_t *tape, int full, int clear)
+static idetape_stage_t *__idetape_kmalloc_stage(idetape_tape_t *tape, int full,
+						int clear)
 {
 	idetape_stage_t *stage;
 	struct idetape_bh *prev_bh, *bh;
 	int pages = tape->pages_per_stage;
 	char *b_data = NULL;
 
-	if ((stage = kmalloc(sizeof (idetape_stage_t),GFP_KERNEL)) == NULL)
+	stage = kmalloc(sizeof(idetape_stage_t), GFP_KERNEL);
+	if (!stage)
 		return NULL;
 	stage->next = NULL;
 
-	bh = stage->bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	stage->bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	bh = stage->bh;
 	if (bh == NULL)
 		goto abort;
 	bh->b_reqnext = NULL;
-	if ((bh->b_data = (char *) __get_free_page (GFP_KERNEL)) == NULL)
+	bh->b_data = (char *) __get_free_page(GFP_KERNEL);
+	if (!bh->b_data)
 		goto abort;
 	if (clear)
 		memset(bh->b_data, 0, PAGE_SIZE);
@@ -1691,7 +1726,8 @@ static idetape_stage_t *__idetape_kmalloc_stage (idetape_tape_t *tape, int full,
 	atomic_set(&bh->b_count, full ? bh->b_size : 0);
 
 	while (--pages) {
-		if ((b_data = (char *) __get_free_page (GFP_KERNEL)) == NULL)
+		b_data = (char *) __get_free_page(GFP_KERNEL);
+		if (!b_data)
 			goto abort;
 		if (clear)
 			memset(b_data, 0, PAGE_SIZE);
@@ -1709,7 +1745,8 @@ static idetape_stage_t *__idetape_kmalloc_stage (idetape_tape_t *tape, int full,
 			continue;
 		}
 		prev_bh = bh;
-		if ((bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL)) == NULL) {
+		bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+		if (!bh) {
 			free_page((unsigned long) b_data);
 			goto abort;
 		}
@@ -1728,7 +1765,7 @@ abort:
 	return NULL;
 }
 
-static idetape_stage_t *idetape_kmalloc_stage (idetape_tape_t *tape)
+static idetape_stage_t *idetape_kmalloc_stage(idetape_tape_t *tape)
 {
 	idetape_stage_t *cache_stage = tape->cache_stage;
 
@@ -1743,7 +1780,8 @@ static idetape_stage_t *idetape_kmalloc_stage (idetape_tape_t *tape)
 	return __idetape_kmalloc_stage(tape, 0, 0);
 }
 
-static int idetape_copy_stage_from_user (idetape_tape_t *tape, idetape_stage_t *stage, const char __user *buf, int n)
+static int idetape_copy_stage_from_user(idetape_tape_t *tape,
+		idetape_stage_t *stage, const char __user *buf, int n)
 {
 	struct idetape_bh *bh = tape->bh;
 	int count;
@@ -1751,12 +1789,15 @@ static int idetape_copy_stage_from_user (idetape_tape_t *tape, idetape_stage_t *
 
 	while (n) {
 		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in "
-				"idetape_copy_stage_from_user\n");
+			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
+					__func__);
 			return 1;
 		}
-		count = min((unsigned int)(bh->b_size - atomic_read(&bh->b_count)), (unsigned int)n);
-		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf, count))
+		count = min((unsigned int)
+				(bh->b_size - atomic_read(&bh->b_count)),
+				(unsigned int)n);
+		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
+				count))
 			ret = 1;
 		n -= count;
 		atomic_add(count, &bh->b_count);
@@ -1771,7 +1812,8 @@ static int idetape_copy_stage_from_user (idetape_tape_t *tape, idetape_stage_t *
 	return ret;
 }
 
-static int idetape_copy_stage_to_user (idetape_tape_t *tape, char __user *buf, idetape_stage_t *stage, int n)
+static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
+		idetape_stage_t *stage, int n)
 {
 	struct idetape_bh *bh = tape->bh;
 	int count;
@@ -1779,8 +1821,8 @@ static int idetape_copy_stage_to_user (idetape_tape_t *tape, char __user *buf, i
 
 	while (n) {
 		if (bh == NULL) {
-			printk(KERN_ERR "ide-tape: bh == NULL in "
-				"idetape_copy_stage_to_user\n");
+			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
+					__func__);
 			return 1;
 		}
 		count = min(tape->b_count, n);
@@ -1791,7 +1833,8 @@ static int idetape_copy_stage_to_user (idetape_tape_t *tape, char __user *buf, i
 		tape->b_count -= count;
 		buf += count;
 		if (!tape->b_count) {
-			tape->bh = bh = bh->b_reqnext;
+			bh = bh->b_reqnext;
+			tape->bh = bh;
 			if (bh) {
 				tape->b_data = bh->b_data;
 				tape->b_count = atomic_read(&bh->b_count);
@@ -1801,10 +1844,10 @@ static int idetape_copy_stage_to_user (idetape_tape_t *tape, char __user *buf, i
 	return ret;
 }
 
-static void idetape_init_merge_stage (idetape_tape_t *tape)
+static void idetape_init_merge_stage(idetape_tape_t *tape)
 {
 	struct idetape_bh *bh = tape->merge_stage->bh;
-	
+
 	tape->bh = bh;
 	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
 		atomic_set(&bh->b_count, 0);
@@ -1814,7 +1857,7 @@ static void idetape_init_merge_stage (idetape_tape_t *tape)
 	}
 }
 
-static void idetape_switch_buffers (idetape_tape_t *tape, idetape_stage_t *stage)
+static void idetape_switch_buffers(idetape_tape_t *tape, idetape_stage_t *stage)
 {
 	struct idetape_bh *tmp;
 
@@ -1825,7 +1868,7 @@ static void idetape_switch_buffers (idetape_tape_t *tape, idetape_stage_t *stage
 }
 
 /* Add a new stage at the end of the pipeline. */
-static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
+static void idetape_add_stage_tail(ide_drive_t *drive, idetape_stage_t *stage)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
@@ -1835,9 +1878,10 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
 	spin_lock_irqsave(&tape->lock, flags);
 	stage->next = NULL;
 	if (tape->last_stage != NULL)
-		tape->last_stage->next=stage;
+		tape->last_stage->next = stage;
 	else
-		tape->first_stage = tape->next_stage=stage;
+		tape->first_stage = stage;
+		tape->next_stage  = stage;
 	tape->last_stage = stage;
 	if (tape->next_stage == NULL)
 		tape->next_stage = tape->last_stage;
@@ -1850,13 +1894,14 @@ static void idetape_add_stage_tail (ide_drive_t *drive,idetape_stage_t *stage)
  * caller should ensure that the request will not be serviced before we install
  * the completion (usually by disabling interrupts).
  */
-static void idetape_wait_for_request (ide_drive_t *drive, struct request *rq)
+static void idetape_wait_for_request(ide_drive_t *drive, struct request *rq)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	idetape_tape_t *tape = drive->driver_data;
 
 	if (rq == NULL || !blk_special_request(rq)) {
-		printk (KERN_ERR "ide-tape: bug: Trying to sleep on non-valid request\n");
+		printk(KERN_ERR "ide-tape: bug: Trying to sleep on non-valid"
+				 " request\n");
 		return;
 	}
 	rq->end_io_data = &wait;
@@ -1905,7 +1950,8 @@ static ide_startstop_t idetape_read_position_callback(ide_drive_t *drive)
  * Write a filemark if write_filemark=1. Flush the device buffers without
  * writing a filemark otherwise.
  */
-static void idetape_create_write_filemark_cmd (ide_drive_t *drive, idetape_pc_t *pc,int write_filemark)
+static void idetape_create_write_filemark_cmd(ide_drive_t *drive,
+		idetape_pc_t *pc, int write_filemark)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = WRITE_FILEMARKS;
@@ -1934,7 +1980,7 @@ static void idetape_create_test_unit_ready_cmd(idetape_pc_t *pc)
  * to the request list without waiting for it to be serviced! In that case, we
  * usually use idetape_queue_pc_head().
  */
-static int __idetape_queue_pc_tail (ide_drive_t *drive, idetape_pc_t *pc)
+static int __idetape_queue_pc_tail(ide_drive_t *drive, idetape_pc_t *pc)
 {
 	struct ide_tape_obj *tape = drive->driver_data;
 	struct request rq;
@@ -1945,7 +1991,8 @@ static int __idetape_queue_pc_tail (ide_drive_t *drive, idetape_pc_t *pc)
 	return ide_do_drive_cmd(drive, &rq, ide_wait);
 }
 
-static void idetape_create_load_unload_cmd (ide_drive_t *drive, idetape_pc_t *pc,int cmd)
+static void idetape_create_load_unload_cmd(ide_drive_t *drive, idetape_pc_t *pc,
+		int cmd)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = START_STOP;
@@ -1972,7 +2019,8 @@ static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
 			/* no media */
 			if (load_attempted)
 				return -ENOMEDIUM;
-			idetape_create_load_unload_cmd(drive, &pc, IDETAPE_LU_LOAD_MASK);
+			idetape_create_load_unload_cmd(drive, &pc,
+							IDETAPE_LU_LOAD_MASK);
 			__idetape_queue_pc_tail(drive, &pc);
 			load_attempted = 1;
 		/* not about to be ready */
@@ -1984,24 +2032,25 @@ static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
 	return -EIO;
 }
 
-static int idetape_queue_pc_tail (ide_drive_t *drive,idetape_pc_t *pc)
+static int idetape_queue_pc_tail(ide_drive_t *drive, idetape_pc_t *pc)
 {
 	return __idetape_queue_pc_tail(drive, pc);
 }
 
-static int idetape_flush_tape_buffers (ide_drive_t *drive)
+static int idetape_flush_tape_buffers(ide_drive_t *drive)
 {
 	idetape_pc_t pc;
 	int rc;
 
 	idetape_create_write_filemark_cmd(drive, &pc, 0);
-	if ((rc = idetape_queue_pc_tail(drive, &pc)))
+	rc = idetape_queue_pc_tail(drive, &pc);
+	if (rc)
 		return rc;
 	idetape_wait_ready(drive, 60 * 5 * HZ);
 	return 0;
 }
 
-static void idetape_create_read_position_cmd (idetape_pc_t *pc)
+static void idetape_create_read_position_cmd(idetape_pc_t *pc)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = READ_POSITION;
@@ -2009,7 +2058,7 @@ static void idetape_create_read_position_cmd (idetape_pc_t *pc)
 	pc->callback = &idetape_read_position_callback;
 }
 
-static int idetape_read_position (ide_drive_t *drive)
+static int idetape_read_position(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t pc;
@@ -2024,7 +2073,8 @@ static int idetape_read_position (ide_drive_t *drive)
 	return position;
 }
 
-static void idetape_create_locate_cmd (ide_drive_t *drive, idetape_pc_t *pc, unsigned int block, u8 partition, int skip)
+static void idetape_create_locate_cmd(ide_drive_t *drive, idetape_pc_t *pc,
+		unsigned int block, u8 partition, int skip)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = POSITION_TO_ELEMENT;
@@ -2035,7 +2085,8 @@ static void idetape_create_locate_cmd (ide_drive_t *drive, idetape_pc_t *pc, uns
 	pc->callback = &idetape_pc_callback;
 }
 
-static int idetape_create_prevent_cmd (ide_drive_t *drive, idetape_pc_t *pc, int prevent)
+static int idetape_create_prevent_cmd(ide_drive_t *drive, idetape_pc_t *pc,
+				      int prevent)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -2050,7 +2101,7 @@ static int idetape_create_prevent_cmd (ide_drive_t *drive, idetape_pc_t *pc, int
 	return 1;
 }
 
-static int __idetape_discard_read_pipeline (ide_drive_t *drive)
+static int __idetape_discard_read_pipeline(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
@@ -2086,7 +2137,7 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
 	while (tape->first_stage != NULL) {
 		struct request *rq_ptr = &tape->first_stage->rq;
 
-		cnt += rq_ptr->nr_sectors - rq_ptr->current_nr_sectors; 
+		cnt += rq_ptr->nr_sectors - rq_ptr->current_nr_sectors;
 		if (rq_ptr->errors == IDETAPE_ERROR_FILEMARK)
 			++cnt;
 		idetape_remove_stage_head(drive);
@@ -2102,7 +2153,8 @@ static int __idetape_discard_read_pipeline (ide_drive_t *drive)
  * all higher level operations, we queue the commands at the tail of the request
  * queue and wait for their completion.
  */
-static int idetape_position_tape (ide_drive_t *drive, unsigned int block, u8 partition, int skip)
+static int idetape_position_tape(ide_drive_t *drive, unsigned int block,
+		u8 partition, int skip)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int retval;
@@ -2120,7 +2172,8 @@ static int idetape_position_tape (ide_drive_t *drive, unsigned int block, u8 par
 	return (idetape_queue_pc_tail(drive, &pc));
 }
 
-static void idetape_discard_read_pipeline (ide_drive_t *drive, int restore_position)
+static void idetape_discard_read_pipeline(ide_drive_t *drive,
+					  int restore_position)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int cnt;
@@ -2131,7 +2184,8 @@ static void idetape_discard_read_pipeline (ide_drive_t *drive, int restore_posit
 		position = idetape_read_position(drive);
 		seek = position > cnt ? position - cnt : 0;
 		if (idetape_position_tape(drive, seek, 0, 0)) {
-			printk(KERN_INFO "ide-tape: %s: position_tape failed in discard_pipeline()\n", tape->name);
+			printk(KERN_INFO "ide-tape: %s: position_tape failed in"
+					 " discard_pipeline()\n", tape->name);
 			return;
 		}
 	}
@@ -2141,7 +2195,8 @@ static void idetape_discard_read_pipeline (ide_drive_t *drive, int restore_posit
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
+				 struct idetape_bh *bh)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	struct request rq;
@@ -2158,7 +2213,8 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct
 	rq.rq_disk = tape->disk;
 	rq.special = (void *)bh;
 	rq.sector = tape->first_frame;
-	rq.nr_sectors = rq.current_nr_sectors = blocks;
+	rq.nr_sectors		= blocks;
+	rq.current_nr_sectors	= blocks;
 	(void) ide_do_drive_cmd(drive, &rq, ide_wait);
 
 	if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
@@ -2185,15 +2241,16 @@ static void idetape_plug_pipeline(ide_drive_t *drive)
 	}
 }
 
-static void idetape_create_inquiry_cmd (idetape_pc_t *pc)
+static void idetape_create_inquiry_cmd(idetape_pc_t *pc)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = INQUIRY;
-	pc->c[4] = pc->request_transfer = 254;
+	pc->c[4] = 254;
+	pc->request_transfer = 254;
 	pc->callback = &idetape_pc_callback;
 }
 
-static void idetape_create_rewind_cmd (ide_drive_t *drive, idetape_pc_t *pc)
+static void idetape_create_rewind_cmd(ide_drive_t *drive, idetape_pc_t *pc)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = REZERO_UNIT;
@@ -2201,7 +2258,7 @@ static void idetape_create_rewind_cmd (ide_drive_t *drive, idetape_pc_t *pc)
 	pc->callback = &idetape_pc_callback;
 }
 
-static void idetape_create_erase_cmd (idetape_pc_t *pc)
+static void idetape_create_erase_cmd(idetape_pc_t *pc)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = ERASE;
@@ -2210,7 +2267,7 @@ static void idetape_create_erase_cmd (idetape_pc_t *pc)
 	pc->callback = &idetape_pc_callback;
 }
 
-static void idetape_create_space_cmd (idetape_pc_t *pc,int count, u8 cmd)
+static void idetape_create_space_cmd(idetape_pc_t *pc, int count, u8 cmd)
 {
 	idetape_init_pc(pc);
 	pc->c[0] = SPACE;
@@ -2220,7 +2277,7 @@ static void idetape_create_space_cmd (idetape_pc_t *pc,int count, u8 cmd)
 	pc->callback = &idetape_pc_callback;
 }
 
-static void idetape_wait_first_stage (ide_drive_t *drive)
+static void idetape_wait_first_stage(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
@@ -2244,7 +2301,7 @@ static void idetape_wait_first_stage (ide_drive_t *drive)
  * 3. If we still can't allocate a stage, fallback to non-pipelined operation
  * mode for this request.
  */
-static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_stage_t *new_stage;
@@ -2268,14 +2325,16 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
 			 * The machine is short on memory. Fallback to non-
 			 * pipelined operation mode for this request.
 			 */
-			return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, tape->merge_stage->bh);
+			return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+						blocks, tape->merge_stage->bh);
 		}
 	}
 	rq = &new_stage->rq;
 	idetape_init_rq(rq, REQ_IDETAPE_WRITE);
 	/* Doesn't actually matter - We always assume sequential access */
 	rq->sector = tape->first_frame;
-	rq->nr_sectors = rq->current_nr_sectors = blocks;
+	rq->current_nr_sectors = blocks;
+	rq->nr_sectors = blocks;
 
 	idetape_switch_buffers(tape, new_stage);
 	idetape_add_stage_tail(drive, new_stage);
@@ -2311,7 +2370,7 @@ static int idetape_add_chrdev_write_request (ide_drive_t *drive, int blocks)
  * Wait until all pending pipeline requests are serviced. Typically called on
  * device close.
  */
-static void idetape_wait_for_pipeline (ide_drive_t *drive)
+static void idetape_wait_for_pipeline(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
@@ -2325,14 +2384,15 @@ static void idetape_wait_for_pipeline (ide_drive_t *drive)
 	}
 }
 
-static void idetape_empty_write_pipeline (ide_drive_t *drive)
+static void idetape_empty_write_pipeline(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int blocks, min;
 	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		printk(KERN_ERR "ide-tape: bug: Trying to empty write pipeline, but we are not writing.\n");
+		printk(KERN_ERR "ide-tape: bug: Trying to empty write pipeline,"
+				" but we are not writing.\n");
 		return;
 	}
 	if (tape->merge_stage_size > tape->stage_size) {
@@ -2355,12 +2415,14 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 			bh = tape->bh;
 			while (i) {
 				if (bh == NULL) {
-
-					printk(KERN_INFO "ide-tape: bug, bh NULL\n");
+					printk(KERN_INFO "ide-tape: bug,"
+							 " bh NULL\n");
 					break;
 				}
-				min = min(i, (unsigned int)(bh->b_size - atomic_read(&bh->b_count)));
-				memset(bh->b_data + atomic_read(&bh->b_count), 0, min);
+				min = min(i, (unsigned int)(bh->b_size -
+						atomic_read(&bh->b_count)));
+				memset(bh->b_data + atomic_read(&bh->b_count),
+						0, min);
 				atomic_add(min, &bh->b_count);
 				i -= min;
 				bh = bh->b_reqnext;
@@ -2396,18 +2458,22 @@ static void idetape_empty_write_pipeline (ide_drive_t *drive)
 	}
 }
 
-static void idetape_restart_speed_control (ide_drive_t *drive)
+static void idetape_restart_speed_control(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
 	tape->restart_speed_control_req = 0;
 	tape->pipeline_head = 0;
 	tape->controlled_last_pipeline_head = 0;
-	tape->controlled_previous_pipeline_head = tape->uncontrolled_previous_pipeline_head = 0;
-	tape->pipeline_head_speed = tape->controlled_pipeline_head_speed = 5000;
+	tape->controlled_previous_pipeline_head = 0;
+	tape->uncontrolled_previous_pipeline_head = 0;
+	tape->controlled_pipeline_head_speed = 5000;
+	tape->pipeline_head_speed = 5000;
 	tape->uncontrolled_pipeline_head_speed = 0;
-	tape->controlled_pipeline_head_time = tape->uncontrolled_pipeline_head_time = jiffies;
-	tape->controlled_previous_head_time = tape->uncontrolled_previous_head_time = jiffies;
+	tape->controlled_pipeline_head_time =
+		tape->uncontrolled_pipeline_head_time = jiffies;
+	tape->controlled_previous_head_time =
+		tape->uncontrolled_previous_head_time = jiffies;
 }
 
 static int idetape_init_read(ide_drive_t *drive, int max_stages)
@@ -2425,10 +2491,12 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
 			idetape_flush_tape_buffers(drive);
 		}
 		if (tape->merge_stage || tape->merge_stage_size) {
-			printk (KERN_ERR "ide-tape: merge_stage_size should be 0 now\n");
+			printk(KERN_ERR "ide-tape: merge_stage_size should be"
+					 " 0 now\n");
 			tape->merge_stage_size = 0;
 		}
-		if ((tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0)) == NULL)
+		tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0);
+		if (!tape->merge_stage)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
 
@@ -2439,7 +2507,9 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
 		 * drives (Seagate STT3401A) will return an error.
 		 */
 		if (drive->dsc_overlap) {
-			bytes_read = idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, 0, tape->merge_stage->bh);
+			bytes_read = idetape_queue_rw_tail(drive,
+							REQ_IDETAPE_READ, 0,
+							tape->merge_stage->bh);
 			if (bytes_read < 0) {
 				__idetape_kfree_stage(tape->merge_stage);
 				tape->merge_stage = NULL;
@@ -2452,7 +2522,8 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
 		idetape_restart_speed_control(drive);
 	idetape_init_rq(&rq, REQ_IDETAPE_READ);
 	rq.sector = tape->first_frame;
-	rq.nr_sectors = rq.current_nr_sectors = blocks;
+	rq.nr_sectors = blocks;
+	rq.current_nr_sectors = blocks;
 	if (!test_bit(IDETAPE_PIPELINE_ERROR, &tape->flags) &&
 	    tape->nr_stages < max_stages) {
 		new_stage = idetape_kmalloc_stage(tape);
@@ -2480,7 +2551,7 @@ static int idetape_init_read(ide_drive_t *drive, int max_stages)
  * Called from idetape_chrdev_read() to service a character device read request
  * and add read-ahead requests to our pipeline.
  */
-static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
+static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	unsigned long flags;
@@ -2505,8 +2576,8 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 	rq_ptr = &tape->first_stage->rq;
 	bytes_read = tape->blk_size * (rq_ptr->nr_sectors -
 					rq_ptr->current_nr_sectors);
-	rq_ptr->nr_sectors = rq_ptr->current_nr_sectors = 0;
-
+	rq_ptr->nr_sectors = 0;
+	rq_ptr->current_nr_sectors = 0;
 
 	if (rq_ptr->errors == IDETAPE_ERROR_EOD)
 		return 0;
@@ -2521,13 +2592,14 @@ static int idetape_add_chrdev_read_request (ide_drive_t *drive,int blocks)
 		idetape_calculate_speeds(drive);
 	}
 	if (bytes_read > blocks * tape->blk_size) {
-		printk(KERN_ERR "ide-tape: bug: trying to return more bytes than requested\n");
+		printk(KERN_ERR "ide-tape: bug: trying to return more bytes"
+				" than requested\n");
 		bytes_read = blocks * tape->blk_size;
 	}
 	return (bytes_read);
 }
 
-static void idetape_pad_zeros (ide_drive_t *drive, int bcount)
+static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	struct idetape_bh *bh;
@@ -2541,16 +2613,18 @@ static void idetape_pad_zeros (ide_drive_t *drive, int bcount)
 		bcount -= count;
 		blocks = count / tape->blk_size;
 		while (count) {
-			atomic_set(&bh->b_count, min(count, (unsigned int)bh->b_size));
+			atomic_set(&bh->b_count,
+				   min(count, (unsigned int)bh->b_size));
 			memset(bh->b_data, 0, atomic_read(&bh->b_count));
 			count -= atomic_read(&bh->b_count);
 			bh = bh->b_reqnext;
 		}
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, tape->merge_stage->bh);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
+				      tape->merge_stage->bh);
 	}
 }
 
-static int idetape_pipeline_size (ide_drive_t *drive)
+static int idetape_pipeline_size(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_stage_t *stage;
@@ -2575,7 +2649,7 @@ static int idetape_pipeline_size (ide_drive_t *drive)
  * Rewinds the tape to the Beginning Of the current Partition (BOP). We
  * currently support only one partition.
  */
-static int idetape_rewind_tape (ide_drive_t *drive)
+static int idetape_rewind_tape(ide_drive_t *drive)
 {
 	int retval;
 	idetape_pc_t pc;
@@ -2597,7 +2671,8 @@ static int idetape_rewind_tape (ide_drive_t *drive)
 }
 
 /* mtio.h compatible commands should be issued to the chrdev interface. */
-static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned long arg)
+static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd,
+				unsigned long arg)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	void __user *argp = (void __user *)arg;
@@ -2611,20 +2686,20 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned l
 	debug_log(DBG_PROCS, "Enter %s\n", __func__);
 
 	switch (cmd) {
-		case 0x0340:
-			if (copy_from_user(&config, argp, sizeof(config)))
-				return -EFAULT;
-			tape->best_dsc_rw_freq = config.dsc_rw_frequency;
-			tape->max_stages = config.nr_stages;
-			break;
-		case 0x0350:
-			config.dsc_rw_frequency = (int) tape->best_dsc_rw_freq;
-			config.nr_stages = tape->max_stages; 
-			if (copy_to_user(argp, &config, sizeof(config)))
-				return -EFAULT;
-			break;
-		default:
-			return -EIO;
+	case 0x0340:
+		if (copy_from_user(&config, argp, sizeof(config)))
+			return -EFAULT;
+		tape->best_dsc_rw_freq = config.dsc_rw_frequency;
+		tape->max_stages = config.nr_stages;
+		break;
+	case 0x0350:
+		config.dsc_rw_frequency = (int) tape->best_dsc_rw_freq;
+		config.nr_stages = tape->max_stages;
+		if (copy_to_user(argp, &config, sizeof(config)))
+			return -EFAULT;
+		break;
+	default:
+		return -EIO;
 	}
 	return 0;
 }
@@ -2636,12 +2711,13 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, unsigned l
  * support MTFSFM when the filemark is in our internal pipeline even if the tape
  * doesn't support spacing over filemarks in the reverse direction.
  */
-static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_count)
+static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
+					int mt_count)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t pc;
 	unsigned long flags;
-	int retval,count=0;
+	int retval, count = 0;
 	int sprev = !!(tape->caps[4] & 0x20);
 
 	if (mt_count == 0)
@@ -2649,7 +2725,7 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 	if (MTBSF == mt_op || MTBSFM == mt_op) {
 		if (!sprev)
 			return -EIO;
-		mt_count = - mt_count;
+		mt_count = -mt_count;
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
@@ -2682,7 +2758,8 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 				tape->next_stage = tape->first_stage->next;
 			} else
 				spin_unlock_irqrestore(&tape->lock, flags);
-			if (tape->first_stage->rq.errors == IDETAPE_ERROR_FILEMARK)
+			if (tape->first_stage->rq.errors ==
+					IDETAPE_ERROR_FILEMARK)
 				++count;
 			idetape_remove_stage_head(drive);
 		}
@@ -2694,25 +2771,28 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
 	 * the space command.
 	 */
 	switch (mt_op) {
-		case MTFSF:
-		case MTBSF:
-			idetape_create_space_cmd(&pc,mt_count-count,IDETAPE_SPACE_OVER_FILEMARK);
-			return (idetape_queue_pc_tail(drive, &pc));
-		case MTFSFM:
-		case MTBSFM:
-			if (!sprev)
-				return (-EIO);
-			retval = idetape_space_over_filemarks(drive, MTFSF, mt_count-count);
-			if (retval) return (retval);
-			count = (MTBSFM == mt_op ? 1 : -1);
-			return (idetape_space_over_filemarks(drive, MTFSF, count));
-		default:
-			printk(KERN_ERR "ide-tape: MTIO operation %d not supported\n",mt_op);
-			return (-EIO);
+	case MTFSF:
+	case MTBSF:
+		idetape_create_space_cmd(&pc, mt_count - count,
+					 IDETAPE_SPACE_OVER_FILEMARK);
+		return idetape_queue_pc_tail(drive, &pc);
+	case MTFSFM:
+	case MTBSFM:
+		if (!sprev)
+			return -EIO;
+		retval = idetape_space_over_filemarks(drive, MTFSF,
+						      mt_count - count);
+		if (retval)
+			return retval;
+		count = (MTBSFM == mt_op ? 1 : -1);
+		return idetape_space_over_filemarks(drive, MTFSF, count);
+	default:
+		printk(KERN_ERR "ide-tape: MTIO operation %d not supported\n",
+				mt_op);
+		return -EIO;
 	}
 }
 
-
 /*
  * Our character device read / write functions.
  *
@@ -2728,12 +2808,12 @@ static int idetape_space_over_filemarks (ide_drive_t *drive,short mt_op,int mt_c
  * (slightly) increased driver overhead, but will no longer hit performance.
  * This is not applicable to Onstream.
  */
-static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
-				    size_t count, loff_t *ppos)
+static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
 {
 	struct ide_tape_obj *tape = ide_tape_f(file);
 	ide_drive_t *drive = tape->drive;
-	ssize_t bytes_read,temp, actually_read = 0, rc;
+	ssize_t bytes_read, temp, actually_read = 0, rc;
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
 
@@ -2751,8 +2831,10 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 	if (count == 0)
 		return (0);
 	if (tape->merge_stage_size) {
-		actually_read = min((unsigned int)(tape->merge_stage_size), (unsigned int)count);
-		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage, actually_read))
+		actually_read = min((unsigned int)(tape->merge_stage_size),
+				    (unsigned int)count);
+		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage,
+					       actually_read))
 			ret = -EFAULT;
 		buf += actually_read;
 		tape->merge_stage_size -= actually_read;
@@ -2762,7 +2844,8 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
 		if (bytes_read <= 0)
 			goto finish;
-		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage, bytes_read))
+		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage,
+					       bytes_read))
 			ret = -EFAULT;
 		buf += bytes_read;
 		count -= bytes_read;
@@ -2773,7 +2856,8 @@ static ssize_t idetape_chrdev_read (struct file *file, char __user *buf,
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage, temp))
+		if (idetape_copy_stage_to_user(tape, buf, tape->merge_stage,
+					       temp))
 			ret = -EFAULT;
 		actually_read += temp;
 		tape->merge_stage_size = bytes_read-temp;
@@ -2786,10 +2870,10 @@ finish:
 		return 0;
 	}
 
-	return (ret) ? ret : actually_read;
+	return ret ? ret : actually_read;
 }
 
-static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
+static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 				     size_t count, loff_t *ppos)
 {
 	struct ide_tape_obj *tape = ide_tape_f(file);
@@ -2813,7 +2897,8 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 				"should be 0 now\n");
 			tape->merge_stage_size = 0;
 		}
-		if ((tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0)) == NULL)
+		tape->merge_stage = __idetape_kmalloc_stage(tape, 0, 0);
+		if (!tape->merge_stage)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
 		idetape_init_merge_stage(tape);
@@ -2825,7 +2910,9 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 		 * drives (Seagate STT3401A) will return an error.
 		 */
 		if (drive->dsc_overlap) {
-			ssize_t retval = idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, 0, tape->merge_stage->bh);
+			ssize_t retval = idetape_queue_rw_tail(drive,
+							REQ_IDETAPE_WRITE, 0,
+							tape->merge_stage->bh);
 			if (retval < 0) {
 				__idetape_kfree_stage(tape->merge_stage);
 				tape->merge_stage = NULL;
@@ -2840,11 +2927,14 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 		idetape_restart_speed_control(drive);
 	if (tape->merge_stage_size) {
 		if (tape->merge_stage_size >= tape->stage_size) {
-			printk(KERN_ERR "ide-tape: bug: merge buffer too big\n");
+			printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
 			tape->merge_stage_size = 0;
 		}
-		actually_written = min((unsigned int)(tape->stage_size - tape->merge_stage_size), (unsigned int)count);
-		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf, actually_written))
+		actually_written = min((unsigned int)
+				(tape->stage_size - tape->merge_stage_size),
+				(unsigned int)count);
+		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf,
+						 actually_written))
 				ret = -EFAULT;
 		buf += actually_written;
 		tape->merge_stage_size += actually_written;
@@ -2860,7 +2950,8 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 	}
 	while (count >= tape->stage_size) {
 		ssize_t retval;
-		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf, tape->stage_size))
+		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf,
+						 tape->stage_size))
 			ret = -EFAULT;
 		buf += tape->stage_size;
 		count -= tape->stage_size;
@@ -2871,14 +2962,15 @@ static ssize_t idetape_chrdev_write (struct file *file, const char __user *buf,
 	}
 	if (count) {
 		actually_written += count;
-		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf, count))
+		if (idetape_copy_stage_from_user(tape, tape->merge_stage, buf,
+						 count))
 			ret = -EFAULT;
 		tape->merge_stage_size += count;
 	}
-	return (ret) ? ret : actually_written;
+	return ret ? ret : actually_written;
 }
 
-static int idetape_write_filemark (ide_drive_t *drive)
+static int idetape_write_filemark(ide_drive_t *drive)
 {
 	idetape_pc_t pc;
 
@@ -2909,110 +3001,117 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t pc;
-	int i,retval;
+	int i, retval;
 
 	debug_log(DBG_ERR, "Handling MTIOCTOP ioctl: mt_op=%d, mt_count=%d\n",
 			mt_op, mt_count);
+
 	/* Commands which need our pipelined read-ahead stages. */
 	switch (mt_op) {
-		case MTFSF:
-		case MTFSFM:
-		case MTBSF:
-		case MTBSFM:
-			if (!mt_count)
-				return (0);
-			return (idetape_space_over_filemarks(drive,mt_op,mt_count));
-		default:
-			break;
+	case MTFSF:
+	case MTFSFM:
+	case MTBSF:
+	case MTBSFM:
+		if (!mt_count)
+			return 0;
+		return idetape_space_over_filemarks(drive, mt_op, mt_count);
+	default:
+		break;
 	}
+
 	switch (mt_op) {
-		case MTWEOF:
-			if (tape->write_prot)
-				return -EACCES;
-			idetape_discard_read_pipeline(drive, 1);
-			for (i = 0; i < mt_count; i++) {
-				retval = idetape_write_filemark(drive);
-				if (retval)
-					return retval;
-			}
-			return (0);
-		case MTREW:
-			idetape_discard_read_pipeline(drive, 0);
-			if (idetape_rewind_tape(drive))
+	case MTWEOF:
+		if (tape->write_prot)
+			return -EACCES;
+		idetape_discard_read_pipeline(drive, 1);
+		for (i = 0; i < mt_count; i++) {
+			retval = idetape_write_filemark(drive);
+			if (retval)
+				return retval;
+		}
+		return 0;
+	case MTREW:
+		idetape_discard_read_pipeline(drive, 0);
+		if (idetape_rewind_tape(drive))
+			return -EIO;
+		return 0;
+	case MTLOAD:
+		idetape_discard_read_pipeline(drive, 0);
+		idetape_create_load_unload_cmd(drive, &pc,
+					       IDETAPE_LU_LOAD_MASK);
+		return idetape_queue_pc_tail(drive, &pc);
+	case MTUNLOAD:
+	case MTOFFL:
+		/*
+		 * If door is locked, attempt to unlock before
+		 * attempting to eject.
+		 */
+		if (tape->door_locked) {
+			if (idetape_create_prevent_cmd(drive, &pc, 0))
+				if (!idetape_queue_pc_tail(drive, &pc))
+					tape->door_locked = DOOR_UNLOCKED;
+		}
+		idetape_discard_read_pipeline(drive, 0);
+		idetape_create_load_unload_cmd(drive, &pc,
+					      !IDETAPE_LU_LOAD_MASK);
+		retval = idetape_queue_pc_tail(drive, &pc);
+		if (!retval)
+			clear_bit(IDETAPE_MEDIUM_PRESENT, &tape->flags);
+		return retval;
+	case MTNOP:
+		idetape_discard_read_pipeline(drive, 0);
+		return idetape_flush_tape_buffers(drive);
+	case MTRETEN:
+		idetape_discard_read_pipeline(drive, 0);
+		idetape_create_load_unload_cmd(drive, &pc,
+			IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK);
+		return idetape_queue_pc_tail(drive, &pc);
+	case MTEOM:
+		idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD);
+		return idetape_queue_pc_tail(drive, &pc);
+	case MTERASE:
+		(void)idetape_rewind_tape(drive);
+		idetape_create_erase_cmd(&pc);
+		return idetape_queue_pc_tail(drive, &pc);
+	case MTSETBLK:
+		if (mt_count) {
+			if (mt_count < tape->blk_size ||
+			    mt_count % tape->blk_size)
 				return -EIO;
+			tape->user_bs_factor = mt_count / tape->blk_size;
+			clear_bit(IDETAPE_DETECT_BS, &tape->flags);
+		} else
+			set_bit(IDETAPE_DETECT_BS, &tape->flags);
+		return 0;
+	case MTSEEK:
+		idetape_discard_read_pipeline(drive, 0);
+		return idetape_position_tape(drive,
+			mt_count * tape->user_bs_factor, tape->partition, 0);
+	case MTSETPART:
+		idetape_discard_read_pipeline(drive, 0);
+		return idetape_position_tape(drive, 0, mt_count, 0);
+	case MTFSR:
+	case MTBSR:
+	case MTLOCK:
+		if (!idetape_create_prevent_cmd(drive, &pc, 1))
 			return 0;
-		case MTLOAD:
-			idetape_discard_read_pipeline(drive, 0);
-			idetape_create_load_unload_cmd(drive, &pc, IDETAPE_LU_LOAD_MASK);
-			return (idetape_queue_pc_tail(drive, &pc));
-		case MTUNLOAD:
-		case MTOFFL:
-			/*
-			 * If door is locked, attempt to unlock before
-			 * attempting to eject.
-			 */
-			if (tape->door_locked) {
-				if (idetape_create_prevent_cmd(drive, &pc, 0))
-					if (!idetape_queue_pc_tail(drive, &pc))
-						tape->door_locked = DOOR_UNLOCKED;
-			}
-			idetape_discard_read_pipeline(drive, 0);
-			idetape_create_load_unload_cmd(drive, &pc,!IDETAPE_LU_LOAD_MASK);
-			retval = idetape_queue_pc_tail(drive, &pc);
-			if (!retval)
-				clear_bit(IDETAPE_MEDIUM_PRESENT, &tape->flags);
+		retval = idetape_queue_pc_tail(drive, &pc);
+		if (retval)
 			return retval;
-		case MTNOP:
-			idetape_discard_read_pipeline(drive, 0);
-			return (idetape_flush_tape_buffers(drive));
-		case MTRETEN:
-			idetape_discard_read_pipeline(drive, 0);
-			idetape_create_load_unload_cmd(drive, &pc,IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK);
-			return (idetape_queue_pc_tail(drive, &pc));
-		case MTEOM:
-			idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD);
-			return (idetape_queue_pc_tail(drive, &pc));
-		case MTERASE:
-			(void) idetape_rewind_tape(drive);
-			idetape_create_erase_cmd(&pc);
-			return (idetape_queue_pc_tail(drive, &pc));
-		case MTSETBLK:
-			if (mt_count) {
-				if (mt_count < tape->blk_size ||
-				    mt_count % tape->blk_size)
-					return -EIO;
-				tape->user_bs_factor = mt_count /
-							tape->blk_size;
-				clear_bit(IDETAPE_DETECT_BS, &tape->flags);
-			} else
-				set_bit(IDETAPE_DETECT_BS, &tape->flags);
-			return 0;
-		case MTSEEK:
-			idetape_discard_read_pipeline(drive, 0);
-			return idetape_position_tape(drive, mt_count * tape->user_bs_factor, tape->partition, 0);
-		case MTSETPART:
-			idetape_discard_read_pipeline(drive, 0);
-			return (idetape_position_tape(drive, 0, mt_count, 0));
-		case MTFSR:
-		case MTBSR:
-		case MTLOCK:
-			if (!idetape_create_prevent_cmd(drive, &pc, 1))
-				return 0;
-			retval = idetape_queue_pc_tail(drive, &pc);
-			if (retval) return retval;
-			tape->door_locked = DOOR_EXPLICITLY_LOCKED;
-			return 0;
-		case MTUNLOCK:
-			if (!idetape_create_prevent_cmd(drive, &pc, 0))
-				return 0;
-			retval = idetape_queue_pc_tail(drive, &pc);
-			if (retval) return retval;
-			tape->door_locked = DOOR_UNLOCKED;
+		tape->door_locked = DOOR_EXPLICITLY_LOCKED;
+		return 0;
+	case MTUNLOCK:
+		if (!idetape_create_prevent_cmd(drive, &pc, 0))
 			return 0;
-		default:
-			printk(KERN_ERR "ide-tape: MTIO operation %d not "
-				"supported\n", mt_op);
-			return (-EIO);
+		retval = idetape_queue_pc_tail(drive, &pc);
+		if (retval)
+			return retval;
+		tape->door_locked = DOOR_UNLOCKED;
+		return 0;
+	default:
+		printk(KERN_ERR "ide-tape: MTIO operation %d not supported\n",
+				mt_op);
+		return -EIO;
 	}
 }
 
@@ -3042,37 +3141,38 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
 		block_offset = idetape_pipeline_size(drive) /
 			(tape->blk_size * tape->user_bs_factor);
-		if ((position = idetape_read_position(drive)) < 0)
+		position = idetape_read_position(drive);
+		if (position < 0)
 			return -EIO;
 	}
 	switch (cmd) {
-		case MTIOCTOP:
-			if (copy_from_user(&mtop, argp, sizeof (struct mtop)))
-				return -EFAULT;
-			return (idetape_mtioctop(drive,mtop.mt_op,mtop.mt_count));
-		case MTIOCGET:
-			memset(&mtget, 0, sizeof (struct mtget));
-			mtget.mt_type = MT_ISSCSI2;
-			mtget.mt_blkno = position / tape->user_bs_factor - block_offset;
-			mtget.mt_dsreg =
-				((tape->blk_size * tape->user_bs_factor)
-				 << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK;
-
-			if (tape->drv_write_prot) {
-				mtget.mt_gstat |= GMT_WR_PROT(0xffffffff);
-			}
-			if (copy_to_user(argp, &mtget, sizeof(struct mtget)))
-				return -EFAULT;
-			return 0;
-		case MTIOCPOS:
-			mtpos.mt_blkno = position / tape->user_bs_factor - block_offset;
-			if (copy_to_user(argp, &mtpos, sizeof(struct mtpos)))
-				return -EFAULT;
-			return 0;
-		default:
-			if (tape->chrdev_dir == IDETAPE_DIR_READ)
-				idetape_discard_read_pipeline(drive, 1);
-			return idetape_blkdev_ioctl(drive, cmd, arg);
+	case MTIOCTOP:
+		if (copy_from_user(&mtop, argp, sizeof(struct mtop)))
+			return -EFAULT;
+		return idetape_mtioctop(drive, mtop.mt_op, mtop.mt_count);
+	case MTIOCGET:
+		memset(&mtget, 0, sizeof(struct mtget));
+		mtget.mt_type = MT_ISSCSI2;
+		mtget.mt_blkno = position / tape->user_bs_factor - block_offset;
+		mtget.mt_dsreg =
+			((tape->blk_size * tape->user_bs_factor)
+			 << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK;
+
+		if (tape->drv_write_prot)
+			mtget.mt_gstat |= GMT_WR_PROT(0xffffffff);
+
+		if (copy_to_user(argp, &mtget, sizeof(struct mtget)))
+			return -EFAULT;
+		return 0;
+	case MTIOCPOS:
+		mtpos.mt_blkno = position / tape->user_bs_factor - block_offset;
+		if (copy_to_user(argp, &mtpos, sizeof(struct mtpos)))
+			return -EFAULT;
+		return 0;
+	default:
+		if (tape->chrdev_dir == IDETAPE_DIR_READ)
+			idetape_discard_read_pipeline(drive, 1);
+		return idetape_blkdev_ioctl(drive, cmd, arg);
 	}
 }
 
@@ -3101,7 +3201,7 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive)
 	tape->drv_write_prot = (pc.buffer[2] & 0x80) >> 7;
 }
 
-static int idetape_chrdev_open (struct inode *inode, struct file *filp)
+static int idetape_chrdev_open(struct inode *inode, struct file *filp)
 {
 	unsigned int minor = iminor(inode), i = minor & ~0xc0;
 	ide_drive_t *drive;
@@ -3185,7 +3285,7 @@ out_put_tape:
 	return retval;
 }
 
-static void idetape_write_release (ide_drive_t *drive, unsigned int minor)
+static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
@@ -3202,7 +3302,7 @@ static void idetape_write_release (ide_drive_t *drive, unsigned int minor)
 	idetape_flush_tape_buffers(drive);
 }
 
-static int idetape_chrdev_release (struct inode *inode, struct file *filp)
+static int idetape_chrdev_release(struct inode *inode, struct file *filp)
 {
 	struct ide_tape_obj *tape = ide_tape_f(filp);
 	ide_drive_t *drive = tape->drive;
@@ -3309,7 +3409,7 @@ static void idetape_get_inquiry_results(ide_drive_t *drive)
  * Ask the tape about its various parameters. In particular, we will adjust our
  * data transfer buffer	size to the recommended value as returned by the tape.
  */
-static void idetape_get_mode_sense_results (ide_drive_t *drive)
+static void idetape_get_mode_sense_results(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	idetape_pc_t pc;
@@ -3356,17 +3456,24 @@ static void idetape_get_mode_sense_results (ide_drive_t *drive)
 }
 
 #ifdef CONFIG_IDE_PROC_FS
-static void idetape_add_settings (ide_drive_t *drive)
+static void idetape_add_settings(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_add_setting(drive, "buffer", SETTING_READ, TYPE_SHORT, 0, 0xffff,
 			1, 2, (u16 *)&tape->caps[16], NULL);
-	ide_add_setting(drive,	"pipeline_min",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->min_pipeline,			NULL);
-	ide_add_setting(drive,	"pipeline",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->max_stages,			NULL);
-	ide_add_setting(drive,	"pipeline_max",		SETTING_RW,	TYPE_INT,	1,			0xffff,			tape->stage_size / 1024,	1,		&tape->max_pipeline,			NULL);
-	ide_add_setting(drive,	"pipeline_used",	SETTING_READ,	TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,		&tape->nr_stages,			NULL);
-	ide_add_setting(drive,	"pipeline_pending",	SETTING_READ,	TYPE_INT,	0,			0xffff,			tape->stage_size / 1024,	1,		&tape->nr_pending_stages,		NULL);
+	ide_add_setting(drive, "pipeline_min", SETTING_RW, TYPE_INT, 1, 0xffff,
+			tape->stage_size / 1024, 1, &tape->min_pipeline, NULL);
+	ide_add_setting(drive, "pipeline", SETTING_RW, TYPE_INT, 1, 0xffff,
+			tape->stage_size / 1024, 1, &tape->max_stages, NULL);
+	ide_add_setting(drive, "pipeline_max", SETTING_RW, TYPE_INT, 1,	0xffff,
+			tape->stage_size / 1024, 1, &tape->max_pipeline, NULL);
+	ide_add_setting(drive, "pipeline_used",	SETTING_READ, TYPE_INT, 0,
+			0xffff,	tape->stage_size / 1024, 1, &tape->nr_stages,
+			NULL);
+	ide_add_setting(drive, "pipeline_pending", SETTING_READ, TYPE_INT, 0,
+			0xffff, tape->stage_size / 1024, 1,
+			&tape->nr_pending_stages, NULL);
 	ide_add_setting(drive, "speed", SETTING_READ, TYPE_SHORT, 0, 0xffff,
 			1, 1, (u16 *)&tape->caps[14], NULL);
 	ide_add_setting(drive, "stage", SETTING_READ, TYPE_INT,	0, 0xffff, 1,
@@ -3374,10 +3481,16 @@ static void idetape_add_settings (ide_drive_t *drive)
 	ide_add_setting(drive, "tdsc", SETTING_RW, TYPE_INT, IDETAPE_DSC_RW_MIN,
 			IDETAPE_DSC_RW_MAX, 1000, HZ, &tape->best_dsc_rw_freq,
 			NULL);
-	ide_add_setting(drive,	"dsc_overlap",		SETTING_RW,	TYPE_BYTE,	0,			1,			1,				1,		&drive->dsc_overlap,			NULL);
-	ide_add_setting(drive,	"pipeline_head_speed_c",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->controlled_pipeline_head_speed,	NULL);
-	ide_add_setting(drive,	"pipeline_head_speed_u",SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->uncontrolled_pipeline_head_speed,NULL);
-	ide_add_setting(drive,	"avg_speed",		SETTING_READ,	TYPE_INT,	0,			0xffff,			1,				1,		&tape->avg_speed,			NULL);
+	ide_add_setting(drive, "dsc_overlap", SETTING_RW, TYPE_BYTE, 0, 1, 1,
+			1, &drive->dsc_overlap, NULL);
+	ide_add_setting(drive, "pipeline_head_speed_c", SETTING_READ, TYPE_INT,
+			0, 0xffff, 1, 1, &tape->controlled_pipeline_head_speed,
+			NULL);
+	ide_add_setting(drive, "pipeline_head_speed_u", SETTING_READ, TYPE_INT,
+			0, 0xffff, 1, 1,
+			&tape->uncontrolled_pipeline_head_speed, NULL);
+	ide_add_setting(drive, "avg_speed", SETTING_READ, TYPE_INT, 0, 0xffff,
+			1, 1, &tape->avg_speed, NULL);
 	ide_add_setting(drive, "debug_mask", SETTING_RW, TYPE_INT, 0, 0xffff, 1,
 			1, &tape->debug_mask, NULL);
 }
@@ -3396,7 +3509,7 @@ static inline void idetape_add_settings(ide_drive_t *drive) { ; }
  * Note that at this point ide.c already assigned us an irq, so that we can
  * queue requests here and wait for their completion.
  */
-static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
+static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 {
 	unsigned long t1, tmid, tn, t;
 	int speed;
@@ -3429,7 +3542,9 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	if (((gcw[0] & 0x60) >> 5) == 1)
 		set_bit(IDETAPE_DRQ_INTERRUPT, &tape->flags);
 
-	tape->min_pipeline = tape->max_pipeline = tape->max_stages = 10;
+	tape->min_pipeline = 10;
+	tape->max_pipeline = 10;
+	tape->max_stages   = 10;
 
 	idetape_get_inquiry_results(drive);
 	idetape_get_mode_sense_results(drive);
@@ -3455,13 +3570,20 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 
 	/* Limit memory use for pipeline to 10% of physical memory */
 	si_meminfo(&si);
-	if (tape->max_stages * tape->stage_size > si.totalram * si.mem_unit / 10)
-		tape->max_stages = si.totalram * si.mem_unit / (10 * tape->stage_size);
+	if (tape->max_stages * tape->stage_size >
+			si.totalram * si.mem_unit / 10)
+		tape->max_stages =
+			si.totalram * si.mem_unit / (10 * tape->stage_size);
+
 	tape->max_stages   = min(tape->max_stages, IDETAPE_MAX_PIPELINE_STAGES);
 	tape->min_pipeline = min(tape->max_stages, IDETAPE_MIN_PIPELINE_STAGES);
-	tape->max_pipeline = min(tape->max_stages * 2, IDETAPE_MAX_PIPELINE_STAGES);
-	if (tape->max_stages == 0)
-		tape->max_stages = tape->min_pipeline = tape->max_pipeline = 1;
+	tape->max_pipeline =
+		min(tape->max_stages * 2, IDETAPE_MAX_PIPELINE_STAGES);
+	if (tape->max_stages == 0) {
+		tape->max_stages   = 1;
+		tape->min_pipeline = 1;
+		tape->max_pipeline = 1;
+	}
 
 	t1 = (tape->stage_size * HZ) / (speed * 1000);
 	tmid = (*(u16 *)&tape->caps[16] * 32 * HZ) / (speed * 125);
@@ -3513,7 +3635,8 @@ static void ide_tape_release(struct kref *kref)
 	drive->dsc_overlap = 0;
 	drive->driver_data = NULL;
 	device_destroy(idetape_sysfs_class, MKDEV(IDETAPE_MAJOR, tape->minor));
-	device_destroy(idetape_sysfs_class, MKDEV(IDETAPE_MAJOR, tape->minor + 128));
+	device_destroy(idetape_sysfs_class,
+			MKDEV(IDETAPE_MAJOR, tape->minor + 128));
 	idetape_devs[tape->minor] = NULL;
 	g->private_data = NULL;
 	put_disk(g);
@@ -3577,7 +3700,8 @@ static int idetape_open(struct inode *inode, struct file *filp)
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct ide_tape_obj *tape;
 
-	if (!(tape = ide_tape_get(disk)))
+	tape = ide_tape_get(disk);
+	if (!tape)
 		return -ENXIO;
 
 	return 0;
@@ -3624,17 +3748,20 @@ static int ide_tape_probe(ide_drive_t *drive)
 		goto failed;
 	if (drive->media != ide_tape)
 		goto failed;
-	if (!idetape_identify_device (drive)) {
-		printk(KERN_ERR "ide-tape: %s: not supported by this version of ide-tape\n", drive->name);
+	if (!idetape_identify_device(drive)) {
+		printk(KERN_ERR "ide-tape: %s: not supported by this version of"
+				" the driver\n", drive->name);
 		goto failed;
 	}
 	if (drive->scsi) {
-		printk("ide-tape: passing drive %s to ide-scsi emulation.\n", drive->name);
+		printk(KERN_INFO "ide-tape: passing drive %s to ide-scsi"
+				 " emulation.\n", drive->name);
 		goto failed;
 	}
-	tape = kzalloc(sizeof (idetape_tape_t), GFP_KERNEL);
+	tape = kzalloc(sizeof(idetape_tape_t), GFP_KERNEL);
 	if (tape == NULL) {
-		printk(KERN_ERR "ide-tape: %s: Can't allocate a tape structure\n", drive->name);
+		printk(KERN_ERR "ide-tape: %s: Can't allocate a tape struct\n",
+				drive->name);
 		goto failed;
 	}
 
@@ -3680,7 +3807,7 @@ failed:
 	return -ENODEV;
 }
 
-static void __exit idetape_exit (void)
+static void __exit idetape_exit(void)
 {
 	driver_unregister(&idetape_driver.gen_driver);
 	class_destroy(idetape_sysfs_class);
@@ -3699,7 +3826,8 @@ static int __init idetape_init(void)
 	}
 
 	if (register_chrdev(IDETAPE_MAJOR, "ht", &idetape_fops)) {
-		printk(KERN_ERR "ide-tape: Failed to register character device interface\n");
+		printk(KERN_ERR "ide-tape: Failed to register chrdev"
+				" interface\n");
 		error = -EBUSY;
 		goto out_free_class;
 	}
-- 
cgit v1.1


From dfe799364e7a500389559e1dcd331d995cdc18ea Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Wed, 6 Feb 2008 02:57:55 +0100
Subject: ide-tape: bump minor driver version

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 11e1383..49dd2e7 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -15,7 +15,7 @@
  * Documentation/ide/ChangeLog.ide-tape.1995-2002
  */
 
-#define IDETAPE_VERSION "1.19"
+#define IDETAPE_VERSION "1.20"
 
 #include <linux/module.h>
 #include <linux/types.h>
-- 
cgit v1.1