75 files changed, 6573 insertions, 1102 deletions
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 1eb308c..e11b5fc 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -58,6 +58,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool y
 
+config ARCH_DMA_ADDR_T_64BIT
+	def_bool y
+
 config LOCKDEP_SUPPORT
 	def_bool y
 
@@ -96,6 +99,7 @@ config HVC_TILE
 
 config TILE
 	def_bool y
+	select HAVE_KVM if !TILEGX
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_FIND_NEXT_BIT
 	select USE_GENERIC_SMP_HELPERS
@@ -113,8 +117,6 @@ config TILE
 #       config HUGETLB_PAGE_SIZE_VARIABLE
 
 
-mainmenu "Linux/TILE Kernel Configuration"
-
 # Please note: TILE-Gx support is not yet finalized; this is
 # the preliminary support.  TILE-Gx drivers are only provided
 # with the alpha or beta test versions for Tilera customers.
@@ -236,9 +238,9 @@ choice
 	  If you are not absolutely sure what you are doing, leave this
 	  option alone!
 
-	config VMSPLIT_375G
+	config VMSPLIT_3_75G
 		bool "3.75G/0.25G user/kernel split (no kernel networking)"
-	config VMSPLIT_35G
+	config VMSPLIT_3_5G
 		bool "3.5G/0.5G user/kernel split"
 	config VMSPLIT_3G
 		bool "3G/1G user/kernel split"
@@ -252,8 +254,8 @@ endchoice
 
 config PAGE_OFFSET
 	hex
-	default 0xF0000000 if VMSPLIT_375G
-	default 0xE0000000 if VMSPLIT_35G
+	default 0xF0000000 if VMSPLIT_3_75G
+	default 0xE0000000 if VMSPLIT_3_5G
 	default 0xB0000000 if VMSPLIT_3G_OPT
 	default 0x80000000 if VMSPLIT_2G
 	default 0x40000000 if VMSPLIT_1G
@@ -314,10 +316,31 @@ config HARDWALL
 	bool "Hardwall support to allow access to user dynamic network"
 	default y
 
+config KERNEL_PL
+	int "Processor protection level for kernel"
+	range 1 2
+	default "1"
+	---help---
+	  This setting determines the processor protection level the
+	  kernel will be built to run at.  Generally you should use
+	  the default value here.
+
 endmenu  # Tilera-specific configuration
 
 menu "Bus options"
 
+config PCI
+	bool "PCI support"
+	default y
+	select PCI_DOMAINS
+	---help---
+	  Enable PCI root complex support, so PCIe endpoint devices can
+	  be attached to the Tile chip.  Many, but not all, PCI devices
+	  are supported under Tilera's root complex driver.
+
+config PCI_DOMAINS
+	bool
+
 config NO_IOMEM
 	def_bool !PCI
 
@@ -354,3 +377,5 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
+
+source "arch/tile/kvm/Kconfig"
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index fd8f6bb..17acce7 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -26,8 +26,9 @@ $(error Set TILERA_ROOT or CROSS_COMPILE when building $(ARCH) on $(HOST_ARCH))
   endif
 endif
 
-
+ifneq ($(CONFIG_DEBUG_EXTRA_FLAGS),"")
 KBUILD_CFLAGS   += $(CONFIG_DEBUG_EXTRA_FLAGS)
+endif
 
 LIBGCC_PATH     := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
 
@@ -49,6 +50,20 @@ head-y		:= arch/tile/kernel/head_$(BITS).o
 libs-y		+= arch/tile/lib/
 libs-y		+= $(LIBGCC_PATH)
 
-
 # See arch/tile/Kbuild for content of core part of the kernel
 core-y		+= arch/tile/
+
+core-$(CONFIG_KVM) += arch/tile/kvm/
+
+ifdef TILERA_ROOT
+INSTALL_PATH ?= $(TILERA_ROOT)/tile/boot
+endif
+
+install:
+	install -D -m 755 vmlinux $(INSTALL_PATH)/vmlinux-$(KERNELRELEASE)
+	install -D -m 644 .config $(INSTALL_PATH)/config-$(KERNELRELEASE)
+	install -D -m 644 System.map $(INSTALL_PATH)/System.map-$(KERNELRELEASE)
+
+define archhelp
+	echo '  install         - install kernel into $(INSTALL_PATH)'
+endef
diff --git a/arch/tile/include/arch/sim.h b/arch/tile/include/arch/sim.h
new file mode 100644
index 0000000..74b7c16
--- /dev/null
+++ b/arch/tile/include/arch/sim.h
@@ -0,0 +1,619 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file
+ *
+ * Provides an API for controlling the simulator at runtime.
+ */
+
+/**
+ * @addtogroup arch_sim
+ * @{
+ *
+ * An API for controlling the simulator at runtime.
+ *
+ * The simulator's behavior can be modified while it is running.
+ * For example, human-readable trace output can be enabled and disabled
+ * around code of interest.
+ *
+ * There are two ways to modify simulator behavior:
+ * programmatically, by calling various sim_* functions, and
+ * interactively, by entering commands like "sim set functional true"
+ * at the tile-monitor prompt.  Typing "sim help" at that prompt provides
+ * a list of interactive commands.
+ *
+ * All interactive commands can also be executed programmatically by
+ * passing a string to the sim_command function.
+ */
+
+#ifndef __ARCH_SIM_H__
+#define __ARCH_SIM_H__
+
+#include <arch/sim_def.h>
+#include <arch/abi.h>
+
+#ifndef __ASSEMBLER__
+
+#include <arch/spr_def.h>
+
+
+/**
+ * Return true if the current program is running under a simulator,
+ * rather than on real hardware.  If running on hardware, other "sim_xxx()"
+ * calls have no useful effect.
+ */
+static inline int
+sim_is_simulator(void)
+{
+  return __insn_mfspr(SPR_SIM_CONTROL) != 0;
+}
+
+
+/**
+ * Checkpoint the simulator state to a checkpoint file.
+ *
+ * The checkpoint file name is either the default or the name specified
+ * on the command line with "--checkpoint-file".
+ */
+static __inline void
+sim_checkpoint(void)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_CHECKPOINT);
+}
+
+
+/**
+ * Report whether or not various kinds of simulator tracing are enabled.
+ *
+ * @return The bitwise OR of these values:
+ *
+ * SIM_TRACE_CYCLES (--trace-cycles),
+ * SIM_TRACE_ROUTER (--trace-router),
+ * SIM_TRACE_REGISTER_WRITES (--trace-register-writes),
+ * SIM_TRACE_DISASM (--trace-disasm),
+ * SIM_TRACE_STALL_INFO (--trace-stall-info)
+ * SIM_TRACE_MEMORY_CONTROLLER (--trace-memory-controller)
+ * SIM_TRACE_L2_CACHE (--trace-l2)
+ * SIM_TRACE_LINES (--trace-lines)
+ */
+static __inline unsigned int
+sim_get_tracing(void)
+{
+  return __insn_mfspr(SPR_SIM_CONTROL) & SIM_TRACE_FLAG_MASK;
+}
+
+
+/**
+ * Turn on or off different kinds of simulator tracing.
+ *
+ * @param mask Either one of these special values:
+ *
+ * SIM_TRACE_NONE (turns off tracing),
+ * SIM_TRACE_ALL (turns on all possible tracing).
+ *
+ * or the bitwise OR of these values:
+ *
+ * SIM_TRACE_CYCLES (--trace-cycles),
+ * SIM_TRACE_ROUTER (--trace-router),
+ * SIM_TRACE_REGISTER_WRITES (--trace-register-writes),
+ * SIM_TRACE_DISASM (--trace-disasm),
+ * SIM_TRACE_STALL_INFO (--trace-stall-info)
+ * SIM_TRACE_MEMORY_CONTROLLER (--trace-memory-controller)
+ * SIM_TRACE_L2_CACHE (--trace-l2)
+ * SIM_TRACE_LINES (--trace-lines)
+ */
+static __inline void
+sim_set_tracing(unsigned int mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_TRACE_SPR_ARG(mask));
+}
+
+
+/**
+ * Request dumping of different kinds of simulator state.
+ *
+ * @param mask Either this special value:
+ *
+ * SIM_DUMP_ALL (dump all known state)
+ *
+ * or the bitwise OR of these values:
+ *
+ * SIM_DUMP_REGS (the register file),
+ * SIM_DUMP_SPRS (the SPRs),
+ * SIM_DUMP_ITLB (the iTLB),
+ * SIM_DUMP_DTLB (the dTLB),
+ * SIM_DUMP_L1I (the L1 I-cache),
+ * SIM_DUMP_L1D (the L1 D-cache),
+ * SIM_DUMP_L2 (the L2 cache),
+ * SIM_DUMP_SNREGS (the switch register file),
+ * SIM_DUMP_SNITLB (the switch iTLB),
+ * SIM_DUMP_SNL1I (the switch L1 I-cache),
+ * SIM_DUMP_BACKTRACE (the current backtrace)
+ */
+static __inline void
+sim_dump(unsigned int mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_DUMP_SPR_ARG(mask));
+}
+
+
+/**
+ * Print a string to the simulator stdout.
+ *
+ * @param str The string to be written; a newline is automatically added.
+ */
+static __inline void
+sim_print_string(const char* str)
+{
+  int i;
+  for (i = 0; str[i] != 0; i++)
+  {
+    __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+                 (str[i] << _SIM_CONTROL_OPERATOR_BITS));
+  }
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+               (SIM_PUTC_FLUSH_STRING << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+
+/**
+ * Execute a simulator command string.
+ *
+ * Type 'sim help' at the tile-monitor prompt to learn what commands
+ * are available.  Note the use of the tile-monitor "sim" command to
+ * pass commands to the simulator.
+ *
+ * The argument to sim_command() does not include the leading "sim"
+ * prefix used at the tile-monitor prompt; for example, you might call
+ * sim_command("trace disasm").
+ */
+static __inline void
+sim_command(const char* str)
+{
+  int c;
+  do
+  {
+    c = *str++;
+    __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_COMMAND |
+                 (c << _SIM_CONTROL_OPERATOR_BITS));
+  }
+  while (c);
+}
+
+
+
+#ifndef __DOXYGEN__
+
+/**
+ * The underlying implementation of "_sim_syscall()".
+ *
+ * We use extra "and" instructions to ensure that all the values
+ * we are passing to the simulator are actually valid in the registers
+ * (i.e. returned from memory) prior to the SIM_CONTROL spr.
+ */
+static __inline int _sim_syscall0(int val)
+{
+  long result;
+  __asm__ __volatile__ ("mtspr SIM_CONTROL, r0"
+                        : "=R00" (result) : "R00" (val));
+  return result;
+}
+
+static __inline int _sim_syscall1(int val, long arg1)
+{
+  long result;
+  __asm__ __volatile__ ("{ and zero, r1, r1; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (result) : "R00" (val), "R01" (arg1));
+  return result;
+}
+
+static __inline int _sim_syscall2(int val, long arg1, long arg2)
+{
+  long result;
+  __asm__ __volatile__ ("{ and zero, r1, r2; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (result)
+                        : "R00" (val), "R01" (arg1), "R02" (arg2));
+  return result;
+}
+
+/* Note that _sim_syscall3() and higher are technically at risk of
+   receiving an interrupt right before the mtspr bundle, in which case
+   the register values for arguments 3 and up may still be in flight
+   to the core from a stack frame reload. */
+
+static __inline int _sim_syscall3(int val, long arg1, long arg2, long arg3)
+{
+  long result;
+  __asm__ __volatile__ ("{ and zero, r3, r3 };"
+                        "{ and zero, r1, r2; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (result)
+                        : "R00" (val), "R01" (arg1), "R02" (arg2),
+                          "R03" (arg3));
+  return result;
+}
+
+static __inline int _sim_syscall4(int val, long arg1, long arg2, long arg3,
+                                  long arg4)
+{
+  long result;
+  __asm__ __volatile__ ("{ and zero, r3, r4 };"
+                        "{ and zero, r1, r2; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (result)
+                        : "R00" (val), "R01" (arg1), "R02" (arg2),
+                          "R03" (arg3), "R04" (arg4));
+  return result;
+}
+
+static __inline int _sim_syscall5(int val, long arg1, long arg2, long arg3,
+                                  long arg4, long arg5)
+{
+  long result;
+  __asm__ __volatile__ ("{ and zero, r3, r4; and zero, r5, r5 };"
+                        "{ and zero, r1, r2; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (result)
+                        : "R00" (val), "R01" (arg1), "R02" (arg2),
+                          "R03" (arg3), "R04" (arg4), "R05" (arg5));
+  return result;
+}
+
+
+/**
+ * Make a special syscall to the simulator itself, if running under
+ * simulation. This is used as the implementation of other functions
+ * and should not be used outside this file.
+ *
+ * @param syscall_num The simulator syscall number.
+ * @param nr The number of additional arguments provided.
+ *
+ * @return Varies by syscall.
+ */
+#define _sim_syscall(syscall_num, nr, args...) \
+  _sim_syscall##nr( \
+    ((syscall_num) << _SIM_CONTROL_OPERATOR_BITS) | SIM_CONTROL_SYSCALL, args)
+
+
+/* Values for the "access_mask" parameters below. */
+#define SIM_WATCHPOINT_READ    1
+#define SIM_WATCHPOINT_WRITE   2
+#define SIM_WATCHPOINT_EXECUTE 4
+
+
+static __inline int
+sim_add_watchpoint(unsigned int process_id,
+                   unsigned long address,
+                   unsigned long size,
+                   unsigned int access_mask,
+                   unsigned long user_data)
+{
+  return _sim_syscall(SIM_SYSCALL_ADD_WATCHPOINT, 5, process_id,
+                     address, size, access_mask, user_data);
+}
+
+
+static __inline int
+sim_remove_watchpoint(unsigned int process_id,
+                      unsigned long address,
+                      unsigned long size,
+                      unsigned int access_mask,
+                      unsigned long user_data)
+{
+  return _sim_syscall(SIM_SYSCALL_REMOVE_WATCHPOINT, 5, process_id,
+                     address, size, access_mask, user_data);
+}
+
+
+/**
+ * Return value from sim_query_watchpoint.
+ */
+struct SimQueryWatchpointStatus
+{
+  /**
+   * 0 if a watchpoint fired, 1 if no watchpoint fired, or -1 for
+   * error (meaning a bad process_id).
+   */
+  int syscall_status;
+
+  /**
+   * The address of the watchpoint that fired (this is the address
+   * passed to sim_add_watchpoint, not an address within that range
+   * that actually triggered the watchpoint).
+   */
+  unsigned long address;
+
+  /** The arbitrary user_data installed by sim_add_watchpoint. */
+  unsigned long user_data;
+};
+
+
+static __inline struct SimQueryWatchpointStatus
+sim_query_watchpoint(unsigned int process_id)
+{
+  struct SimQueryWatchpointStatus status;
+  long val = SIM_CONTROL_SYSCALL |
+    (SIM_SYSCALL_QUERY_WATCHPOINT << _SIM_CONTROL_OPERATOR_BITS);
+  __asm__ __volatile__ ("{ and zero, r1, r1; mtspr SIM_CONTROL, r0 }"
+                        : "=R00" (status.syscall_status),
+                          "=R01" (status.address),
+                          "=R02" (status.user_data)
+                        : "R00" (val), "R01" (process_id));
+  return status;
+}
+
+
+/* On the simulator, confirm lines have been evicted everywhere. */
+static __inline void
+sim_validate_lines_evicted(unsigned long long pa, unsigned long length)
+{
+#ifdef __LP64__
+  _sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED, 2, pa, length);
+#else
+  _sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED, 4,
+               0 /* dummy */, (long)(pa), (long)(pa >> 32), length);
+#endif
+}
+
+
+#endif /* !__DOXYGEN__ */
+
+
+
+
+/**
+ * Modify the shaping parameters of a shim.
+ *
+ * @param shim The shim to modify. One of:
+ *   SIM_CONTROL_SHAPING_GBE_0
+ *   SIM_CONTROL_SHAPING_GBE_1
+ *   SIM_CONTROL_SHAPING_GBE_2
+ *   SIM_CONTROL_SHAPING_GBE_3
+ *   SIM_CONTROL_SHAPING_XGBE_0
+ *   SIM_CONTROL_SHAPING_XGBE_1
+ *
+ * @param type The type of shaping. This should be the same type of
+ * shaping that is already in place on the shim. One of:
+ *   SIM_CONTROL_SHAPING_MULTIPLIER
+ *   SIM_CONTROL_SHAPING_PPS
+ *   SIM_CONTROL_SHAPING_BPS
+ *
+ * @param units The magnitude of the rate. One of:
+ *   SIM_CONTROL_SHAPING_UNITS_SINGLE
+ *   SIM_CONTROL_SHAPING_UNITS_KILO
+ *   SIM_CONTROL_SHAPING_UNITS_MEGA
+ *   SIM_CONTROL_SHAPING_UNITS_GIGA
+ *
+ * @param rate The rate to which to change it. This must fit in
+ * SIM_CONTROL_SHAPING_RATE_BITS bits or a warning is issued and
+ * the shaping is not changed.
+ *
+ * @return 0 if no problems were detected in the arguments to sim_set_shaping
+ * or 1 if problems were detected (for example, rate does not fit in 17 bits).
+ */
+static __inline int
+sim_set_shaping(unsigned shim,
+                unsigned type,
+                unsigned units,
+                unsigned rate)
+{
+  if ((rate & ~((1 << SIM_CONTROL_SHAPING_RATE_BITS) - 1)) != 0)
+    return 1;
+
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_SHAPING_SPR_ARG(shim, type, units, rate));
+  return 0;
+}
+
+#ifdef __tilegx__
+
+/** Enable a set of mPIPE links.  Pass a -1 link_mask to enable all links. */
+static __inline void
+sim_enable_mpipe_links(unsigned mpipe, unsigned long link_mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL,
+               (SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE |
+                (mpipe << 8) | (1 << 16) | ((uint_reg_t)link_mask << 32)));
+}
+
+/** Disable a set of mPIPE links.  Pass a -1 link_mask to disable all links. */
+static __inline void
+sim_disable_mpipe_links(unsigned mpipe, unsigned long link_mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL,
+               (SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE |
+                (mpipe << 8) | (0 << 16) | ((uint_reg_t)link_mask << 32)));
+}
+
+#endif /* __tilegx__ */
+
+
+/*
+ * An API for changing "functional" mode.
+ */
+
+#ifndef __DOXYGEN__
+
+#define sim_enable_functional() \
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_ENABLE_FUNCTIONAL)
+
+#define sim_disable_functional() \
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_DISABLE_FUNCTIONAL)
+
+#endif /* __DOXYGEN__ */
+
+
+/*
+ * Profiler support.
+ */
+
+/**
+ * Turn profiling on for the current task.
+ *
+ * Note that this has no effect if run in an environment without
+ * profiling support (thus, the proper flags to the simulator must
+ * be supplied).
+ */
+static __inline void
+sim_profiler_enable(void)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PROFILER_ENABLE);
+}
+
+
+/** Turn profiling off for the current task. */
+static __inline void
+sim_profiler_disable(void)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PROFILER_DISABLE);
+}
+
+
+/**
+ * Turn profiling on or off for the current task.
+ *
+ * @param enabled If true, turns on profiling. If false, turns it off.
+ *
+ * Note that this has no effect if run in an environment without
+ * profiling support (thus, the proper flags to the simulator must
+ * be supplied).
+ */
+static __inline void
+sim_profiler_set_enabled(int enabled)
+{
+  int val =
+    enabled ? SIM_CONTROL_PROFILER_ENABLE : SIM_CONTROL_PROFILER_DISABLE;
+  __insn_mtspr(SPR_SIM_CONTROL, val);
+}
+
+
+/**
+ * Return true if and only if profiling is currently enabled
+ * for the current task.
+ *
+ * This returns false even if sim_profiler_enable() was called
+ * if the current execution environment does not support profiling.
+ */
+static __inline int
+sim_profiler_is_enabled(void)
+{
+  return ((__insn_mfspr(SPR_SIM_CONTROL) & SIM_PROFILER_ENABLED_MASK) != 0);
+}
+
+
+/**
+ * Reset profiling counters to zero for the current task.
+ *
+ * Resetting can be done while profiling is enabled.  It does not affect
+ * the chip-wide profiling counters.
+ */
+static __inline void
+sim_profiler_clear(void)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PROFILER_CLEAR);
+}
+
+
+/**
+ * Enable specified chip-level profiling counters.
+ *
+ * Does not affect the per-task profiling counters.
+ *
+ * @param mask Either this special value:
+ *
+ * SIM_CHIP_ALL (enables all chip-level components).
+ *
+ * or the bitwise OR of these values:
+ *
+ * SIM_CHIP_MEMCTL (enable all memory controllers)
+ * SIM_CHIP_XAUI (enable all XAUI controllers)
+ * SIM_CHIP_MPIPE (enable all MPIPE controllers)
+ */
+static __inline void
+sim_profiler_chip_enable(unsigned int mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_ENABLE_SPR_ARG(mask));
+}
+
+
+/**
+ * Disable specified chip-level profiling counters.
+ *
+ * Does not affect the per-task profiling counters.
+ *
+ * @param mask Either this special value:
+ *
+ * SIM_CHIP_ALL (disables all chip-level components).
+ *
+ * or the bitwise OR of these values:
+ *
+ * SIM_CHIP_MEMCTL (disable all memory controllers)
+ * SIM_CHIP_XAUI (disable all XAUI controllers)
+ * SIM_CHIP_MPIPE (disable all MPIPE controllers)
+ */
+static __inline void
+sim_profiler_chip_disable(unsigned int mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_DISABLE_SPR_ARG(mask));
+}
+
+
+/**
+ * Reset specified chip-level profiling counters to zero.
+ *
+ * Does not affect the per-task profiling counters.
+ *
+ * @param mask Either this special value:
+ *
+ * SIM_CHIP_ALL (clears all chip-level components).
+ *
+ * or the bitwise OR of these values:
+ *
+ * SIM_CHIP_MEMCTL (clear all memory controllers)
+ * SIM_CHIP_XAUI (clear all XAUI controllers)
+ * SIM_CHIP_MPIPE (clear all MPIPE controllers)
+ */
+static __inline void
+sim_profiler_chip_clear(unsigned int mask)
+{
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
+}
+
+
+/*
+ * Event support.
+ */
+
+#ifndef __DOXYGEN__
+
+static __inline void
+sim_event_begin(unsigned int x)
+{
+#if defined(__tile__) && !defined(__NO_EVENT_SPR__)
+  __insn_mtspr(SPR_EVENT_BEGIN, x);
+#endif
+}
+
+static __inline void
+sim_event_end(unsigned int x)
+{
+#if defined(__tile__) && !defined(__NO_EVENT_SPR__)
+  __insn_mtspr(SPR_EVENT_END, x);
+#endif
+}
+
+#endif /* !__DOXYGEN__ */
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* !__ARCH_SIM_H__ */
+
+/** @} */
diff --git a/arch/tile/include/arch/sim_def.h b/arch/tile/include/arch/sim_def.h
index 6418fbd..7a17082 100644
--- a/arch/tile/include/arch/sim_def.h
+++ b/arch/tile/include/arch/sim_def.h
@@ -1,477 +1,461 @@
-// Copyright 2010 Tilera Corporation. All Rights Reserved.
-//
-//   This program is free software; you can redistribute it and/or
-//   modify it under the terms of the GNU General Public License
-//   as published by the Free Software Foundation, version 2.
-//
-//   This program is distributed in the hope that it will be useful, but
-//   WITHOUT ANY WARRANTY; without even the implied warranty of
-//   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
-//   NON INFRINGEMENT.  See the GNU General Public License for
-//   more details.
-
-//! @file
-//!
-//! Some low-level simulator definitions.
-//!
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file
+ *
+ * Some low-level simulator definitions.
+ */
 
 #ifndef __ARCH_SIM_DEF_H__
 #define __ARCH_SIM_DEF_H__
 
 
-//! Internal: the low bits of the SIM_CONTROL_* SPR values specify
-//! the operation to perform, and the remaining bits are
-//! an operation-specific parameter (often unused).
-//!
+/**
+ * Internal: the low bits of the SIM_CONTROL_* SPR values specify
+ * the operation to perform, and the remaining bits are
+ * an operation-specific parameter (often unused).
+ */
 #define _SIM_CONTROL_OPERATOR_BITS 8
 
 
-//== Values which can be written to SPR_SIM_CONTROL.
+/*
+ * Values which can be written to SPR_SIM_CONTROL.
+ */
 
-//! If written to SPR_SIM_CONTROL, stops profiling.
-//!
+/** If written to SPR_SIM_CONTROL, stops profiling. */
 #define SIM_CONTROL_PROFILER_DISABLE 0
 
-//! If written to SPR_SIM_CONTROL, starts profiling.
-//!
+/** If written to SPR_SIM_CONTROL, starts profiling. */
 #define SIM_CONTROL_PROFILER_ENABLE 1
 
-//! If written to SPR_SIM_CONTROL, clears profiling counters.
-//!
+/** If written to SPR_SIM_CONTROL, clears profiling counters. */
 #define SIM_CONTROL_PROFILER_CLEAR 2
 
-//! If written to SPR_SIM_CONTROL, checkpoints the simulator.
-//!
+/** If written to SPR_SIM_CONTROL, checkpoints the simulator. */
 #define SIM_CONTROL_CHECKPOINT 3
 
-//! If written to SPR_SIM_CONTROL, combined with a mask (shifted by 8),
-//! sets the tracing mask to the given mask. See "sim_set_tracing()".
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a mask (shifted by 8),
+ * sets the tracing mask to the given mask. See "sim_set_tracing()".
+ */
 #define SIM_CONTROL_SET_TRACING 4
 
-//! If written to SPR_SIM_CONTROL, combined with a mask (shifted by 8),
-//! dumps the requested items of machine state to the log.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a mask (shifted by 8),
+ * dumps the requested items of machine state to the log.
+ */
 #define SIM_CONTROL_DUMP 5
 
-//! If written to SPR_SIM_CONTROL, clears chip-level profiling counters.
-//!
+/** If written to SPR_SIM_CONTROL, clears chip-level profiling counters. */
 #define SIM_CONTROL_PROFILER_CHIP_CLEAR 6
 
-//! If written to SPR_SIM_CONTROL, disables chip-level profiling.
-//!
+/** If written to SPR_SIM_CONTROL, disables chip-level profiling. */
 #define SIM_CONTROL_PROFILER_CHIP_DISABLE 7
 
-//! If written to SPR_SIM_CONTROL, enables chip-level profiling.
-//!
+/** If written to SPR_SIM_CONTROL, enables chip-level profiling. */
 #define SIM_CONTROL_PROFILER_CHIP_ENABLE 8
 
-//! If written to SPR_SIM_CONTROL, enables chip-level functional mode
-//!
+/** If written to SPR_SIM_CONTROL, enables chip-level functional mode */
 #define SIM_CONTROL_ENABLE_FUNCTIONAL 9
 
-//! If written to SPR_SIM_CONTROL, disables chip-level functional mode.
-//!
+/** If written to SPR_SIM_CONTROL, disables chip-level functional mode. */
 #define SIM_CONTROL_DISABLE_FUNCTIONAL 10
 
-//! If written to SPR_SIM_CONTROL, enables chip-level functional mode.
-//! All tiles must perform this write for functional mode to be enabled.
-//! Ignored in naked boot mode unless --functional is specified.
-//! WARNING: Only the hypervisor startup code should use this!
-//!
+/**
+ * If written to SPR_SIM_CONTROL, enables chip-level functional mode.
+ * All tiles must perform this write for functional mode to be enabled.
+ * Ignored in naked boot mode unless --functional is specified.
+ * WARNING: Only the hypervisor startup code should use this!
+ */
 #define SIM_CONTROL_ENABLE_FUNCTIONAL_BARRIER 11
 
-//! If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
-//! writes a string directly to the simulator output.  Written to once for
-//! each character in the string, plus a final NUL.  Instead of NUL,
-//! you can also use "SIM_PUTC_FLUSH_STRING" or "SIM_PUTC_FLUSH_BINARY".
-//!
-// ISSUE: Document the meaning of "newline", and the handling of NUL.
-//
+/**
+ * If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
+ * writes a string directly to the simulator output.  Written to once for
+ * each character in the string, plus a final NUL.  Instead of NUL,
+ * you can also use "SIM_PUTC_FLUSH_STRING" or "SIM_PUTC_FLUSH_BINARY".
+ */
+/* ISSUE: Document the meaning of "newline", and the handling of NUL. */
 #define SIM_CONTROL_PUTC 12
 
-//! If written to SPR_SIM_CONTROL, clears the --grind-coherence state for
-//! this core.  This is intended to be used before a loop that will
-//! invalidate the cache by loading new data and evicting all current data.
-//! Generally speaking, this API should only be used by system code.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, clears the --grind-coherence state for
+ * this core.  This is intended to be used before a loop that will
+ * invalidate the cache by loading new data and evicting all current data.
+ * Generally speaking, this API should only be used by system code.
+ */
 #define SIM_CONTROL_GRINDER_CLEAR 13
 
-//! If written to SPR_SIM_CONTROL, shuts down the simulator.
-//!
+/** If written to SPR_SIM_CONTROL, shuts down the simulator. */
 #define SIM_CONTROL_SHUTDOWN 14
 
-//! If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
-//! indicates that a fork syscall just created the given process.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
+ * indicates that a fork syscall just created the given process.
+ */
 #define SIM_CONTROL_OS_FORK 15
 
-//! If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
-//! indicates that an exit syscall was just executed by the given process.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
+ * indicates that an exit syscall was just executed by the given process.
+ */
 #define SIM_CONTROL_OS_EXIT 16
 
-//! If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
-//! indicates that the OS just switched to the given process.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
+ * indicates that the OS just switched to the given process.
+ */
 #define SIM_CONTROL_OS_SWITCH 17
 
-//! If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
-//! indicates that an exec syscall was just executed. Written to once for
-//! each character in the executable name, plus a final NUL.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
+ * indicates that an exec syscall was just executed. Written to once for
+ * each character in the executable name, plus a final NUL.
+ */
 #define SIM_CONTROL_OS_EXEC 18
 
-//! If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
-//! indicates that an interpreter (PT_INTERP) was loaded.  Written to once
-//! for each character in "ADDR:PATH", plus a final NUL, where "ADDR" is a
-//! hex load address starting with "0x", and "PATH" is the executable name.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
+ * indicates that an interpreter (PT_INTERP) was loaded.  Written to once
+ * for each character in "ADDR:PATH", plus a final NUL, where "ADDR" is a
+ * hex load address starting with "0x", and "PATH" is the executable name.
+ */
 #define SIM_CONTROL_OS_INTERP 19
 
-//! If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
-//! indicates that a dll was loaded.  Written to once for each character
-//! in "ADDR:PATH", plus a final NUL, where "ADDR" is a hexadecimal load
-//! address starting with "0x", and "PATH" is the executable name.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
+ * indicates that a dll was loaded.  Written to once for each character
+ * in "ADDR:PATH", plus a final NUL, where "ADDR" is a hexadecimal load
+ * address starting with "0x", and "PATH" is the executable name.
+ */
 #define SIM_CONTROL_DLOPEN 20
 
-//! If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
-//! indicates that a dll was unloaded.  Written to once for each character
-//! in "ADDR", plus a final NUL, where "ADDR" is a hexadecimal load
-//! address starting with "0x".
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a character (shifted by 8),
+ * indicates that a dll was unloaded.  Written to once for each character
+ * in "ADDR", plus a final NUL, where "ADDR" is a hexadecimal load
+ * address starting with "0x".
+ */
 #define SIM_CONTROL_DLCLOSE 21
 
-//! If written to SPR_SIM_CONTROL, combined with a flag (shifted by 8),
-//! indicates whether to allow data reads to remotely-cached
-//! dirty cache lines to be cached locally without grinder warnings or
-//! assertions (used by Linux kernel fast memcpy).
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a flag (shifted by 8),
+ * indicates whether to allow data reads to remotely-cached
+ * dirty cache lines to be cached locally without grinder warnings or
+ * assertions (used by Linux kernel fast memcpy).
+ */
 #define SIM_CONTROL_ALLOW_MULTIPLE_CACHING 22
 
-//! If written to SPR_SIM_CONTROL, enables memory tracing.
-//!
+/** If written to SPR_SIM_CONTROL, enables memory tracing. */
 #define SIM_CONTROL_ENABLE_MEM_LOGGING 23
 
-//! If written to SPR_SIM_CONTROL, disables memory tracing.
-//!
+/** If written to SPR_SIM_CONTROL, disables memory tracing. */
 #define SIM_CONTROL_DISABLE_MEM_LOGGING 24
 
-//! If written to SPR_SIM_CONTROL, changes the shaping parameters of one of
-//! the gbe or xgbe shims. Must specify the shim id, the type, the units, and
-//! the rate, as defined in SIM_SHAPING_SPR_ARG.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, changes the shaping parameters of one of
+ * the gbe or xgbe shims. Must specify the shim id, the type, the units, and
+ * the rate, as defined in SIM_SHAPING_SPR_ARG.
+ */
 #define SIM_CONTROL_SHAPING 25
 
-//! If written to SPR_SIM_CONTROL, combined with character (shifted by 8),
-//! requests that a simulator command be executed.  Written to once for each
-//! character in the command, plus a final NUL.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with character (shifted by 8),
+ * requests that a simulator command be executed.  Written to once for each
+ * character in the command, plus a final NUL.
+ */
 #define SIM_CONTROL_COMMAND 26
 
-//! If written to SPR_SIM_CONTROL, indicates that the simulated system
-//! is panicking, to allow debugging via --debug-on-panic.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, indicates that the simulated system
+ * is panicking, to allow debugging via --debug-on-panic.
+ */
 #define SIM_CONTROL_PANIC 27
 
-//! If written to SPR_SIM_CONTROL, triggers a simulator syscall.
-//! See "sim_syscall()" for more info.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, triggers a simulator syscall.
+ * See "sim_syscall()" for more info.
+ */
 #define SIM_CONTROL_SYSCALL 32
 
-//! If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
-//! provides the pid that subsequent SIM_CONTROL_OS_FORK writes should
-//! use as the pid, rather than the default previous SIM_CONTROL_OS_SWITCH.
-//!
+/**
+ * If written to SPR_SIM_CONTROL, combined with a pid (shifted by 8),
+ * provides the pid that subsequent SIM_CONTROL_OS_FORK writes should
+ * use as the pid, rather than the default previous SIM_CONTROL_OS_SWITCH.
+ */
 #define SIM_CONTROL_OS_FORK_PARENT 33
 
-//! If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
-//! (shifted by 8), clears the pending magic data section.  The cleared
-//! pending magic data section and any subsequently appended magic bytes
-//! will only take effect when the classifier blast programmer is run.
+/**
+ * If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
+ * (shifted by 8), clears the pending magic data section.  The cleared
+ * pending magic data section and any subsequently appended magic bytes
+ * will only take effect when the classifier blast programmer is run.
+ */
 #define SIM_CONTROL_CLEAR_MPIPE_MAGIC_BYTES 34
 
-//! If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
-//! (shifted by 8) and a byte of data (shifted by 16), appends that byte
-//! to the shim's pending magic data section.  The pending magic data
-//! section takes effect when the classifier blast programmer is run.
+/**
+ * If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
+ * (shifted by 8) and a byte of data (shifted by 16), appends that byte
+ * to the shim's pending magic data section.  The pending magic data
+ * section takes effect when the classifier blast programmer is run.
+ */
 #define SIM_CONTROL_APPEND_MPIPE_MAGIC_BYTE 35
 
-//! If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
-//! (shifted by 8), an enable=1/disable=0 bit (shifted by 16), and a
-//! mask of links (shifted by 32), enable or disable the corresponding
-//! mPIPE links.
+/**
+ * If written to SPR_SIM_CONTROL, combined with a mPIPE shim number
+ * (shifted by 8), an enable=1/disable=0 bit (shifted by 16), and a
+ * mask of links (shifted by 32), enable or disable the corresponding
+ * mPIPE links.
+ */
 #define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
 
-//== Syscall numbers for use with "sim_syscall()".
 
-//! Syscall number for sim_add_watchpoint().
-//!
+/*
+ * Syscall numbers for use with "sim_syscall()".
+ */
+
+/** Syscall number for sim_add_watchpoint(). */
 #define SIM_SYSCALL_ADD_WATCHPOINT 2
 
-//! Syscall number for sim_remove_watchpoint().
-//!
+/** Syscall number for sim_remove_watchpoint(). */
 #define SIM_SYSCALL_REMOVE_WATCHPOINT 3
 
-//! Syscall number for sim_query_watchpoint().
-//!
+/** Syscall number for sim_query_watchpoint(). */
 #define SIM_SYSCALL_QUERY_WATCHPOINT 4
 
-//! Syscall number that asserts that the cache lines whose 64-bit PA
-//! is passed as the second argument to sim_syscall(), and over a
-//! range passed as the third argument, are no longer in cache.
-//! The simulator raises an error if this is not the case.
-//!
+/**
+ * Syscall number that asserts that the cache lines whose 64-bit PA
+ * is passed as the second argument to sim_syscall(), and over a
+ * range passed as the third argument, are no longer in cache.
+ * The simulator raises an error if this is not the case.
+ */
 #define SIM_SYSCALL_VALIDATE_LINES_EVICTED 5
 
 
-//== Bit masks which can be shifted by 8, combined with
-//== SIM_CONTROL_SET_TRACING, and written to SPR_SIM_CONTROL.
+/*
+ * Bit masks which can be shifted by 8, combined with
+ * SIM_CONTROL_SET_TRACING, and written to SPR_SIM_CONTROL.
+ */
 
-//! @addtogroup arch_sim
-//! @{
+/**
+ * @addtogroup arch_sim
+ * @{
+ */
 
-//! Enable --trace-cycle when passed to simulator_set_tracing().
-//!
+/** Enable --trace-cycle when passed to simulator_set_tracing(). */
 #define SIM_TRACE_CYCLES          0x01
 
-//! Enable --trace-router when passed to simulator_set_tracing().
-//!
+/** Enable --trace-router when passed to simulator_set_tracing(). */
 #define SIM_TRACE_ROUTER          0x02
 
-//! Enable --trace-register-writes when passed to simulator_set_tracing().
-//!
+/** Enable --trace-register-writes when passed to simulator_set_tracing(). */
 #define SIM_TRACE_REGISTER_WRITES 0x04
 
-//! Enable --trace-disasm when passed to simulator_set_tracing().
-//!
+/** Enable --trace-disasm when passed to simulator_set_tracing(). */
 #define SIM_TRACE_DISASM          0x08
 
-//! Enable --trace-stall-info when passed to simulator_set_tracing().
-//!
+/** Enable --trace-stall-info when passed to simulator_set_tracing(). */
 #define SIM_TRACE_STALL_INFO      0x10
 
-//! Enable --trace-memory-controller when passed to simulator_set_tracing().
-//!
+/** Enable --trace-memory-controller when passed to simulator_set_tracing(). */
 #define SIM_TRACE_MEMORY_CONTROLLER 0x20
 
-//! Enable --trace-l2 when passed to simulator_set_tracing().
-//!
+/** Enable --trace-l2 when passed to simulator_set_tracing(). */
 #define SIM_TRACE_L2_CACHE 0x40
 
-//! Enable --trace-lines when passed to simulator_set_tracing().
-//!
+/** Enable --trace-lines when passed to simulator_set_tracing(). */
 #define SIM_TRACE_LINES 0x80
 
-//! Turn off all tracing when passed to simulator_set_tracing().
-//!
+/** Turn off all tracing when passed to simulator_set_tracing(). */
 #define SIM_TRACE_NONE 0
 
-//! Turn on all tracing when passed to simulator_set_tracing().
-//!
+/** Turn on all tracing when passed to simulator_set_tracing(). */
 #define SIM_TRACE_ALL (-1)
 
-//! @}
+/** @} */
 
-//! Computes the value to write to SPR_SIM_CONTROL to set tracing flags.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to set tracing flags. */
 #define SIM_TRACE_SPR_ARG(mask) \
   (SIM_CONTROL_SET_TRACING | ((mask) << _SIM_CONTROL_OPERATOR_BITS))
 
 
-//== Bit masks which can be shifted by 8, combined with
-//== SIM_CONTROL_DUMP, and written to SPR_SIM_CONTROL.
+/*
+ * Bit masks which can be shifted by 8, combined with
+ * SIM_CONTROL_DUMP, and written to SPR_SIM_CONTROL.
+ */
 
-//! @addtogroup arch_sim
-//! @{
+/**
+ * @addtogroup arch_sim
+ * @{
+ */
 
-//! Dump the general-purpose registers.
-//!
+/** Dump the general-purpose registers. */
 #define SIM_DUMP_REGS          0x001
 
-//! Dump the SPRs.
-//!
+/** Dump the SPRs. */
 #define SIM_DUMP_SPRS          0x002
 
-//! Dump the ITLB.
-//!
+/** Dump the ITLB. */
 #define SIM_DUMP_ITLB          0x004
 
-//! Dump the DTLB.
-//!
+/** Dump the DTLB. */
 #define SIM_DUMP_DTLB          0x008
 
-//! Dump the L1 I-cache.
-//!
+/** Dump the L1 I-cache. */
 #define SIM_DUMP_L1I           0x010
 
-//! Dump the L1 D-cache.
-//!
+/** Dump the L1 D-cache. */
 #define SIM_DUMP_L1D           0x020
 
-//! Dump the L2 cache.
-//!
+/** Dump the L2 cache. */
 #define SIM_DUMP_L2            0x040
 
-//! Dump the switch registers.
-//!
+/** Dump the switch registers. */
 #define SIM_DUMP_SNREGS        0x080
 
-//! Dump the switch ITLB.
-//!
+/** Dump the switch ITLB. */
 #define SIM_DUMP_SNITLB        0x100
 
-//! Dump the switch L1 I-cache.
-//!
+/** Dump the switch L1 I-cache. */
 #define SIM_DUMP_SNL1I         0x200
 
-//! Dump the current backtrace.
-//!
+/** Dump the current backtrace. */
 #define SIM_DUMP_BACKTRACE     0x400
 
-//! Only dump valid lines in caches.
-//!
+/** Only dump valid lines in caches. */
 #define SIM_DUMP_VALID_LINES   0x800
 
-//! Dump everything that is dumpable.
-//!
+/** Dump everything that is dumpable. */
 #define SIM_DUMP_ALL (-1 & ~SIM_DUMP_VALID_LINES)
 
-// @}
+/** @} */
 
-//! Computes the value to write to SPR_SIM_CONTROL to dump machine state.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to dump machine state. */
 #define SIM_DUMP_SPR_ARG(mask) \
   (SIM_CONTROL_DUMP | ((mask) << _SIM_CONTROL_OPERATOR_BITS))
 
 
-//== Bit masks which can be shifted by 8, combined with
-//== SIM_CONTROL_PROFILER_CHIP_xxx, and written to SPR_SIM_CONTROL.
+/*
+ * Bit masks which can be shifted by 8, combined with
+ * SIM_CONTROL_PROFILER_CHIP_xxx, and written to SPR_SIM_CONTROL.
+ */
 
-//! @addtogroup arch_sim
-//! @{
+/**
+ * @addtogroup arch_sim
+ * @{
+ */
 
-//! Use with with SIM_PROFILER_CHIP_xxx to control the memory controllers.
-//!
+/** Use with with SIM_PROFILER_CHIP_xxx to control the memory controllers. */
 #define SIM_CHIP_MEMCTL        0x001
 
-//! Use with with SIM_PROFILER_CHIP_xxx to control the XAUI interface.
-//!
+/** Use with with SIM_PROFILER_CHIP_xxx to control the XAUI interface. */
 #define SIM_CHIP_XAUI          0x002
 
-//! Use with with SIM_PROFILER_CHIP_xxx to control the PCIe interface.
-//!
+/** Use with with SIM_PROFILER_CHIP_xxx to control the PCIe interface. */
 #define SIM_CHIP_PCIE          0x004
 
-//! Use with with SIM_PROFILER_CHIP_xxx to control the MPIPE interface.
-//!
+/** Use with with SIM_PROFILER_CHIP_xxx to control the MPIPE interface. */
 #define SIM_CHIP_MPIPE         0x008
 
-//! Reference all chip devices.
-//!
+/** Use with with SIM_PROFILER_CHIP_xxx to control the TRIO interface. */
+#define SIM_CHIP_TRIO          0x010
+
+/** Reference all chip devices. */
 #define SIM_CHIP_ALL (-1)
 
-//! @}
+/** @} */
 
-//! Computes the value to write to SPR_SIM_CONTROL to clear chip statistics.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to clear chip statistics. */
 #define SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask) \
   (SIM_CONTROL_PROFILER_CHIP_CLEAR | ((mask) << _SIM_CONTROL_OPERATOR_BITS))
 
-//! Computes the value to write to SPR_SIM_CONTROL to disable chip statistics.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to disable chip statistics.*/
 #define SIM_PROFILER_CHIP_DISABLE_SPR_ARG(mask) \
   (SIM_CONTROL_PROFILER_CHIP_DISABLE | ((mask) << _SIM_CONTROL_OPERATOR_BITS))
 
-//! Computes the value to write to SPR_SIM_CONTROL to enable chip statistics.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to enable chip statistics. */
 #define SIM_PROFILER_CHIP_ENABLE_SPR_ARG(mask) \
   (SIM_CONTROL_PROFILER_CHIP_ENABLE | ((mask) << _SIM_CONTROL_OPERATOR_BITS))
 
 
 
-// Shim bitrate controls.
+/* Shim bitrate controls. */
 
-//! The number of bits used to store the shim id.
-//!
+/** The number of bits used to store the shim id. */
 #define SIM_CONTROL_SHAPING_SHIM_ID_BITS 3
 
-//! @addtogroup arch_sim
-//! @{
+/**
+ * @addtogroup arch_sim
+ * @{
+ */
 
-//! Change the gbe 0 bitrate.
-//!
+/** Change the gbe 0 bitrate. */
 #define SIM_CONTROL_SHAPING_GBE_0 0x0
 
-//! Change the gbe 1 bitrate.
-//!
+/** Change the gbe 1 bitrate. */
 #define SIM_CONTROL_SHAPING_GBE_1 0x1
 
-//! Change the gbe 2 bitrate.
-//!
+/** Change the gbe 2 bitrate. */
 #define SIM_CONTROL_SHAPING_GBE_2 0x2
 
-//! Change the gbe 3 bitrate.
-//!
+/** Change the gbe 3 bitrate. */
 #define SIM_CONTROL_SHAPING_GBE_3 0x3
 
-//! Change the xgbe 0 bitrate.
-//!
+/** Change the xgbe 0 bitrate. */
 #define SIM_CONTROL_SHAPING_XGBE_0 0x4
 
-//! Change the xgbe 1 bitrate.
-//!
+/** Change the xgbe 1 bitrate. */
 #define SIM_CONTROL_SHAPING_XGBE_1 0x5
 
-//! The type of shaping to do.
-//!
+/** The type of shaping to do. */
 #define SIM_CONTROL_SHAPING_TYPE_BITS 2
 
-//! Control the multiplier.
-//!
+/** Control the multiplier. */
 #define SIM_CONTROL_SHAPING_MULTIPLIER 0
 
-//! Control the PPS.
-//!
+/** Control the PPS. */
 #define SIM_CONTROL_SHAPING_PPS 1
 
-//! Control the BPS.
-//!
+/** Control the BPS. */
 #define SIM_CONTROL_SHAPING_BPS 2
 
-//! The number of bits for the units for the shaping parameter.
-//!
+/** The number of bits for the units for the shaping parameter. */
 #define SIM_CONTROL_SHAPING_UNITS_BITS 2
 
-//! Provide a number in single units.
-//!
+/** Provide a number in single units. */
 #define SIM_CONTROL_SHAPING_UNITS_SINGLE 0
 
-//! Provide a number in kilo units.
-//!
+/** Provide a number in kilo units. */
 #define SIM_CONTROL_SHAPING_UNITS_KILO 1
 
-//! Provide a number in mega units.
-//!
+/** Provide a number in mega units. */
 #define SIM_CONTROL_SHAPING_UNITS_MEGA 2
 
-//! Provide a number in giga units.
-//!
+/** Provide a number in giga units. */
 #define SIM_CONTROL_SHAPING_UNITS_GIGA 3
 
-// @}
+/** @} */
 
-//! How many bits are available for the rate.
-//!
+/** How many bits are available for the rate. */
 #define SIM_CONTROL_SHAPING_RATE_BITS \
   (32 - (_SIM_CONTROL_OPERATOR_BITS + \
          SIM_CONTROL_SHAPING_SHIM_ID_BITS + \
          SIM_CONTROL_SHAPING_TYPE_BITS + \
          SIM_CONTROL_SHAPING_UNITS_BITS))
 
-//! Computes the value to write to SPR_SIM_CONTROL to change a bitrate.
-//!
+/** Computes the value to write to SPR_SIM_CONTROL to change a bitrate. */
 #define SIM_SHAPING_SPR_ARG(shim, type, units, rate) \
   (SIM_CONTROL_SHAPING | \
    ((shim) | \
@@ -483,30 +467,36 @@
                SIM_CONTROL_SHAPING_UNITS_BITS))) << _SIM_CONTROL_OPERATOR_BITS)
 
 
-//== Values returned when reading SPR_SIM_CONTROL.
-// ISSUE: These names should share a longer common prefix.
+/*
+ * Values returned when reading SPR_SIM_CONTROL.
+ * ISSUE: These names should share a longer common prefix.
+ */
 
-//! When reading SPR_SIM_CONTROL, the mask of simulator tracing bits
-//! (SIM_TRACE_xxx values).
-//!
+/**
+ * When reading SPR_SIM_CONTROL, the mask of simulator tracing bits
+ * (SIM_TRACE_xxx values).
+ */
 #define SIM_TRACE_FLAG_MASK 0xFFFF
 
-//! When reading SPR_SIM_CONTROL, the mask for whether profiling is enabled.
-//!
+/** When reading SPR_SIM_CONTROL, the mask for whether profiling is enabled. */
 #define SIM_PROFILER_ENABLED_MASK 0x10000
 
 
-//== Special arguments for "SIM_CONTROL_PUTC".
+/*
+ * Special arguments for "SIM_CONTROL_PUTC".
+ */
 
-//! Flag value for forcing a PUTC string-flush, including
-//! coordinate/cycle prefix and newline.
-//!
+/**
+ * Flag value for forcing a PUTC string-flush, including
+ * coordinate/cycle prefix and newline.
+ */
 #define SIM_PUTC_FLUSH_STRING 0x100
 
-//! Flag value for forcing a PUTC binary-data-flush, which skips the
-//! prefix and does not append a newline.
-//!
+/**
+ * Flag value for forcing a PUTC binary-data-flush, which skips the
+ * prefix and does not append a newline.
+ */
 #define SIM_PUTC_FLUSH_BINARY 0x101
 
 
-#endif //__ARCH_SIM_DEF_H__
+#endif /* __ARCH_SIM_DEF_H__ */
diff --git a/arch/tile/include/arch/spr_def.h b/arch/tile/include/arch/spr_def.h
index c8fdbd9..442fcba0 100644
--- a/arch/tile/include/arch/spr_def.h
+++ b/arch/tile/include/arch/spr_def.h
@@ -12,8 +12,93 @@
  *   more details.
  */
 
+/*
+ * In addition to including the proper base SPR definition file, depending
+ * on machine architecture, this file defines several macros which allow
+ * kernel code to use protection-level dependent SPRs without worrying
+ * about which PL it's running at.  In these macros, the PL that the SPR
+ * or interrupt number applies to is replaced by K.
+ */
+
+#if CONFIG_KERNEL_PL != 1 && CONFIG_KERNEL_PL != 2
+#error CONFIG_KERNEL_PL must be 1 or 2
+#endif
+
+/* Concatenate 4 strings. */
+#define __concat4(a, b, c, d) a ## b ## c ## d
+#define _concat4(a, b, c, d)  __concat4(a, b, c, d)
+
 #ifdef __tilegx__
 #include <arch/spr_def_64.h>
+
+/* TILE-Gx dependent, protection-level dependent SPRs. */
+
+#define SPR_INTERRUPT_MASK_K \
+	_concat4(SPR_INTERRUPT_MASK_, CONFIG_KERNEL_PL,,)
+#define SPR_INTERRUPT_MASK_SET_K \
+	_concat4(SPR_INTERRUPT_MASK_SET_, CONFIG_KERNEL_PL,,)
+#define SPR_INTERRUPT_MASK_RESET_K \
+	_concat4(SPR_INTERRUPT_MASK_RESET_, CONFIG_KERNEL_PL,,)
+#define SPR_INTERRUPT_VECTOR_BASE_K \
+	_concat4(SPR_INTERRUPT_VECTOR_BASE_, CONFIG_KERNEL_PL,,)
+
+#define SPR_IPI_MASK_K \
+	_concat4(SPR_IPI_MASK_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_MASK_RESET_K \
+	_concat4(SPR_IPI_MASK_RESET_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_MASK_SET_K \
+	_concat4(SPR_IPI_MASK_SET_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_EVENT_K \
+	_concat4(SPR_IPI_EVENT_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_EVENT_RESET_K \
+	_concat4(SPR_IPI_EVENT_RESET_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_MASK_SET_K \
+	_concat4(SPR_IPI_MASK_SET_, CONFIG_KERNEL_PL,,)
+#define INT_IPI_K \
+	_concat4(INT_IPI_, CONFIG_KERNEL_PL,,)
+
+#define SPR_SINGLE_STEP_CONTROL_K \
+	_concat4(SPR_SINGLE_STEP_CONTROL_, CONFIG_KERNEL_PL,,)
+#define SPR_SINGLE_STEP_EN_K_K \
+	_concat4(SPR_SINGLE_STEP_EN_, CONFIG_KERNEL_PL, _, CONFIG_KERNEL_PL)
+#define INT_SINGLE_STEP_K \
+	_concat4(INT_SINGLE_STEP_, CONFIG_KERNEL_PL,,)
+
 #else
 #include <arch/spr_def_32.h>
+
+/* TILEPro dependent, protection-level dependent SPRs. */
+
+#define SPR_INTERRUPT_MASK_K_0 \
+	_concat4(SPR_INTERRUPT_MASK_, CONFIG_KERNEL_PL, _0,)
+#define SPR_INTERRUPT_MASK_K_1 \
+	_concat4(SPR_INTERRUPT_MASK_, CONFIG_KERNEL_PL, _1,)
+#define SPR_INTERRUPT_MASK_SET_K_0 \
+	_concat4(SPR_INTERRUPT_MASK_SET_, CONFIG_KERNEL_PL, _0,)
+#define SPR_INTERRUPT_MASK_SET_K_1 \
+	_concat4(SPR_INTERRUPT_MASK_SET_, CONFIG_KERNEL_PL, _1,)
+#define SPR_INTERRUPT_MASK_RESET_K_0 \
+	_concat4(SPR_INTERRUPT_MASK_RESET_, CONFIG_KERNEL_PL, _0,)
+#define SPR_INTERRUPT_MASK_RESET_K_1 \
+	_concat4(SPR_INTERRUPT_MASK_RESET_, CONFIG_KERNEL_PL, _1,)
+
 #endif
+
+/* Generic protection-level dependent SPRs. */
+
+#define SPR_SYSTEM_SAVE_K_0 \
+	_concat4(SPR_SYSTEM_SAVE_, CONFIG_KERNEL_PL, _0,)
+#define SPR_SYSTEM_SAVE_K_1 \
+	_concat4(SPR_SYSTEM_SAVE_, CONFIG_KERNEL_PL, _1,)
+#define SPR_SYSTEM_SAVE_K_2 \
+	_concat4(SPR_SYSTEM_SAVE_, CONFIG_KERNEL_PL, _2,)
+#define SPR_SYSTEM_SAVE_K_3 \
+	_concat4(SPR_SYSTEM_SAVE_, CONFIG_KERNEL_PL, _3,)
+#define SPR_EX_CONTEXT_K_0 \
+	_concat4(SPR_EX_CONTEXT_, CONFIG_KERNEL_PL, _0,)
+#define SPR_EX_CONTEXT_K_1 \
+	_concat4(SPR_EX_CONTEXT_, CONFIG_KERNEL_PL, _1,)
+#define SPR_INTCTRL_K_STATUS \
+	_concat4(SPR_INTCTRL_, CONFIG_KERNEL_PL, _STATUS,)
+#define INT_INTCTRL_K \
+	_concat4(INT_INTCTRL_, CONFIG_KERNEL_PL,,)
diff --git a/arch/tile/include/arch/spr_def_32.h b/arch/tile/include/arch/spr_def_32.h
index b4fc068..bbc1f4c 100644
--- a/arch/tile/include/arch/spr_def_32.h
+++ b/arch/tile/include/arch/spr_def_32.h
@@ -56,58 +56,93 @@
 #define SPR_EX_CONTEXT_1_1__ICS_SHIFT 2
 #define SPR_EX_CONTEXT_1_1__ICS_RMASK 0x1
 #define SPR_EX_CONTEXT_1_1__ICS_MASK  0x4
+#define SPR_EX_CONTEXT_2_0 0x4605
+#define SPR_EX_CONTEXT_2_1 0x4606
+#define SPR_EX_CONTEXT_2_1__PL_SHIFT 0
+#define SPR_EX_CONTEXT_2_1__PL_RMASK 0x3
+#define SPR_EX_CONTEXT_2_1__PL_MASK  0x3
+#define SPR_EX_CONTEXT_2_1__ICS_SHIFT 2
+#define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
+#define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
 #define SPR_FAIL 0x4e09
 #define SPR_INTCTRL_0_STATUS 0x4a07
 #define SPR_INTCTRL_1_STATUS 0x4807
+#define SPR_INTCTRL_2_STATUS 0x4607
 #define SPR_INTERRUPT_CRITICAL_SECTION 0x4e0a
 #define SPR_INTERRUPT_MASK_0_0 0x4a08
 #define SPR_INTERRUPT_MASK_0_1 0x4a09
 #define SPR_INTERRUPT_MASK_1_0 0x4809
 #define SPR_INTERRUPT_MASK_1_1 0x480a
+#define SPR_INTERRUPT_MASK_2_0 0x4608
+#define SPR_INTERRUPT_MASK_2_1 0x4609
 #define SPR_INTERRUPT_MASK_RESET_0_0 0x4a0a
 #define SPR_INTERRUPT_MASK_RESET_0_1 0x4a0b
 #define SPR_INTERRUPT_MASK_RESET_1_0 0x480b
 #define SPR_INTERRUPT_MASK_RESET_1_1 0x480c
+#define SPR_INTERRUPT_MASK_RESET_2_0 0x460a
+#define SPR_INTERRUPT_MASK_RESET_2_1 0x460b
 #define SPR_INTERRUPT_MASK_SET_0_0 0x4a0c
 #define SPR_INTERRUPT_MASK_SET_0_1 0x4a0d
 #define SPR_INTERRUPT_MASK_SET_1_0 0x480d
 #define SPR_INTERRUPT_MASK_SET_1_1 0x480e
+#define SPR_INTERRUPT_MASK_SET_2_0 0x460c
+#define SPR_INTERRUPT_MASK_SET_2_1 0x460d
 #define SPR_MPL_DMA_CPL_SET_0 0x5800
 #define SPR_MPL_DMA_CPL_SET_1 0x5801
+#define SPR_MPL_DMA_CPL_SET_2 0x5802
 #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
 #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
+#define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
 #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
 #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
+#define SPR_MPL_INTCTRL_0_SET_2 0x4a02
 #define SPR_MPL_INTCTRL_1_SET_0 0x4800
 #define SPR_MPL_INTCTRL_1_SET_1 0x4801
+#define SPR_MPL_INTCTRL_1_SET_2 0x4802
+#define SPR_MPL_INTCTRL_2_SET_0 0x4600
+#define SPR_MPL_INTCTRL_2_SET_1 0x4601
+#define SPR_MPL_INTCTRL_2_SET_2 0x4602
 #define SPR_MPL_SN_ACCESS_SET_0 0x0800
 #define SPR_MPL_SN_ACCESS_SET_1 0x0801
+#define SPR_MPL_SN_ACCESS_SET_2 0x0802
 #define SPR_MPL_SN_CPL_SET_0 0x5a00
 #define SPR_MPL_SN_CPL_SET_1 0x5a01
+#define SPR_MPL_SN_CPL_SET_2 0x5a02
 #define SPR_MPL_SN_FIREWALL_SET_0 0x2c00
 #define SPR_MPL_SN_FIREWALL_SET_1 0x2c01
+#define SPR_MPL_SN_FIREWALL_SET_2 0x2c02
 #define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
 #define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
+#define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
 #define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
 #define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
+#define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
 #define SPR_MPL_UDN_AVAIL_SET_0 0x4000
 #define SPR_MPL_UDN_AVAIL_SET_1 0x4001
+#define SPR_MPL_UDN_AVAIL_SET_2 0x4002
 #define SPR_MPL_UDN_CA_SET_0 0x3c00
 #define SPR_MPL_UDN_CA_SET_1 0x3c01
+#define SPR_MPL_UDN_CA_SET_2 0x3c02
 #define SPR_MPL_UDN_COMPLETE_SET_0 0x1400
 #define SPR_MPL_UDN_COMPLETE_SET_1 0x1401
+#define SPR_MPL_UDN_COMPLETE_SET_2 0x1402
 #define SPR_MPL_UDN_FIREWALL_SET_0 0x3000
 #define SPR_MPL_UDN_FIREWALL_SET_1 0x3001
+#define SPR_MPL_UDN_FIREWALL_SET_2 0x3002
 #define SPR_MPL_UDN_REFILL_SET_0 0x1000
 #define SPR_MPL_UDN_REFILL_SET_1 0x1001
+#define SPR_MPL_UDN_REFILL_SET_2 0x1002
 #define SPR_MPL_UDN_TIMER_SET_0 0x3600
 #define SPR_MPL_UDN_TIMER_SET_1 0x3601
+#define SPR_MPL_UDN_TIMER_SET_2 0x3602
 #define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
 #define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
+#define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
 #define SPR_PASS 0x4e0b
 #define SPR_PERF_COUNT_0 0x4205
 #define SPR_PERF_COUNT_1 0x4206
 #define SPR_PERF_COUNT_CTL 0x4207
+#define SPR_PERF_COUNT_DN_CTL 0x4210
 #define SPR_PERF_COUNT_STS 0x4208
 #define SPR_PROC_STATUS 0x4f00
 #define SPR_SIM_CONTROL 0x4e0c
@@ -124,6 +159,10 @@
 #define SPR_SYSTEM_SAVE_1_1 0x4901
 #define SPR_SYSTEM_SAVE_1_2 0x4902
 #define SPR_SYSTEM_SAVE_1_3 0x4903
+#define SPR_SYSTEM_SAVE_2_0 0x4700
+#define SPR_SYSTEM_SAVE_2_1 0x4701
+#define SPR_SYSTEM_SAVE_2_2 0x4702
+#define SPR_SYSTEM_SAVE_2_3 0x4703
 #define SPR_TILE_COORD 0x4c17
 #define SPR_TILE_RTF_HWM 0x4e10
 #define SPR_TILE_TIMER_CONTROL 0x3205
diff --git a/arch/tile/include/asm/backtrace.h b/arch/tile/include/asm/backtrace.h
index 758ca46..f18887d 100644
--- a/arch/tile/include/asm/backtrace.h
+++ b/arch/tile/include/asm/backtrace.h
@@ -146,7 +146,10 @@ enum {
 
 	CALLER_SP_IN_R52_BASE = 4,
 
-	CALLER_SP_OFFSET_BASE = 8
+	CALLER_SP_OFFSET_BASE = 8,
+
+	/* Marks the entry point of certain functions. */
+	ENTRY_POINT_INFO_OP = 16
 };
 
 
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 6832b4b..6d4f0ff 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -120,6 +120,7 @@ static inline unsigned long __arch_hweight64(__u64 w)
 
 #include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/ext2-non-atomic.h>
 #include <asm-generic/bitops/minix.h>
diff --git a/arch/tile/include/asm/cacheflush.h b/arch/tile/include/asm/cacheflush.h
index c5741da4..14a3f85 100644
--- a/arch/tile/include/asm/cacheflush.h
+++ b/arch/tile/include/asm/cacheflush.h
@@ -137,4 +137,56 @@ static inline void finv_buffer(void *buffer, size_t size)
 	mb_incoherent();
 }
 
+/*
+ * Flush & invalidate a VA range that is homed remotely on a single core,
+ * waiting until the memory controller holds the flushed values.
+ */
+static inline void finv_buffer_remote(void *buffer, size_t size)
+{
+	char *p;
+	int i;
+
+	/*
+	 * Flush and invalidate the buffer out of the local L1/L2
+	 * and request the home cache to flush and invalidate as well.
+	 */
+	__finv_buffer(buffer, size);
+
+	/*
+	 * Wait for the home cache to acknowledge that it has processed
+	 * all the flush-and-invalidate requests.  This does not mean
+	 * that the flushed data has reached the memory controller yet,
+	 * but it does mean the home cache is processing the flushes.
+	 */
+	__insn_mf();
+
+	/*
+	 * Issue a load to the last cache line, which can't complete
+	 * until all the previously-issued flushes to the same memory
+	 * controller have also completed.  If we weren't striping
+	 * memory, that one load would be sufficient, but since we may
+	 * be, we also need to back up to the last load issued to
+	 * another memory controller, which would be the point where
+	 * we crossed an 8KB boundary (the granularity of striping
+	 * across memory controllers).  Keep backing up and doing this
+	 * until we are before the beginning of the buffer, or have
+	 * hit all the controllers.
+	 */
+	for (i = 0, p = (char *)buffer + size - 1;
+	     i < (1 << CHIP_LOG_NUM_MSHIMS()) && p >= (char *)buffer;
+	     ++i) {
+		const unsigned long STRIPE_WIDTH = 8192;
+
+		/* Force a load instruction to issue. */
+		*(volatile char *)p;
+
+		/* Jump to end of previous stripe. */
+		p -= STRIPE_WIDTH;
+		p = (char *)((unsigned long)p | (STRIPE_WIDTH - 1));
+	}
+
+	/* Wait for the loads (and thus flushes) to have completed. */
+	__insn_mf();
+}
+
 #endif /* _ASM_TILE_CACHEFLUSH_H */
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 8b60ec8..c3ae570 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -216,15 +216,16 @@ struct compat_siginfo;
 struct compat_sigaltstack;
 long compat_sys_execve(const char __user *path,
 		       const compat_uptr_t __user *argv,
-		       const compat_uptr_t __user *envp);
+		       const compat_uptr_t __user *envp, struct pt_regs *);
 long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
 			     struct compat_sigaction __user *oact,
 			     size_t sigsetsize);
 long compat_sys_rt_sigqueueinfo(int pid, int sig,
 				struct compat_siginfo __user *uinfo);
-long compat_sys_rt_sigreturn(void);
+long compat_sys_rt_sigreturn(struct pt_regs *);
 long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr);
+			    struct compat_sigaltstack __user *uoss_ptr,
+			    struct pt_regs *);
 long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high);
 long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high);
 long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count,
@@ -255,4 +256,12 @@ long tile_compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 /* Tilera Linux syscalls that don't have "compat" versions. */
 #define compat_sys_flush_cache sys_flush_cache
 
+/* These are the intvec_64.S trampolines. */
+long _compat_sys_execve(const char __user *path,
+			const compat_uptr_t __user *argv,
+			const compat_uptr_t __user *envp);
+long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
+			    struct compat_sigaltstack __user *uoss_ptr);
+long _compat_sys_rt_sigreturn(void);
+
 #endif /* _ASM_TILE_COMPAT_H */
diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h
index d155db6..b2a6c5d 100644
--- a/arch/tile/include/asm/highmem.h
+++ b/arch/tile/include/asm/highmem.h
@@ -23,7 +23,6 @@
 
 #include <linux/interrupt.h>
 #include <linux/threads.h>
-#include <asm/kmap_types.h>
 #include <asm/tlbflush.h>
 #include <asm/homecache.h>
 
@@ -60,12 +59,12 @@ void *kmap_fix_kpte(struct page *page, int finished);
 /* This macro is used only in map_new_virtual() to map "page". */
 #define kmap_prot page_to_kpgprot(page)
 
-void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+void *__kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
-void *kmap_atomic(struct page *page, enum km_type type);
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
 void kmap_atomic_fix_kpte(struct page *page, int finished);
 
 #define flush_cache_kmaps()	do { } while (0)
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index ee43328..d3cbb9b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -55,9 +55,6 @@ extern void iounmap(volatile void __iomem *addr);
 #define ioremap_writethrough(physaddr, size)	ioremap(physaddr, size)
 #define ioremap_fullcache(physaddr, size)	ioremap(physaddr, size)
 
-void __iomem *ioport_map(unsigned long port, unsigned int len);
-extern inline void ioport_unmap(void __iomem *addr) {}
-
 #define mmiowb()
 
 /* Conversion between virtual and physical mappings.  */
@@ -189,12 +186,22 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
  * we never run, uses them unconditionally.
  */
 
-static inline int ioport_panic(void)
+static inline long ioport_panic(void)
 {
 	panic("inb/outb and friends do not exist on tile");
 	return 0;
 }
 
+static inline void __iomem *ioport_map(unsigned long port, unsigned int len)
+{
+	return (void __iomem *) ioport_panic();
+}
+
+static inline void ioport_unmap(void __iomem *addr)
+{
+	ioport_panic();
+}
+
 static inline u8 inb(unsigned long addr)
 {
 	return ioport_panic();
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index a11d483..641e4ff 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -47,53 +47,53 @@
 	int __n = (n); \
 	int __mask = 1 << (__n & 0x1f); \
 	if (__n < 32) \
-		__insn_mtspr(SPR_INTERRUPT_MASK_SET_1_0, __mask); \
+		__insn_mtspr(SPR_INTERRUPT_MASK_SET_K_0, __mask); \
 	else \
-		__insn_mtspr(SPR_INTERRUPT_MASK_SET_1_1, __mask); \
+		__insn_mtspr(SPR_INTERRUPT_MASK_SET_K_1, __mask); \
 } while (0)
 #define interrupt_mask_reset(n) do { \
 	int __n = (n); \
 	int __mask = 1 << (__n & 0x1f); \
 	if (__n < 32) \
-		__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1_0, __mask); \
+		__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, __mask); \
 	else \
-		__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1_1, __mask); \
+		__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, __mask); \
 } while (0)
 #define interrupt_mask_check(n) ({ \
 	int __n = (n); \
 	(((__n < 32) ? \
-	 __insn_mfspr(SPR_INTERRUPT_MASK_1_0) : \
-	 __insn_mfspr(SPR_INTERRUPT_MASK_1_1)) \
+	 __insn_mfspr(SPR_INTERRUPT_MASK_K_0) : \
+	 __insn_mfspr(SPR_INTERRUPT_MASK_K_1)) \
 	  >> (__n & 0x1f)) & 1; \
 })
 #define interrupt_mask_set_mask(mask) do { \
 	unsigned long long __m = (mask); \
-	__insn_mtspr(SPR_INTERRUPT_MASK_SET_1_0, (unsigned long)(__m)); \
-	__insn_mtspr(SPR_INTERRUPT_MASK_SET_1_1, (unsigned long)(__m>>32)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K_0, (unsigned long)(__m)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K_1, (unsigned long)(__m>>32)); \
 } while (0)
 #define interrupt_mask_reset_mask(mask) do { \
 	unsigned long long __m = (mask); \
-	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1_0, (unsigned long)(__m)); \
-	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1_1, (unsigned long)(__m>>32)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \
 } while (0)
 #else
 #define interrupt_mask_set(n) \
-	__insn_mtspr(SPR_INTERRUPT_MASK_SET_1, (1UL << (n)))
+	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n)))
 #define interrupt_mask_reset(n) \
-	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1, (1UL << (n)))
+	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (1UL << (n)))
 #define interrupt_mask_check(n) \
-	((__insn_mfspr(SPR_INTERRUPT_MASK_1) >> (n)) & 1)
+	((__insn_mfspr(SPR_INTERRUPT_MASK_K) >> (n)) & 1)
 #define interrupt_mask_set_mask(mask) \
-	__insn_mtspr(SPR_INTERRUPT_MASK_SET_1, (mask))
+	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask))
 #define interrupt_mask_reset_mask(mask) \
-	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_1, (mask))
+	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask))
 #endif
 
 /*
  * The set of interrupts we want active if irqs are enabled.
  * Note that in particular, the tile timer interrupt comes and goes
  * from this set, since we have no other way to turn off the timer.
- * Likewise, INTCTRL_1 is removed and re-added during device
+ * Likewise, INTCTRL_K is removed and re-added during device
  * interrupts, as is the the hardwall UDN_FIREWALL interrupt.
  * We use a low bit (MEM_ERROR) as our sentinel value and make sure it
  * is always claimed as an "active interrupt" so we can query that bit
@@ -170,14 +170,14 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Return 0 or 1 to indicate whether interrupts are currently disabled. */
 #define IRQS_DISABLED(tmp)					\
-	mfspr   tmp, INTERRUPT_MASK_1;				\
+	mfspr   tmp, SPR_INTERRUPT_MASK_K;			\
 	andi    tmp, tmp, 1
 
 /* Load up a pointer to &interrupts_enabled_mask. */
 #define GET_INTERRUPTS_ENABLED_MASK_PTR(reg)			\
-	moveli reg, hw2_last(interrupts_enabled_mask); \
-	shl16insli reg, reg, hw1(interrupts_enabled_mask); \
-	shl16insli reg, reg, hw0(interrupts_enabled_mask); \
+	moveli reg, hw2_last(interrupts_enabled_mask);		\
+	shl16insli reg, reg, hw1(interrupts_enabled_mask);	\
+	shl16insli reg, reg, hw0(interrupts_enabled_mask);	\
 	add     reg, reg, tp
 
 /* Disable interrupts. */
@@ -185,18 +185,18 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	moveli  tmp0, hw2_last(LINUX_MASKABLE_INTERRUPTS);	\
 	shl16insli tmp0, tmp0, hw1(LINUX_MASKABLE_INTERRUPTS);	\
 	shl16insli tmp0, tmp0, hw0(LINUX_MASKABLE_INTERRUPTS);	\
-	mtspr   INTERRUPT_MASK_SET_1, tmp0
+	mtspr   SPR_INTERRUPT_MASK_SET_K, tmp0
 
 /* Disable ALL synchronous interrupts (used by NMI entry). */
 #define IRQ_DISABLE_ALL(tmp)					\
 	movei   tmp, -1;					\
-	mtspr   INTERRUPT_MASK_SET_1, tmp
+	mtspr   SPR_INTERRUPT_MASK_SET_K, tmp
 
 /* Enable interrupts. */
 #define IRQ_ENABLE(tmp0, tmp1)					\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
 	ld      tmp0, tmp0;					\
-	mtspr   INTERRUPT_MASK_RESET_1, tmp0
+	mtspr   SPR_INTERRUPT_MASK_RESET_K, tmp0
 
 #else /* !__tilegx__ */
 
@@ -210,14 +210,14 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
  * (making the original code's write of the "high" mask word idempotent).
  */
 #define IRQS_DISABLED(tmp)					\
-	mfspr   tmp, INTERRUPT_MASK_1_0;			\
+	mfspr   tmp, SPR_INTERRUPT_MASK_K_0;			\
 	shri    tmp, tmp, INT_MEM_ERROR;			\
 	andi    tmp, tmp, 1
 
 /* Load up a pointer to &interrupts_enabled_mask. */
 #define GET_INTERRUPTS_ENABLED_MASK_PTR(reg)			\
-	moveli  reg, lo16(interrupts_enabled_mask);	\
-	auli    reg, reg, ha16(interrupts_enabled_mask);\
+	moveli  reg, lo16(interrupts_enabled_mask);		\
+	auli    reg, reg, ha16(interrupts_enabled_mask);	\
 	add     reg, reg, tp
 
 /* Disable interrupts. */
@@ -227,16 +227,16 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	 moveli tmp1, lo16(LINUX_MASKABLE_INTERRUPTS)		\
 	};							\
 	{							\
-	 mtspr  INTERRUPT_MASK_SET_1_0, tmp0;			\
+	 mtspr  SPR_INTERRUPT_MASK_SET_K_0, tmp0;		\
 	 auli   tmp1, tmp1, ha16(LINUX_MASKABLE_INTERRUPTS)	\
 	};							\
-	mtspr   INTERRUPT_MASK_SET_1_1, tmp1
+	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp1
 
 /* Disable ALL synchronous interrupts (used by NMI entry). */
 #define IRQ_DISABLE_ALL(tmp)					\
 	movei   tmp, -1;					\
-	mtspr   INTERRUPT_MASK_SET_1_0, tmp;			\
-	mtspr   INTERRUPT_MASK_SET_1_1, tmp
+	mtspr   SPR_INTERRUPT_MASK_SET_K_0, tmp;		\
+	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp
 
 /* Enable interrupts. */
 #define IRQ_ENABLE(tmp0, tmp1)					\
@@ -246,8 +246,8 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	 addi   tmp1, tmp0, 4					\
 	};							\
 	lw      tmp1, tmp1;					\
-	mtspr   INTERRUPT_MASK_RESET_1_0, tmp0;			\
-	mtspr   INTERRUPT_MASK_RESET_1_1, tmp1
+	mtspr   SPR_INTERRUPT_MASK_RESET_K_0, tmp0;		\
+	mtspr   SPR_INTERRUPT_MASK_RESET_K_1, tmp1
 #endif
 
 /*
diff --git a/arch/tile/include/asm/kmap_types.h b/arch/tile/include/asm/kmap_types.h
index 1480106..3d0f202 100644
--- a/arch/tile/include/asm/kmap_types.h
+++ b/arch/tile/include/asm/kmap_types.h
@@ -16,28 +16,42 @@
 #define _ASM_TILE_KMAP_TYPES_H
 
 /*
- * In TILE Linux each set of four of these uses another 16MB chunk of
- * address space, given 64 tiles and 64KB pages, so we only enable
- * ones that are required by the kernel configuration.
+ * In 32-bit TILE Linux we have to balance the desire to have a lot of
+ * nested atomic mappings with the fact that large page sizes and many
+ * processors chew up address space quickly.  In a typical
+ * 64-processor, 64KB-page layout build, making KM_TYPE_NR one larger
+ * adds 4MB of required address-space.  For now we leave KM_TYPE_NR
+ * set to depth 8.
  */
 enum km_type {
+	KM_TYPE_NR = 8
+};
+
+/*
+ * We provide dummy definitions of all the stray values that used to be
+ * required for kmap_atomic() and no longer are.
+ */
+enum {
 	KM_BOUNCE_READ,
 	KM_SKB_SUNRPC_DATA,
 	KM_SKB_DATA_SOFTIRQ,
 	KM_USER0,
 	KM_USER1,
 	KM_BIO_SRC_IRQ,
+	KM_BIO_DST_IRQ,
+	KM_PTE0,
+	KM_PTE1,
 	KM_IRQ0,
 	KM_IRQ1,
 	KM_SOFTIRQ0,
 	KM_SOFTIRQ1,
-	KM_MEMCPY0,
-	KM_MEMCPY1,
-#if defined(CONFIG_HIGHPTE)
-	KM_PTE0,
-	KM_PTE1,
-#endif
-	KM_TYPE_NR
+	KM_SYNC_ICACHE,
+	KM_SYNC_DCACHE,
+	KM_UML_USERCOPY,
+	KM_IRQ_PTE,
+	KM_NMI,
+	KM_NMI_PTE,
+	KM_KDB
 };
 
 #endif /* _ASM_TILE_KMAP_TYPES_H */
diff --git a/arch/tile/include/asm/mman.h b/arch/tile/include/asm/mman.h
index 4c6811e..81b8fc3 100644
--- a/arch/tile/include/asm/mman.h
+++ b/arch/tile/include/asm/mman.h
@@ -23,6 +23,7 @@
 #define MAP_POPULATE	0x0040		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x0080		/* do not block on IO */
 #define MAP_GROWSDOWN	0x0100		/* stack-like segment */
+#define MAP_STACK	MAP_GROWSDOWN	/* provide convenience alias */
 #define MAP_LOCKED	0x0200		/* pages are locked */
 #define MAP_NORESERVE	0x0400		/* don't check for reservations */
 #define MAP_DENYWRITE	0x0800		/* ETXTBSY */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index 7d90641..7979a45 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -199,17 +199,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * If you want more physical memory than this then see the CONFIG_HIGHMEM
  * option in the kernel configuration.
  *
- * The top two 16MB chunks in the table below (VIRT and HV) are
- * unavailable to Linux.  Since the kernel interrupt vectors must live
- * at 0xfd000000, we map all of the bottom of RAM at this address with
- * a huge page table entry to minimize its ITLB footprint (as well as
- * at PAGE_OFFSET).  The last architected requirement is that user
- * interrupt vectors live at 0xfc000000, so we make that range of
- * memory available to user processes.  The remaining regions are sized
- * as shown; after the first four addresses, we show "typical" values,
- * since the actual addresses depend on kernel #defines.
+ * The top 16MB chunk in the table below is unavailable to Linux.  Since
+ * the kernel interrupt vectors must live at ether 0xfe000000 or 0xfd000000
+ * (depending on whether the kernel is at PL2 or Pl1), we map all of the
+ * bottom of RAM at this address with a huge page table entry to minimize
+ * its ITLB footprint (as well as at PAGE_OFFSET).  The last architected
+ * requirement is that user interrupt vectors live at 0xfc000000, so we
+ * make that range of memory available to user processes.  The remaining
+ * regions are sized as shown; the first four addresses use the PL 1
+ * values, and after that, we show "typical" values, since the actual
+ * addresses depend on kernel #defines.
  *
- * MEM_VIRT_INTRPT                 0xff000000
  * MEM_HV_INTRPT                   0xfe000000
  * MEM_SV_INTRPT (kernel code)     0xfd000000
  * MEM_USER_INTRPT (user vector)   0xfc000000
@@ -221,9 +221,14 @@ static inline __attribute_const__ int get_order(unsigned long size)
  */
 
 #define MEM_USER_INTRPT		_AC(0xfc000000, UL)
+#if CONFIG_KERNEL_PL == 1
 #define MEM_SV_INTRPT		_AC(0xfd000000, UL)
 #define MEM_HV_INTRPT		_AC(0xfe000000, UL)
-#define MEM_VIRT_INTRPT		_AC(0xff000000, UL)
+#else
+#define MEM_GUEST_INTRPT	_AC(0xfd000000, UL)
+#define MEM_SV_INTRPT		_AC(0xfe000000, UL)
+#define MEM_HV_INTRPT		_AC(0xff000000, UL)
+#endif
 
 #define INTRPT_SIZE		0x4000
 
diff --git a/arch/tile/include/asm/pci-bridge.h b/arch/tile/include/asm/pci-bridge.h
deleted file mode 100644
index e853b0e..0000000
--- a/arch/tile/include/asm/pci-bridge.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef _ASM_TILE_PCI_BRIDGE_H
-#define _ASM_TILE_PCI_BRIDGE_H
-
-#include <linux/ioport.h>
-#include <linux/pci.h>
-
-struct device_node;
-struct pci_controller;
-
-/*
- * pci_io_base returns the memory address at which you can access
- * the I/O space for PCI bus number `bus' (or NULL on error).
- */
-extern void __iomem *pci_bus_io_base(unsigned int bus);
-extern unsigned long pci_bus_io_base_phys(unsigned int bus);
-extern unsigned long pci_bus_mem_base_phys(unsigned int bus);
-
-/* Allocate a new PCI host bridge structure */
-extern struct pci_controller *pcibios_alloc_controller(void);
-
-/* Helper function for setting up resources */
-extern void pci_init_resource(struct resource *res, unsigned long start,
-			      unsigned long end, int flags, char *name);
-
-/* Get the PCI host controller for a bus */
-extern struct pci_controller *pci_bus_to_hose(int bus);
-
-/*
- * Structure of a PCI controller (host bridge)
- */
-struct pci_controller {
-	int index;		/* PCI domain number */
-	struct pci_bus *root_bus;
-
-	int first_busno;
-	int last_busno;
-
-	int hv_cfg_fd[2];	/* config{0,1} fds for this PCIe controller */
-	int hv_mem_fd;		/* fd to Hypervisor for MMIO operations */
-
-	struct pci_ops *ops;
-
-	int irq_base;		/* Base IRQ from the Hypervisor	*/
-	int plx_gen1;		/* flag for PLX Gen 1 configuration */
-
-	/* Address ranges that are routed to this controller/bridge. */
-	struct resource mem_resources[3];
-};
-
-static inline struct pci_controller *pci_bus_to_host(struct pci_bus *bus)
-{
-	return bus->sysdata;
-}
-
-extern void setup_indirect_pci_nomap(struct pci_controller *hose,
-			       void __iomem *cfg_addr, void __iomem *cfg_data);
-extern void setup_indirect_pci(struct pci_controller *hose,
-			       u32 cfg_addr, u32 cfg_data);
-extern void setup_grackle(struct pci_controller *hose);
-
-extern unsigned char common_swizzle(struct pci_dev *, unsigned char *);
-
-/*
- *   The following code swizzles for exactly one bridge.  The routine
- *   common_swizzle below handles multiple bridges.  But there are a
- *   some boards that don't follow the PCI spec's suggestion so we
- *   break this piece out separately.
- */
-static inline unsigned char bridge_swizzle(unsigned char pin,
-		unsigned char idsel)
-{
-	return (((pin-1) + idsel) % 4) + 1;
-}
-
-/*
- * The following macro is used to lookup irqs in a standard table
- * format for those PPC systems that do not already have PCI
- * interrupts properly routed.
- */
-/* FIXME - double check this */
-#define PCI_IRQ_TABLE_LOOKUP ({ \
-	long _ctl_ = -1; \
-	if (idsel >= min_idsel && idsel <= max_idsel && pin <= irqs_per_slot) \
-		_ctl_ = pci_irq_table[idsel - min_idsel][pin-1]; \
-	_ctl_; \
-})
-
-/*
- * Scan the buses below a given PCI host bridge and assign suitable
- * resources to all devices found.
- */
-extern int pciauto_bus_scan(struct pci_controller *, int);
-
-#ifdef CONFIG_PCI
-extern unsigned long pci_address_to_pio(phys_addr_t address);
-#else
-static inline unsigned long pci_address_to_pio(phys_addr_t address)
-{
-	return (unsigned long)-1;
-}
-#endif
-
-#endif /* _ASM_TILE_PCI_BRIDGE_H */
diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index b0c15da..c3fc458 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -15,7 +15,29 @@
 #ifndef _ASM_TILE_PCI_H
 #define _ASM_TILE_PCI_H
 
-#include <asm/pci-bridge.h>
+#include <linux/pci.h>
+
+/*
+ * Structure of a PCI controller (host bridge)
+ */
+struct pci_controller {
+	int index;		/* PCI domain number */
+	struct pci_bus *root_bus;
+
+	int first_busno;
+	int last_busno;
+
+	int hv_cfg_fd[2];	/* config{0,1} fds for this PCIe controller */
+	int hv_mem_fd;		/* fd to Hypervisor for MMIO operations */
+
+	struct pci_ops *ops;
+
+	int irq_base;		/* Base IRQ from the Hypervisor	*/
+	int plx_gen1;		/* flag for PLX Gen 1 configuration */
+
+	/* Address ranges that are routed to this controller/bridge. */
+	struct resource mem_resources[3];
+};
 
 /*
  * The hypervisor maps the entirety of CPA-space as bus addresses, so
@@ -24,56 +46,12 @@
  */
 #define PCI_DMA_BUS_IS_PHYS     1
 
-struct pci_controller *pci_bus_to_hose(int bus);
-unsigned char __init common_swizzle(struct pci_dev *dev, unsigned char *pinp);
 int __init tile_pci_init(void);
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr);
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
-void __devinit pcibios_fixup_bus(struct pci_bus *bus);
 
-int __devinit _tile_cfg_read(struct pci_controller *hose,
-				    int bus,
-				    int slot,
-				    int function,
-				    int offset,
-				    int size,
-				    u32 *val);
-int __devinit _tile_cfg_write(struct pci_controller *hose,
-				     int bus,
-				     int slot,
-				     int function,
-				     int offset,
-				     int size,
-				     u32 val);
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr) {}
 
-/*
- * These are used to to config reads and writes in the early stages of
- * setup before the driver infrastructure has been set up enough to be
- * able to do config reads and writes.
- */
-#define early_cfg_read(where, size, value) \
-	_tile_cfg_read(controller, \
-		       current_bus, \
-		       pci_slot, \
-		       pci_fn, \
-		       where, \
-		       size, \
-		       value)
-
-#define early_cfg_write(where, size, value) \
-	_tile_cfg_write(controller, \
-		       current_bus, \
-		       pci_slot, \
-		       pci_fn, \
-		       where, \
-		       size, \
-		       value)
-
-
-
-#define PCICFG_BYTE	1
-#define PCICFG_WORD	2
-#define PCICFG_DWORD	4
+void __devinit pcibios_fixup_bus(struct pci_bus *bus);
 
 #define	TILE_NUM_PCIE	2
 
@@ -88,33 +66,33 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 }
 
 /*
- * I/O space is currently not supported.
+ * pcibios_assign_all_busses() tells whether or not the bus numbers
+ * should be reassigned, in case the BIOS didn't do it correctly, or
+ * in case we don't have a BIOS and we want to let Linux do it.
  */
+static inline int pcibios_assign_all_busses(void)
+{
+	return 1;
+}
 
-#define TILE_PCIE_LOWER_IO		0x0
-#define TILE_PCIE_UPPER_IO		0x10000
-#define TILE_PCIE_PCIE_IO_SIZE		0x0000FFFF
-
-#define _PAGE_NO_CACHE		0
-#define _PAGE_GUARDED		0
-
-
-#define pcibios_assign_all_busses()    pci_assign_all_buses
-extern int pci_assign_all_buses;
-
+/*
+ * No special bus mastering setup handling.
+ */
 static inline void pcibios_set_master(struct pci_dev *dev)
 {
-	/* No special bus mastering setup handling */
 }
 
 #define PCIBIOS_MIN_MEM		0
-#define PCIBIOS_MIN_IO		TILE_PCIE_LOWER_IO
+#define PCIBIOS_MIN_IO		0
 
 /*
  * This flag tells if the platform is TILEmpower that needs
  * special configuration for the PLX switch chip.
  */
-extern int blade_pci;
+extern int tile_plx_gen1;
+
+/* Use any cpu for PCI. */
+#define cpumask_of_pcibus(bus) cpu_online_mask
 
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
@@ -122,7 +100,4 @@ extern int blade_pci;
 /* generic pci stuff */
 #include <asm-generic/pci.h>
 
-/* Use any cpu for PCI. */
-#define cpumask_of_pcibus(bus) cpu_online_mask
-
 #endif /* _ASM_TILE_PCI_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index b336737..a6604e9 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -344,18 +344,11 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
 #if defined(CONFIG_HIGHPTE)
-extern pte_t *_pte_offset_map(pmd_t *, unsigned long address, enum km_type);
-#define pte_offset_map(dir, address) \
-	_pte_offset_map(dir, address, KM_PTE0)
-#define pte_offset_map_nested(dir, address) \
-	_pte_offset_map(dir, address, KM_PTE1)
-#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
-#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
+extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
+#define pte_unmap(pte) kunmap_atomic(pte)
 #else
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
-#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
 #define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
 #endif
 
 /* Clear a non-executable kernel PTE and flush it from the TLB. */
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index ccd5f84..a9e7c87 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -292,8 +292,18 @@ extern int kstack_hash;
 /* Are we using huge pages in the TLB for kernel data? */
 extern int kdata_huge;
 
+/* Support standard Linux prefetching. */
+#define ARCH_HAS_PREFETCH
+#define prefetch(x) __builtin_prefetch(x)
 #define PREFETCH_STRIDE CHIP_L2_LINE_SIZE()
 
+/* Bring a value into the L1D, faulting the TLB if necessary. */
+#ifdef __tilegx__
+#define prefetch_L1(x) __insn_prefetch_l1_fault((void *)(x))
+#else
+#define prefetch_L1(x) __insn_prefetch_L1((void *)(x))
+#endif
+
 #else /* __ASSEMBLY__ */
 
 /* Do some slow action (e.g. read a slow SPR). */
@@ -328,18 +338,21 @@ extern int kdata_huge;
  * Note that assembly code assumes that USER_PL is zero.
  */
 #define USER_PL 0
-#define KERNEL_PL 1
+#if CONFIG_KERNEL_PL == 2
+#define GUEST_PL 1
+#endif
+#define KERNEL_PL CONFIG_KERNEL_PL
 
-/* SYSTEM_SAVE_1_0 holds the current cpu number ORed with ksp0. */
+/* SYSTEM_SAVE_K_0 holds the current cpu number ORed with ksp0. */
 #define CPU_LOG_MASK_VALUE 12
 #define CPU_MASK_VALUE ((1 << CPU_LOG_MASK_VALUE) - 1)
 #if CONFIG_NR_CPUS > CPU_MASK_VALUE
 # error Too many cpus!
 #endif
 #define raw_smp_processor_id() \
-	((int)__insn_mfspr(SPR_SYSTEM_SAVE_1_0) & CPU_MASK_VALUE)
+	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & CPU_MASK_VALUE)
 #define get_current_ksp0() \
-	(__insn_mfspr(SPR_SYSTEM_SAVE_1_0) & ~CPU_MASK_VALUE)
+	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~CPU_MASK_VALUE)
 #define next_current_ksp0(task) ({ \
 	unsigned long __ksp0 = task_ksp0(task); \
 	int __cpu = raw_smp_processor_id(); \
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 4a02bb0..ac6d343 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -62,8 +62,8 @@ struct pt_regs {
 	pt_reg_t lr;		/* aliases regs[TREG_LR] */
 
 	/* Saved special registers. */
-	pt_reg_t pc;		/* stored in EX_CONTEXT_1_0 */
-	pt_reg_t ex1;		/* stored in EX_CONTEXT_1_1 (PL and ICS bit) */
+	pt_reg_t pc;		/* stored in EX_CONTEXT_K_0 */
+	pt_reg_t ex1;		/* stored in EX_CONTEXT_K_1 (PL and ICS bit) */
 	pt_reg_t faultnum;	/* fault number (INT_SWINT_1 for syscall) */
 	pt_reg_t orig_r0;	/* r0 at syscall entry, else zero */
 	pt_reg_t flags;		/* flags (see below) */
diff --git a/arch/tile/include/asm/signal.h b/arch/tile/include/asm/signal.h
index c1ee1d6..81d92a4 100644
--- a/arch/tile/include/asm/signal.h
+++ b/arch/tile/include/asm/signal.h
@@ -25,7 +25,7 @@
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 struct pt_regs;
-int restore_sigcontext(struct pt_regs *, struct sigcontext __user *, long *);
+int restore_sigcontext(struct pt_regs *, struct sigcontext __user *);
 int setup_sigcontext(struct sigcontext __user *, struct pt_regs *);
 void do_signal(struct pt_regs *regs);
 #endif
diff --git a/arch/tile/include/asm/stat.h b/arch/tile/include/asm/stat.h
index 3dc90fa..b16e5db 100644
--- a/arch/tile/include/asm/stat.h
+++ b/arch/tile/include/asm/stat.h
@@ -1 +1,4 @@
+#ifdef CONFIG_COMPAT
+#define __ARCH_WANT_STAT64	/* Used for compat_sys_stat64() etc. */
+#endif
 #include <asm-generic/stat.h>
diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h
index ce99ffe..3b5507c 100644
--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -32,8 +32,9 @@ extern void *compat_sys_call_table[];
 
 /*
  * Note that by convention, any syscall which requires the current
- * register set takes an additional "struct pt_regs *" pointer; the
- * sys_xxx() function just adds the pointer and tail-calls to _sys_xxx().
+ * register set takes an additional "struct pt_regs *" pointer; a
+ * _sys_xxx() trampoline in intvec*.S just sets up the pointer and
+ * jumps to sys_xxx().
  */
 
 /* kernel/sys.c */
@@ -43,66 +44,17 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi,
 int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi,
 		       u32 len_lo, u32 len_hi, int advice);
 long sys_flush_cache(void);
-long sys_mmap2(unsigned long addr, unsigned long len,
-	       unsigned long prot, unsigned long flags,
-	       unsigned long fd, unsigned long pgoff);
-#ifdef __tilegx__
-long sys_mmap(unsigned long addr, unsigned long len,
-	      unsigned long prot, unsigned long flags,
-	      unsigned long fd, off_t pgoff);
+#ifndef __tilegx__  /* No mmap() in the 32-bit kernel. */
+#define sys_mmap sys_mmap
 #endif
 
-/* kernel/process.c */
-long sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       void __user *parent_tid, void __user *child_tid);
-long _sys_clone(unsigned long clone_flags, unsigned long newsp,
-		void __user *parent_tid, void __user *child_tid,
-		struct pt_regs *regs);
-long sys_fork(void);
-long _sys_fork(struct pt_regs *regs);
-long sys_vfork(void);
-long _sys_vfork(struct pt_regs *regs);
-long sys_execve(const char __user *filename,
-		const char __user *const __user *argv,
-		const char __user *const __user *envp);
-long _sys_execve(const char __user *filename,
-		 const char __user *const __user *argv,
-		 const char __user *const __user *envp, struct pt_regs *regs);
-
-/* kernel/signal.c */
-long sys_sigaltstack(const stack_t __user *, stack_t __user *);
-long _sys_sigaltstack(const stack_t __user *, stack_t __user *,
-		      struct pt_regs *);
-long sys_rt_sigreturn(void);
-long _sys_rt_sigreturn(struct pt_regs *regs);
-
-/* platform-independent functions */
-long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
-long sys_rt_sigaction(int sig, const struct sigaction __user *act,
-		      struct sigaction __user *oact, size_t sigsetsize);
-
 #ifndef __tilegx__
 /* mm/fault.c */
-int sys_cmpxchg_badaddr(unsigned long address);
-int _sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *);
+long sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *);
+long _sys_cmpxchg_badaddr(unsigned long address);
 #endif
 
 #ifdef CONFIG_COMPAT
-long compat_sys_execve(const char __user *path,
-		       const compat_uptr_t __user *argv,
-		       const compat_uptr_t __user *envp);
-long _compat_sys_execve(const char __user *path,
-			const compat_uptr_t __user *argv,
-			const compat_uptr_t __user *envp,
-			struct pt_regs *regs);
-long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr);
-long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			     struct compat_sigaltstack __user *uoss_ptr,
-			     struct pt_regs *regs);
-long compat_sys_rt_sigreturn(void);
-long _compat_sys_rt_sigreturn(struct pt_regs *regs);
-
 /* These four are not defined for 64-bit, but serve as "compat" syscalls. */
 long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg);
 long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
@@ -110,4 +62,15 @@ long sys_truncate64(const char __user *path, loff_t length);
 long sys_ftruncate64(unsigned int fd, loff_t length);
 #endif
 
+/* These are the intvec*.S trampolines. */
+long _sys_sigaltstack(const stack_t __user *, stack_t __user *);
+long _sys_rt_sigreturn(void);
+long _sys_clone(unsigned long clone_flags, unsigned long newsp,
+		void __user *parent_tid, void __user *child_tid);
+long _sys_execve(const char __user *filename,
+		 const char __user *const __user *argv,
+		 const char __user *const __user *envp);
+
+#include <asm-generic/syscalls.h>
+
 #endif /* _ASM_TILE_SYSCALLS_H */
diff --git a/arch/tile/include/asm/system.h b/arch/tile/include/asm/system.h
index f749be3..5388850 100644
--- a/arch/tile/include/asm/system.h
+++ b/arch/tile/include/asm/system.h
@@ -89,6 +89,10 @@
 #define get_cycles_low() __insn_mfspr(SPR_CYCLE)   /* just get all 64 bits */
 #endif
 
+#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
+int __mb_incoherent(void);  /* Helper routine for mb_incoherent(). */
+#endif
+
 /* Fence to guarantee visibility of stores to incoherent memory. */
 static inline void
 mb_incoherent(void)
@@ -97,7 +101,6 @@ mb_incoherent(void)
 
 #if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
 	{
-		int __mb_incoherent(void);
 #if CHIP_HAS_TILE_WRITE_PENDING()
 		const unsigned long WRITE_TIMEOUT_CYCLES = 400;
 		unsigned long start = get_cycles_low();
@@ -161,7 +164,7 @@ extern struct task_struct *_switch_to(struct task_struct *prev,
 /* Helper function for _switch_to(). */
 extern struct task_struct *__switch_to(struct task_struct *prev,
 				       struct task_struct *next,
-				       unsigned long new_system_save_1_0);
+				       unsigned long new_system_save_k_0);
 
 /* Address that switched-away from tasks are at. */
 extern unsigned long get_switch_to_pc(void);
@@ -214,13 +217,6 @@ int hardwall_deactivate(struct task_struct *task);
 } while (0)
 #endif
 
-/* Invoke the simulator "syscall" mechanism (see arch/tile/kernel/entry.S). */
-extern int _sim_syscall(int syscall_num, ...);
-#define sim_syscall(syscall_num, ...) \
-	_sim_syscall(SIM_CONTROL_SYSCALL + \
-		((syscall_num) << _SIM_CONTROL_OPERATOR_BITS), \
-		## __VA_ARGS__)
-
 /*
  * Kernel threads can check to see if they need to migrate their
  * stack whenever they return from a context switch; for user
diff --git a/arch/tile/include/asm/traps.h b/arch/tile/include/asm/traps.h
index 432a9c1..d06e35f 100644
--- a/arch/tile/include/asm/traps.h
+++ b/arch/tile/include/asm/traps.h
@@ -59,4 +59,8 @@ void do_hardwall_trap(struct pt_regs *, int fault_num);
 void do_breakpoint(struct pt_regs *, int fault_num);
 
 
+#ifdef __tilegx__
+void gx_singlestep_handle(struct pt_regs *, int fault_num);
+#endif
+
 #endif /* _ASM_TILE_SYSCALLS_H */
diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h
index f2e3ff4..b35c2db 100644
--- a/arch/tile/include/asm/unistd.h
+++ b/arch/tile/include/asm/unistd.h
@@ -41,6 +41,7 @@ __SYSCALL(__NR_cmpxchg_badaddr, sys_cmpxchg_badaddr)
 #ifdef CONFIG_COMPAT
 #define __ARCH_WANT_SYS_LLSEEK
 #endif
+#define __ARCH_WANT_SYS_NEWFSTATAT
 #endif
 
 #endif /* _ASM_TILE_UNISTD_H */
diff --git a/arch/tile/include/hv/drv_xgbe_impl.h b/arch/tile/include/hv/drv_xgbe_impl.h
new file mode 100644
index 0000000..3a73b2b
--- /dev/null
+++ b/arch/tile/include/hv/drv_xgbe_impl.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drivers/xgbe/impl.h
+ * Implementation details for the NetIO library.
+ */
+
+#ifndef __DRV_XGBE_IMPL_H__
+#define __DRV_XGBE_IMPL_H__
+
+#include <hv/netio_errors.h>
+#include <hv/netio_intf.h>
+#include <hv/drv_xgbe_intf.h>
+
+
+/** How many groups we have (log2). */
+#define LOG2_NUM_GROUPS (12)
+/** How many groups we have. */
+#define NUM_GROUPS (1 << LOG2_NUM_GROUPS)
+
+/** Number of output requests we'll buffer per tile. */
+#define EPP_REQS_PER_TILE (32)
+
+/** Words used in an eDMA command without checksum acceleration. */
+#define EDMA_WDS_NO_CSUM      8
+/** Words used in an eDMA command with checksum acceleration. */
+#define EDMA_WDS_CSUM        10
+/** Total available words in the eDMA command FIFO. */
+#define EDMA_WDS_TOTAL      128
+
+
+/*
+ * FIXME: These definitions are internal and should have underscores!
+ * NOTE: The actual numeric values here are intentional and allow us to
+ * optimize the concept "if small ... else if large ... else ...", by
+ * checking for the low bit being set, and then for non-zero.
+ * These are used as array indices, so they must have the values (0, 1, 2)
+ * in some order.
+ */
+#define SIZE_SMALL (1)       /**< Small packet queue. */
+#define SIZE_LARGE (2)       /**< Large packet queue. */
+#define SIZE_JUMBO (0)       /**< Jumbo packet queue. */
+
+/** The number of "SIZE_xxx" values. */
+#define NETIO_NUM_SIZES 3
+
+
+/*
+ * Default numbers of packets for IPP drivers.  These values are chosen
+ * such that CIPP1 will not overflow its L2 cache.
+ */
+
+/** The default number of small packets. */
+#define NETIO_DEFAULT_SMALL_PACKETS 2750
+/** The default number of large packets. */
+#define NETIO_DEFAULT_LARGE_PACKETS 2500
+/** The default number of jumbo packets. */
+#define NETIO_DEFAULT_JUMBO_PACKETS 250
+
+
+/** Log2 of the size of a memory arena. */
+#define NETIO_ARENA_SHIFT      24      /* 16 MB */
+/** Size of a memory arena. */
+#define NETIO_ARENA_SIZE       (1 << NETIO_ARENA_SHIFT)
+
+
+/** A queue of packets.
+ *
+ * This structure partially defines a queue of packets waiting to be
+ * processed.  The queue as a whole is written to by an interrupt handler and
+ * read by non-interrupt code; this data structure is what's touched by the
+ * interrupt handler.  The other part of the queue state, the read offset, is
+ * kept in user space, not in hypervisor space, so it is in a separate data
+ * structure.
+ *
+ * The read offset (__packet_receive_read in the user part of the queue
+ * structure) points to the next packet to be read. When the read offset is
+ * equal to the write offset, the queue is empty; therefore the queue must
+ * contain one more slot than the required maximum queue size.
+ *
+ * Here's an example of all 3 state variables and what they mean.  All
+ * pointers move left to right.
+ *
+ * @code
+ *   I   I   V   V   V   V   I   I   I   I
+ *   0   1   2   3   4   5   6   7   8   9  10
+ *           ^       ^       ^               ^
+ *           |               |               |
+ *           |               |               __last_packet_plus_one
+ *           |               __buffer_write
+ *           __packet_receive_read
+ * @endcode
+ *
+ * This queue has 10 slots, and thus can hold 9 packets (_last_packet_plus_one
+ * = 10).  The read pointer is at 2, and the write pointer is at 6; thus,
+ * there are valid, unread packets in slots 2, 3, 4, and 5.  The remaining
+ * slots are invalid (do not contain a packet).
+ */
+typedef struct {
+  /** Byte offset of the next notify packet to be written: zero for the first
+   *  packet on the queue, sizeof (netio_pkt_t) for the second packet on the
+   *  queue, etc. */
+  volatile uint32_t __packet_write;
+
+  /** Offset of the packet after the last valid packet (i.e., when any
+   *  pointer is incremented to this value, it wraps back to zero). */
+  uint32_t __last_packet_plus_one;
+}
+__netio_packet_queue_t;
+
+
+/** A queue of buffers.
+ *
+ * This structure partially defines a queue of empty buffers which have been
+ * obtained via requests to the IPP.  (The elements of the queue are packet
+ * handles, which are transformed into a full netio_pkt_t when the buffer is
+ * retrieved.)  The queue as a whole is written to by an interrupt handler and
+ * read by non-interrupt code; this data structure is what's touched by the
+ * interrupt handler.  The other parts of the queue state, the read offset and
+ * requested write offset, are kept in user space, not in hypervisor space, so
+ * they are in a separate data structure.
+ *
+ * The read offset (__buffer_read in the user part of the queue structure)
+ * points to the next buffer to be read. When the read offset is equal to the
+ * write offset, the queue is empty; therefore the queue must contain one more
+ * slot than the required maximum queue size.
+ *
+ * The requested write offset (__buffer_requested_write in the user part of
+ * the queue structure) points to the slot which will hold the next buffer we
+ * request from the IPP, once we get around to sending such a request.  When
+ * the requested write offset is equal to the write offset, no requests for
+ * new buffers are outstanding; when the requested write offset is one greater
+ * than the read offset, no more requests may be sent.
+ *
+ * Note that, unlike the packet_queue, the buffer_queue places incoming
+ * buffers at decreasing addresses.  This makes the check for "is it time to
+ * wrap the buffer pointer" cheaper in the assembly code which receives new
+ * buffers, and means that the value which defines the queue size,
+ * __last_buffer, is different than in the packet queue.  Also, the offset
+ * used in the packet_queue is already scaled by the size of a packet; here we
+ * use unscaled slot indices for the offsets.  (These differences are
+ * historical, and in the future it's possible that the packet_queue will look
+ * more like this queue.)
+ *
+ * @code
+ * Here's an example of all 4 state variables and what they mean.  Remember:
+ * all pointers move right to left.
+ *
+ *   V   V   V   I   I   R   R   V   V   V
+ *   0   1   2   3   4   5   6   7   8   9
+ *           ^       ^       ^           ^
+ *           |       |       |           |
+ *           |       |       |           __last_buffer
+ *           |       |       __buffer_write
+ *           |       __buffer_requested_write
+ *           __buffer_read
+ * @endcode
+ *
+ * This queue has 10 slots, and thus can hold 9 buffers (_last_buffer = 9).
+ * The read pointer is at 2, and the write pointer is at 6; thus, there are
+ * valid, unread buffers in slots 2, 1, 0, 9, 8, and 7.  The requested write
+ * pointer is at 4; thus, requests have been made to the IPP for buffers which
+ * will be placed in slots 6 and 5 when they arrive.  Finally, the remaining
+ * slots are invalid (do not contain a buffer).
+ */
+typedef struct
+{
+  /** Ordinal number of the next buffer to be written: 0 for the first slot in
+   *  the queue, 1 for the second slot in the queue, etc. */
+  volatile uint32_t __buffer_write;
+
+  /** Ordinal number of the last buffer (i.e., when any pointer is decremented
+   *  below zero, it is reloaded with this value). */
+  uint32_t __last_buffer;
+}
+__netio_buffer_queue_t;
+
+
+/**
+ * An object for providing Ethernet packets to a process.
+ */
+typedef struct __netio_queue_impl_t
+{
+  /** The queue of packets waiting to be received. */
+  __netio_packet_queue_t __packet_receive_queue;
+  /** The intr bit mask that IDs this device. */
+  unsigned int __intr_id;
+  /** Offset to queues of empty buffers, one per size. */
+  uint32_t __buffer_queue[NETIO_NUM_SIZES];
+  /** The address of the first EPP tile, or -1 if no EPP. */
+  /* ISSUE: Actually this is always "0" or "~0". */
+  uint32_t __epp_location;
+  /** The queue ID that this queue represents. */
+  unsigned int __queue_id;
+  /** Number of acknowledgements received. */
+  volatile uint32_t __acks_received;
+  /** Last completion number received for packet_sendv. */
+  volatile uint32_t __last_completion_rcv;
+  /** Number of packets allowed to be outstanding. */
+  uint32_t __max_outstanding;
+  /** First VA available for packets. */
+  void* __va_0;
+  /** First VA in second range available for packets. */
+  void* __va_1;
+  /** Padding to align the "__packets" field to the size of a netio_pkt_t. */
+  uint32_t __padding[3];
+  /** The packets themselves. */
+  netio_pkt_t __packets[0];
+}
+netio_queue_impl_t;
+
+
+/**
+ * An object for managing the user end of a NetIO queue.
+ */
+typedef struct __netio_queue_user_impl_t
+{
+  /** The next incoming packet to be read. */
+  uint32_t __packet_receive_read;
+  /** The next empty buffers to be read, one index per size. */
+  uint8_t __buffer_read[NETIO_NUM_SIZES];
+  /** Where the empty buffer we next request from the IPP will go, one index
+   * per size. */
+  uint8_t __buffer_requested_write[NETIO_NUM_SIZES];
+  /** PCIe interface flag. */
+  uint8_t __pcie;
+  /** Number of packets left to be received before we send a credit update. */
+  uint32_t __receive_credit_remaining;
+  /** Value placed in __receive_credit_remaining when it reaches zero. */
+  uint32_t __receive_credit_interval;
+  /** First fast I/O routine index. */
+  uint32_t __fastio_index;
+  /** Number of acknowledgements expected. */
+  uint32_t __acks_outstanding;
+  /** Last completion number requested. */
+  uint32_t __last_completion_req;
+  /** File descriptor for driver. */
+  int __fd;
+}
+netio_queue_user_impl_t;
+
+
+#define NETIO_GROUP_CHUNK_SIZE   64   /**< Max # groups in one IPP request */
+#define NETIO_BUCKET_CHUNK_SIZE  64   /**< Max # buckets in one IPP request */
+
+
+/** Internal structure used to convey packet send information to the
+ * hypervisor.  FIXME: Actually, it's not used for that anymore, but
+ * netio_packet_send() still uses it internally.
+ */
+typedef struct
+{
+  uint16_t flags;              /**< Packet flags (__NETIO_SEND_FLG_xxx) */
+  uint16_t transfer_size;      /**< Size of packet */
+  uint32_t va;                 /**< VA of start of packet */
+  __netio_pkt_handle_t handle; /**< Packet handle */
+  uint32_t csum0;              /**< First checksum word */
+  uint32_t csum1;              /**< Second checksum word */
+}
+__netio_send_cmd_t;
+
+
+/** Flags used in two contexts:
+ *  - As the "flags" member in the __netio_send_cmd_t, above; used only
+ *    for netio_pkt_send_{prepare,commit}.
+ *  - As part of the flags passed to the various send packet fast I/O calls.
+ */
+
+/** Need acknowledgement on this packet.  Note that some code in the
+ *  normal send_pkt fast I/O handler assumes that this is equal to 1. */
+#define __NETIO_SEND_FLG_ACK    0x1
+
+/** Do checksum on this packet.  (Only used with the __netio_send_cmd_t;
+ *  normal packet sends use a special fast I/O index to denote checksumming,
+ *  and multi-segment sends test the checksum descriptor.) */
+#define __NETIO_SEND_FLG_CSUM   0x2
+
+/** Get a completion on this packet.  Only used with multi-segment sends.  */
+#define __NETIO_SEND_FLG_COMPLETION 0x4
+
+/** Position of the number-of-extra-segments value in the flags word.
+    Only used with multi-segment sends. */
+#define __NETIO_SEND_FLG_XSEG_SHIFT 3
+
+/** Width of the number-of-extra-segments value in the flags word. */
+#define __NETIO_SEND_FLG_XSEG_WIDTH 2
+
+#endif /* __DRV_XGBE_IMPL_H__ */
diff --git a/arch/tile/include/hv/drv_xgbe_intf.h b/arch/tile/include/hv/drv_xgbe_intf.h
new file mode 100644
index 0000000..146e47d
--- /dev/null
+++ b/arch/tile/include/hv/drv_xgbe_intf.h
@@ -0,0 +1,615 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_xgbe_intf.h
+ * Interface to the hypervisor XGBE driver.
+ */
+
+#ifndef __DRV_XGBE_INTF_H__
+#define __DRV_XGBE_INTF_H__
+
+/**
+ * An object for forwarding VAs and PAs to the hypervisor.
+ * @ingroup types
+ *
+ * This allows the supervisor to specify a number of areas of memory to
+ * store packet buffers.
+ */
+typedef struct
+{
+  /** The physical address of the memory. */
+  HV_PhysAddr pa;
+  /** Page table entry for the memory.  This is only used to derive the
+   *  memory's caching mode; the PA bits are ignored. */
+  HV_PTE pte;
+  /** The virtual address of the memory. */
+  HV_VirtAddr va;
+  /** Size (in bytes) of the memory area. */
+  int size;
+
+}
+netio_ipp_address_t;
+
+/** The various pread/pwrite offsets into the hypervisor-level driver.
+ * @ingroup types
+ */
+typedef enum
+{
+  /** Inform the Linux driver of the address of the NetIO arena memory.
+   *  This offset is actually only used to convey information from netio
+   *  to the Linux driver; it never makes it from there to the hypervisor.
+   *  Write-only; takes a uint32_t specifying the VA address. */
+  NETIO_FIXED_ADDR               = 0x5000000000000000ULL,
+
+  /** Inform the Linux driver of the size of the NetIO arena memory.
+   *  This offset is actually only used to convey information from netio
+   *  to the Linux driver; it never makes it from there to the hypervisor.
+   *  Write-only; takes a uint32_t specifying the VA size. */
+  NETIO_FIXED_SIZE               = 0x5100000000000000ULL,
+
+  /** Register current tile with IPP.  Write then read: write, takes a
+   *  netio_input_config_t, read returns a pointer to a netio_queue_impl_t. */
+  NETIO_IPP_INPUT_REGISTER_OFF   = 0x6000000000000000ULL,
+
+  /** Unregister current tile from IPP.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_UNREGISTER_OFF = 0x6100000000000000ULL,
+
+  /** Start packets flowing.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_INIT_OFF       = 0x6200000000000000ULL,
+
+  /** Stop packets flowing.  Write-only, takes a dummy argument. */
+  NETIO_IPP_INPUT_UNINIT_OFF     = 0x6300000000000000ULL,
+
+  /** Configure group (typically we group on VLAN).  Write-only: takes an
+   *  array of netio_group_t's, low 24 bits of the offset is the base group
+   *  number times the size of a netio_group_t. */
+  NETIO_IPP_INPUT_GROUP_CFG_OFF  = 0x6400000000000000ULL,
+
+  /** Configure bucket.  Write-only: takes an array of netio_bucket_t's, low
+   *  24 bits of the offset is the base bucket number times the size of a
+   *  netio_bucket_t. */
+  NETIO_IPP_INPUT_BUCKET_CFG_OFF = 0x6500000000000000ULL,
+
+  /** Get/set a parameter.  Read or write: read or write data is the parameter
+   *  value, low 32 bits of the offset is a __netio_getset_offset_t. */
+  NETIO_IPP_PARAM_OFF            = 0x6600000000000000ULL,
+
+  /** Get fast I/O index.  Read-only; returns a 4-byte base index value. */
+  NETIO_IPP_GET_FASTIO_OFF       = 0x6700000000000000ULL,
+
+  /** Configure hijack IP address.  Packets with this IPv4 dest address
+   *  go to bucket NETIO_NUM_BUCKETS - 1.  Write-only: takes an IP address
+   *  in some standard form.  FIXME: Define the form! */
+  NETIO_IPP_INPUT_HIJACK_CFG_OFF  = 0x6800000000000000ULL,
+
+  /**
+   * Offsets beyond this point are reserved for the supervisor (although that
+   * enforcement must be done by the supervisor driver itself).
+   */
+  NETIO_IPP_USER_MAX_OFF         = 0x6FFFFFFFFFFFFFFFULL,
+
+  /** Register I/O memory.  Write-only, takes a netio_ipp_address_t. */
+  NETIO_IPP_IOMEM_REGISTER_OFF   = 0x7000000000000000ULL,
+
+  /** Unregister I/O memory.  Write-only, takes a netio_ipp_address_t. */
+  NETIO_IPP_IOMEM_UNREGISTER_OFF = 0x7100000000000000ULL,
+
+  /* Offsets greater than 0x7FFFFFFF can't be used directly from Linux
+   * userspace code due to limitations in the pread/pwrite syscalls. */
+
+  /** Drain LIPP buffers. */
+  NETIO_IPP_DRAIN_OFF              = 0xFA00000000000000ULL,
+
+  /** Supply a netio_ipp_address_t to be used as shared memory for the
+   *  LEPP command queue. */
+  NETIO_EPP_SHM_OFF              = 0xFB00000000000000ULL,
+
+  /* 0xFC... is currently unused. */
+
+  /** Stop IPP/EPP tiles.  Write-only, takes a dummy argument.  */
+  NETIO_IPP_STOP_SHIM_OFF        = 0xFD00000000000000ULL,
+
+  /** Start IPP/EPP tiles.  Write-only, takes a dummy argument.  */
+  NETIO_IPP_START_SHIM_OFF       = 0xFE00000000000000ULL,
+
+  /** Supply packet arena.  Write-only, takes an array of
+    * netio_ipp_address_t values. */
+  NETIO_IPP_ADDRESS_OFF          = 0xFF00000000000000ULL,
+} netio_hv_offset_t;
+
+/** Extract the base offset from an offset */
+#define NETIO_BASE_OFFSET(off)    ((off) & 0xFF00000000000000ULL)
+/** Extract the local offset from an offset */
+#define NETIO_LOCAL_OFFSET(off)   ((off) & 0x00FFFFFFFFFFFFFFULL)
+
+
+/**
+ * Get/set offset.
+ */
+typedef union
+{
+  struct
+  {
+    uint64_t addr:48;        /**< Class-specific address */
+    unsigned int class:8;    /**< Class (e.g., NETIO_PARAM) */
+    unsigned int opcode:8;   /**< High 8 bits of NETIO_IPP_PARAM_OFF */
+  }
+  bits;                      /**< Bitfields */
+  uint64_t word;             /**< Aggregated value to use as the offset */
+}
+__netio_getset_offset_t;
+
+/**
+ * Fast I/O index offsets (must be contiguous).
+ */
+typedef enum
+{
+  NETIO_FASTIO_ALLOCATE         = 0, /**< Get empty packet buffer */
+  NETIO_FASTIO_FREE_BUFFER      = 1, /**< Give buffer back to IPP */
+  NETIO_FASTIO_RETURN_CREDITS   = 2, /**< Give credits to IPP */
+  NETIO_FASTIO_SEND_PKT_NOCK    = 3, /**< Send a packet, no checksum */
+  NETIO_FASTIO_SEND_PKT_CK      = 4, /**< Send a packet, with checksum */
+  NETIO_FASTIO_SEND_PKT_VEC     = 5, /**< Send a vector of packets */
+  NETIO_FASTIO_SENDV_PKT        = 6, /**< Sendv one packet */
+  NETIO_FASTIO_NUM_INDEX        = 7, /**< Total number of fast I/O indices */
+} netio_fastio_index_t;
+
+/** 3-word return type for Fast I/O call. */
+typedef struct
+{
+  int err;            /**< Error code. */
+  uint32_t val0;      /**< Value.  Meaning depends upon the specific call. */
+  uint32_t val1;      /**< Value.  Meaning depends upon the specific call. */
+} netio_fastio_rv3_t;
+
+/** 0-argument fast I/O call */
+int __netio_fastio0(uint32_t fastio_index);
+/** 1-argument fast I/O call */
+int __netio_fastio1(uint32_t fastio_index, uint32_t arg0);
+/** 3-argument fast I/O call, 2-word return value */
+netio_fastio_rv3_t __netio_fastio3_rv3(uint32_t fastio_index, uint32_t arg0,
+                                       uint32_t arg1, uint32_t arg2);
+/** 4-argument fast I/O call */
+int __netio_fastio4(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3);
+/** 6-argument fast I/O call */
+int __netio_fastio6(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3, uint32_t arg4, uint32_t arg5);
+/** 9-argument fast I/O call */
+int __netio_fastio9(uint32_t fastio_index, uint32_t arg0, uint32_t arg1,
+                    uint32_t arg2, uint32_t arg3, uint32_t arg4, uint32_t arg5,
+                    uint32_t arg6, uint32_t arg7, uint32_t arg8);
+
+/** Allocate an empty packet.
+ * @param fastio_index Fast I/O index.
+ * @param size Size of the packet to allocate.
+ */
+#define __netio_fastio_allocate(fastio_index, size) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_ALLOCATE, size)
+
+/** Free a buffer.
+ * @param fastio_index Fast I/O index.
+ * @param handle Handle for the packet to free.
+ */
+#define __netio_fastio_free_buffer(fastio_index, handle) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_FREE_BUFFER, handle)
+
+/** Increment our receive credits.
+ * @param fastio_index Fast I/O index.
+ * @param credits Number of credits to add.
+ */
+#define __netio_fastio_return_credits(fastio_index, credits) \
+  __netio_fastio1((fastio_index) + NETIO_FASTIO_RETURN_CREDITS, credits)
+
+/** Send packet, no checksum.
+ * @param fastio_index Fast I/O index.
+ * @param ackflag Nonzero if we want an ack.
+ * @param size Size of the packet.
+ * @param va Virtual address of start of packet.
+ * @param handle Packet handle.
+ */
+#define __netio_fastio_send_pkt_nock(fastio_index, ackflag, size, va, handle) \
+  __netio_fastio4((fastio_index) + NETIO_FASTIO_SEND_PKT_NOCK, ackflag, \
+                  size, va, handle)
+
+/** Send packet, calculate checksum.
+ * @param fastio_index Fast I/O index.
+ * @param ackflag Nonzero if we want an ack.
+ * @param size Size of the packet.
+ * @param va Virtual address of start of packet.
+ * @param handle Packet handle.
+ * @param csum0 Shim checksum header.
+ * @param csum1 Checksum seed.
+ */
+#define __netio_fastio_send_pkt_ck(fastio_index, ackflag, size, va, handle, \
+                                   csum0, csum1) \
+  __netio_fastio6((fastio_index) + NETIO_FASTIO_SEND_PKT_CK, ackflag, \
+                  size, va, handle, csum0, csum1)
+
+
+/** Format for the "csum0" argument to the __netio_fastio_send routines
+ * and LEPP.  Note that this is currently exactly identical to the
+ * ShimProtocolOffloadHeader.
+ */
+typedef union
+{
+  struct
+  {
+    unsigned int start_byte:7;       /**< The first byte to be checksummed */
+    unsigned int count:14;           /**< Number of bytes to be checksummed. */
+    unsigned int destination_byte:7; /**< The byte to write the checksum to. */
+    unsigned int reserved:4;         /**< Reserved. */
+  } bits;                            /**< Decomposed method of access. */
+  unsigned int word;                 /**< To send out the IDN. */
+} __netio_checksum_header_t;
+
+
+/** Sendv packet with 1 or 2 segments.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits; number of segments minus
+ *        1 in next 2 bits; expected checksum in high 16 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; if zero, no checksum.
+ * @param va_F Virtual address of first segment.
+ * @param va_L Virtual address of last segment, if 2 segments.
+ * @param len_F_L Length of first segment in low 16 bits; length of last
+ *        segment, if 2 segments, in high 16 bits.
+ */
+#define __netio_fastio_sendv_pkt_1_2(fastio_index, flags, confno, csum0, \
+                                     va_F, va_L, len_F_L) \
+  __netio_fastio6((fastio_index) + NETIO_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L)
+
+/** Send packet on PCIe interface.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; Hard wired 0, not needed for PCIe.
+ * @param va_F Virtual address of the packet buffer.
+ * @param va_L Virtual address of last segment, if 2 segments. Hard wired 0.
+ * @param len_F_L Length of the packet buffer in low 16 bits.
+ */
+#define __netio_fastio_send_pcie_pkt(fastio_index, flags, confno, csum0, \
+                                     va_F, va_L, len_F_L) \
+  __netio_fastio6((fastio_index) + PCIE_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L)
+
+/** Sendv packet with 3 or 4 segments.
+ * @param fastio_index Fast I/O index.
+ * @param flags Ack/csum/notify flags in low 3 bits; number of segments minus
+ *        1 in next 2 bits; expected checksum in high 16 bits.
+ * @param confno Confirmation number to request, if notify flag set.
+ * @param csum0 Checksum descriptor; if zero, no checksum.
+ * @param va_F Virtual address of first segment.
+ * @param va_L Virtual address of last segment (third segment if 3 segments,
+ *        fourth segment if 4 segments).
+ * @param len_F_L Length of first segment in low 16 bits; length of last
+ *        segment in high 16 bits.
+ * @param va_M0 Virtual address of "middle 0" segment; this segment is sent
+ *        second when there are three segments, and third if there are four.
+ * @param va_M1 Virtual address of "middle 1" segment; this segment is sent
+ *        second when there are four segments.
+ * @param len_M0_M1 Length of middle 0 segment in low 16 bits; length of middle
+ *        1 segment, if 4 segments, in high 16 bits.
+ */
+#define __netio_fastio_sendv_pkt_3_4(fastio_index, flags, confno, csum0, va_F, \
+                                     va_L, len_F_L, va_M0, va_M1, len_M0_M1) \
+  __netio_fastio9((fastio_index) + NETIO_FASTIO_SENDV_PKT, flags, confno, \
+                  csum0, va_F, va_L, len_F_L, va_M0, va_M1, len_M0_M1)
+
+/** Send vector of packets.
+ * @param fastio_index Fast I/O index.
+ * @param seqno Number of packets transmitted so far on this interface;
+ *        used to decide which packets should be acknowledged.
+ * @param nentries Number of entries in vector.
+ * @param va Virtual address of start of vector entry array.
+ * @return 3-word netio_fastio_rv3_t structure.  The structure's err member
+ *         is an error code, or zero if no error.  The val0 member is the
+ *         updated value of seqno; it has been incremented by 1 for each
+ *         packet sent.  That increment may be less than nentries if an
+ *         error occured, or if some of the entries in the vector contain
+ *         handles equal to NETIO_PKT_HANDLE_NONE.  The val1 member is the
+ *         updated value of nentries; it has been decremented by 1 for each
+ *         vector entry processed.  Again, that decrement may be less than
+ *         nentries (leaving the returned value positive) if an error
+ *         occurred.
+ */
+#define __netio_fastio_send_pkt_vec(fastio_index, seqno, nentries, va) \
+  __netio_fastio3_rv3((fastio_index) + NETIO_FASTIO_SEND_PKT_VEC, seqno, \
+                      nentries, va)
+
+
+/** An egress DMA command for LEPP. */
+typedef struct
+{
+  /** Is this a TSO transfer?
+   *
+   * NOTE: This field is always 0, to distinguish it from
+   * lepp_tso_cmd_t.  It must come first!
+   */
+  uint8_t tso               : 1;
+
+  /** Unused padding bits. */
+  uint8_t _unused           : 3;
+
+  /** Should this packet be sent directly from caches instead of DRAM,
+   * using hash-for-home to locate the packet data?
+   */
+  uint8_t hash_for_home     : 1;
+
+  /** Should we compute a checksum? */
+  uint8_t compute_checksum  : 1;
+
+  /** Is this the final buffer for this packet?
+   *
+   * A single packet can be split over several input buffers (a "gather"
+   * operation).  This flag indicates that this is the last buffer
+   * in a packet.
+   */
+  uint8_t end_of_packet     : 1;
+
+  /** Should LEPP advance 'comp_busy' when this DMA is fully finished? */
+  uint8_t send_completion   : 1;
+
+  /** High bits of Client Physical Address of the start of the buffer
+   *  to be egressed.
+   *
+   *  NOTE: Only 6 bits are actually needed here, as CPAs are
+   *  currently 38 bits.  So two bits could be scavenged from this.
+   */
+  uint8_t cpa_hi;
+
+  /** The number of bytes to be egressed. */
+  uint16_t length;
+
+  /** Low 32 bits of Client Physical Address of the start of the buffer
+   *  to be egressed.
+   */
+  uint32_t cpa_lo;
+
+  /** Checksum information (only used if 'compute_checksum'). */
+  __netio_checksum_header_t checksum_data;
+
+} lepp_cmd_t;
+
+
+/** A chunk of physical memory for a TSO egress. */
+typedef struct
+{
+  /** The low bits of the CPA. */
+  uint32_t cpa_lo;
+  /** The high bits of the CPA. */
+  uint16_t cpa_hi		: 15;
+  /** Should this packet be sent directly from caches instead of DRAM,
+   *  using hash-for-home to locate the packet data?
+   */
+  uint16_t hash_for_home	: 1;
+  /** The length in bytes. */
+  uint16_t length;
+} lepp_frag_t;
+
+
+/** An LEPP command that handles TSO. */
+typedef struct
+{
+  /** Is this a TSO transfer?
+   *
+   *  NOTE: This field is always 1, to distinguish it from
+   *  lepp_cmd_t.  It must come first!
+   */
+  uint8_t tso             : 1;
+
+  /** Unused padding bits. */
+  uint8_t _unused         : 7;
+
+  /** Size of the header[] array in bytes.  It must be in the range
+   *  [40, 127], which are the smallest header for a TCP packet over
+   *  Ethernet and the maximum possible prepend size supported by
+   *  hardware, respectively.  Note that the array storage must be
+   *  padded out to a multiple of four bytes so that the following
+   *  LEPP command is aligned properly.
+   */
+  uint8_t header_size;
+
+  /** Byte offset of the IP header in header[]. */
+  uint8_t ip_offset;
+
+  /** Byte offset of the TCP header in header[]. */
+  uint8_t tcp_offset;
+
+  /** The number of bytes to use for the payload of each packet,
+   *  except of course the last one, which may not have enough bytes.
+   *  This means that each Ethernet packet except the last will have a
+   *  size of header_size + payload_size.
+   */
+  uint16_t payload_size;
+
+  /** The length of the 'frags' array that follows this struct. */
+  uint16_t num_frags;
+
+  /** The actual frags. */
+  lepp_frag_t frags[0 /* Variable-sized; num_frags entries. */];
+
+  /*
+   * The packet header template logically follows frags[],
+   * but you can't declare that in C.
+   *
+   * uint32_t header[header_size_in_words_rounded_up];
+   */
+
+} lepp_tso_cmd_t;
+
+
+/** An LEPP completion ring entry. */
+typedef void* lepp_comp_t;
+
+
+/** Maximum number of frags for one TSO command.  This is adapted from
+ *  linux's "MAX_SKB_FRAGS", and presumably over-estimates by one, for
+ *  our page size of exactly 65536.  We add one for a "body" fragment.
+ */
+#define LEPP_MAX_FRAGS (65536 / HV_PAGE_SIZE_SMALL + 2 + 1)
+
+/** Total number of bytes needed for an lepp_tso_cmd_t. */
+#define LEPP_TSO_CMD_SIZE(num_frags, header_size) \
+  (sizeof(lepp_tso_cmd_t) + \
+   (num_frags) * sizeof(lepp_frag_t) + \
+   (((header_size) + 3) & -4))
+
+/** The size of the lepp "cmd" queue. */
+#define LEPP_CMD_QUEUE_BYTES \
+ (((CHIP_L2_CACHE_SIZE() - 2 * CHIP_L2_LINE_SIZE()) / \
+  (sizeof(lepp_cmd_t) + sizeof(lepp_comp_t))) * sizeof(lepp_cmd_t))
+
+/** The largest possible command that can go in lepp_queue_t::cmds[]. */
+#define LEPP_MAX_CMD_SIZE LEPP_TSO_CMD_SIZE(LEPP_MAX_FRAGS, 128)
+
+/** The largest possible value of lepp_queue_t::cmd_{head, tail} (inclusive).
+ */
+#define LEPP_CMD_LIMIT \
+  (LEPP_CMD_QUEUE_BYTES - LEPP_MAX_CMD_SIZE)
+
+/** The maximum number of completions in an LEPP queue. */
+#define LEPP_COMP_QUEUE_SIZE \
+  ((LEPP_CMD_LIMIT + sizeof(lepp_cmd_t) - 1) / sizeof(lepp_cmd_t))
+
+/** Increment an index modulo the queue size. */
+#define LEPP_QINC(var) \
+  (var = __insn_mnz(var - (LEPP_COMP_QUEUE_SIZE - 1), var + 1))
+
+/** A queue used to convey egress commands from the client to LEPP. */
+typedef struct
+{
+  /** Index of first completion not yet processed by user code.
+   *  If this is equal to comp_busy, there are no such completions.
+   *
+   *  NOTE: This is only read/written by the user.
+   */
+  unsigned int comp_head;
+
+  /** Index of first completion record not yet completed.
+   *  If this is equal to comp_tail, there are no such completions.
+   *  This index gets advanced (modulo LEPP_QUEUE_SIZE) whenever
+   *  a command with the 'completion' bit set is finished.
+   *
+   *  NOTE: This is only written by LEPP, only read by the user.
+   */
+  volatile unsigned int comp_busy;
+
+  /** Index of the first empty slot in the completion ring.
+   *  Entries from this up to but not including comp_head (in ring order)
+   *  can be filled in with completion data.
+   *
+   *  NOTE: This is only read/written by the user.
+   */
+  unsigned int comp_tail;
+
+  /** Byte index of first command enqueued for LEPP but not yet processed.
+   *
+   *  This is always divisible by sizeof(void*) and always <= LEPP_CMD_LIMIT.
+   *
+   *  NOTE: LEPP advances this counter as soon as it no longer needs
+   *  the cmds[] storage for this entry, but the transfer is not actually
+   *  complete (i.e. the buffer pointed to by the command is no longer
+   *  needed) until comp_busy advances.
+   *
+   *  If this is equal to cmd_tail, the ring is empty.
+   *
+   *  NOTE: This is only written by LEPP, only read by the user.
+   */
+  volatile unsigned int cmd_head;
+
+  /** Byte index of first empty slot in the command ring.  This field can
+   *  be incremented up to but not equal to cmd_head (because that would
+   *  mean the ring is empty).
+   *
+   *  This is always divisible by sizeof(void*) and always <= LEPP_CMD_LIMIT.
+   *
+   *  NOTE: This is read/written by the user, only read by LEPP.
+   */
+  volatile unsigned int cmd_tail;
+
+  /** A ring of variable-sized egress DMA commands.
+   *
+   *  NOTE: Only written by the user, only read by LEPP.
+   */
+  char cmds[LEPP_CMD_QUEUE_BYTES]
+    __attribute__((aligned(CHIP_L2_LINE_SIZE())));
+
+  /** A ring of user completion data.
+   *  NOTE: Only read/written by the user.
+   */
+  lepp_comp_t comps[LEPP_COMP_QUEUE_SIZE]
+    __attribute__((aligned(CHIP_L2_LINE_SIZE())));
+} lepp_queue_t;
+
+
+/** An internal helper function for determining the number of entries
+ *  available in a ring buffer, given that there is one sentinel.
+ */
+static inline unsigned int
+_lepp_num_free_slots(unsigned int head, unsigned int tail)
+{
+  /*
+   * One entry is reserved for use as a sentinel, to distinguish
+   * "empty" from "full".  So we compute
+   * (head - tail - 1) % LEPP_QUEUE_SIZE, but without using a slow % operation.
+   */
+  return (head - tail - 1) + ((head <= tail) ? LEPP_COMP_QUEUE_SIZE : 0);
+}
+
+
+/** Returns how many new comp entries can be enqueued. */
+static inline unsigned int
+lepp_num_free_comp_slots(const lepp_queue_t* q)
+{
+  return _lepp_num_free_slots(q->comp_head, q->comp_tail);
+}
+
+static inline int
+lepp_qsub(int v1, int v2)
+{
+  int delta = v1 - v2;
+  return delta + ((delta >> 31) & LEPP_COMP_QUEUE_SIZE);
+}
+
+
+/** FIXME: Check this from linux, via a new "pwrite()" call. */
+#define LIPP_VERSION 1
+
+
+/** We use exactly two bytes of alignment padding. */
+#define LIPP_PACKET_PADDING 2
+
+/** The minimum size of a "small" buffer (including the padding). */
+#define LIPP_SMALL_PACKET_SIZE 128
+
+/*
+ * NOTE: The following two values should total to less than around
+ * 13582, to keep the total size used for "lipp_state_t" below 64K.
+ */
+
+/** The maximum number of "small" buffers.
+ *  This is enough for 53 network cpus with 128 credits.  Note that
+ *  if these are exhausted, we will fall back to using large buffers.
+ */
+#define LIPP_SMALL_BUFFERS 6785
+
+/** The maximum number of "large" buffers.
+ *  This is enough for 53 network cpus with 128 credits.
+ */
+#define LIPP_LARGE_BUFFERS 6785
+
+#endif /* __DRV_XGBE_INTF_H__ */
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index 9bd303a..f672544 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -1003,37 +1003,37 @@ int hv_console_write(HV_VirtAddr bytes, int len);
  *  when these occur in a client's interrupt critical section, they must
  *  be delivered through the downcall mechanism.
  *
- *  A downcall is initially delivered to the client as an INTCTRL_1
- *  interrupt.  Upon entry to the INTCTRL_1 vector, the client must
- *  immediately invoke the hv_downcall_dispatch service.  This service
- *  will not return; instead it will cause one of the client's actual
- *  downcall-handling interrupt vectors to be entered.  The EX_CONTEXT
- *  registers in the client will be set so that when the client irets,
- *  it will return to the code which was interrupted by the INTCTRL_1
- *  interrupt.
- *
- *  Under some circumstances, the firing of INTCTRL_1 can race with
+ *  A downcall is initially delivered to the client as an INTCTRL_CL
+ *  interrupt, where CL is the client's PL.  Upon entry to the INTCTRL_CL
+ *  vector, the client must immediately invoke the hv_downcall_dispatch
+ *  service.  This service will not return; instead it will cause one of
+ *  the client's actual downcall-handling interrupt vectors to be entered.
+ *  The EX_CONTEXT registers in the client will be set so that when the
+ *  client irets, it will return to the code which was interrupted by the
+ *  INTCTRL_CL interrupt.
+ *
+ *  Under some circumstances, the firing of INTCTRL_CL can race with
  *  the lowering of a device interrupt.  In such a case, the
  *  hv_downcall_dispatch service may issue an iret instruction instead
  *  of entering one of the client's actual downcall-handling interrupt
  *  vectors.  This will return execution to the location that was
- *  interrupted by INTCTRL_1.
+ *  interrupted by INTCTRL_CL.
  *
  *  Any saving of registers should be done by the actual handling
- *  vectors; no registers should be changed by the INTCTRL_1 handler.
+ *  vectors; no registers should be changed by the INTCTRL_CL handler.
  *  In particular, the client should not use a jal instruction to invoke
  *  the hv_downcall_dispatch service, as that would overwrite the client's
  *  lr register.  Note that the hv_downcall_dispatch service may overwrite
  *  one or more of the client's system save registers.
  *
- *  The client must not modify the INTCTRL_1_STATUS SPR.  The hypervisor
+ *  The client must not modify the INTCTRL_CL_STATUS SPR.  The hypervisor
  *  will set this register to cause a downcall to happen, and will clear
  *  it when no further downcalls are pending.
  *
- *  When a downcall vector is entered, the INTCTRL_1 interrupt will be
+ *  When a downcall vector is entered, the INTCTRL_CL interrupt will be
  *  masked.  When the client is done processing a downcall, and is ready
  *  to accept another, it must unmask this interrupt; if more downcalls
- *  are pending, this will cause the INTCTRL_1 vector to be reentered.
+ *  are pending, this will cause the INTCTRL_CL vector to be reentered.
  *  Currently the following interrupt vectors can be entered through a
  *  downcall:
  *
diff --git a/arch/tile/include/hv/netio_errors.h b/arch/tile/include/hv/netio_errors.h
new file mode 100644
index 0000000..e1591bf
--- /dev/null
+++ b/arch/tile/include/hv/netio_errors.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Error codes returned from NetIO routines.
+ */
+
+#ifndef __NETIO_ERRORS_H__
+#define __NETIO_ERRORS_H__
+
+/**
+ * @addtogroup error
+ *
+ * @brief The error codes returned by NetIO functions.
+ *
+ * NetIO functions return 0 (defined as ::NETIO_NO_ERROR) on success, and
+ * a negative value if an error occurs.
+ *
+ * In cases where a NetIO function failed due to a error reported by
+ * system libraries, the error code will be the negation of the
+ * system errno at the time of failure.  The @ref netio_strerror()
+ * function will deliver error strings for both NetIO and system error
+ * codes.
+ *
+ * @{
+ */
+
+/** The set of all NetIO errors. */
+typedef enum
+{
+  /** Operation successfully completed. */
+  NETIO_NO_ERROR        = 0,
+
+  /** A packet was successfully retrieved from an input queue. */
+  NETIO_PKT             = 0,
+
+  /** Largest NetIO error number. */
+  NETIO_ERR_MAX         = -701,
+
+  /** The tile is not registered with the IPP. */
+  NETIO_NOT_REGISTERED  = -701,
+
+  /** No packet was available to retrieve from the input queue. */
+  NETIO_NOPKT           = -702,
+
+  /** The requested function is not implemented. */
+  NETIO_NOT_IMPLEMENTED = -703,
+
+  /** On a registration operation, the target queue already has the maximum
+   *  number of tiles registered for it, and no more may be added.  On a
+   *  packet send operation, the output queue is full and nothing more can
+   *  be queued until some of the queued packets are actually transmitted. */
+  NETIO_QUEUE_FULL      = -704,
+
+  /** The calling process or thread is not bound to exactly one CPU. */
+  NETIO_BAD_AFFINITY    = -705,
+
+  /** Cannot allocate memory on requested controllers. */
+  NETIO_CANNOT_HOME     = -706,
+
+  /** On a registration operation, the IPP specified is not configured
+   *  to support the options requested; for instance, the application
+   *  wants a specific type of tagged headers which the configured IPP
+   *  doesn't support.  Or, the supplied configuration information is
+   *  not self-consistent, or is out of range; for instance, specifying
+   *  both NETIO_RECV and NETIO_NO_RECV, or asking for more than
+   *  NETIO_MAX_SEND_BUFFERS to be preallocated.  On a VLAN or bucket
+   *  configure operation, the number of items, or the base item, was
+   *  out of range.
+   */
+  NETIO_BAD_CONFIG      = -707,
+
+  /** Too many tiles have registered to transmit packets. */
+  NETIO_TOOMANY_XMIT    = -708,
+
+  /** Packet transmission was attempted on a queue which was registered
+      with transmit disabled. */
+  NETIO_UNREG_XMIT      = -709,
+
+  /** This tile is already registered with the IPP. */
+  NETIO_ALREADY_REGISTERED = -710,
+
+  /** The Ethernet link is down. The application should try again later. */
+  NETIO_LINK_DOWN       = -711,
+
+  /** An invalid memory buffer has been specified.  This may be an unmapped
+   * virtual address, or one which does not meet alignment requirements.
+   * For netio_input_register(), this error may be returned when multiple
+   * processes specify different memory regions to be used for NetIO
+   * buffers.  That can happen if these processes specify explicit memory
+   * regions with the ::NETIO_FIXED_BUFFER_VA flag, or if tmc_cmem_init()
+   * has not been called by a common ancestor of the processes.
+   */
+  NETIO_FAULT           = -712,
+
+  /** Cannot combine user-managed shared memory and cache coherence. */
+  NETIO_BAD_CACHE_CONFIG = -713,
+
+  /** Smallest NetIO error number. */
+  NETIO_ERR_MIN         = -713,
+
+#ifndef __DOXYGEN__
+  /** Used internally to mean that no response is needed; never returned to
+   *  an application. */
+  NETIO_NO_RESPONSE     = 1
+#endif
+} netio_error_t;
+
+/** @} */
+
+#endif /* __NETIO_ERRORS_H__ */
diff --git a/arch/tile/include/hv/netio_intf.h b/arch/tile/include/hv/netio_intf.h
new file mode 100644
index 0000000..8d20972
--- /dev/null
+++ b/arch/tile/include/hv/netio_intf.h
@@ -0,0 +1,2975 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * NetIO interface structures and macros.
+ */
+
+#ifndef __NETIO_INTF_H__
+#define __NETIO_INTF_H__
+
+#include <hv/netio_errors.h>
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+#if !defined(__HV__) && !defined(__BOGUX__) && !defined(__KERNEL__)
+#include <assert.h>
+#define netio_assert assert  /**< Enable assertions from macros */
+#else
+#define netio_assert(...) ((void)(0))  /**< Disable assertions from macros */
+#endif
+
+/*
+ * If none of these symbols are defined, we're building libnetio in an
+ * environment where we have pthreads, so we'll enable locking.
+ */
+#if !defined(__HV__) && !defined(__BOGUX__) && !defined(__KERNEL__) && \
+    !defined(__NEWLIB__)
+#define _NETIO_PTHREAD       /**< Include a mutex in netio_queue_t below */
+
+/*
+ * If NETIO_UNLOCKED is defined, we don't do use per-cpu locks on
+ * per-packet NetIO operations.  We still do pthread locking on things
+ * like netio_input_register, though.  This is used for building
+ * libnetio_unlocked.
+ */
+#ifndef NETIO_UNLOCKED
+
+/* Avoid PLT overhead by using our own inlined per-cpu lock. */
+#include <sched.h>
+typedef int _netio_percpu_mutex_t;
+
+static __inline int
+_netio_percpu_mutex_init(_netio_percpu_mutex_t* lock)
+{
+  *lock = 0;
+  return 0;
+}
+
+static __inline int
+_netio_percpu_mutex_lock(_netio_percpu_mutex_t* lock)
+{
+  while (__builtin_expect(__insn_tns(lock), 0))
+    sched_yield();
+  return 0;
+}
+
+static __inline int
+_netio_percpu_mutex_unlock(_netio_percpu_mutex_t* lock)
+{
+  *lock = 0;
+  return 0;
+}
+
+#else /* NETIO_UNLOCKED */
+
+/* Don't do any locking for per-packet NetIO operations. */
+typedef int _netio_percpu_mutex_t;
+#define _netio_percpu_mutex_init(L)
+#define _netio_percpu_mutex_lock(L)
+#define _netio_percpu_mutex_unlock(L)
+
+#endif /* NETIO_UNLOCKED */
+#endif /* !__HV__, !__BOGUX, !__KERNEL__, !__NEWLIB__ */
+
+/** How many tiles can register for a given queue.
+ *  @ingroup setup */
+#define NETIO_MAX_TILES_PER_QUEUE  64
+
+
+/** Largest permissible queue identifier.
+ *  @ingroup setup  */
+#define NETIO_MAX_QUEUE_ID        255
+
+
+#ifndef __DOXYGEN__
+
+/* Metadata packet checksum/ethertype flags. */
+
+/** The L4 checksum has not been calculated. */
+#define _NETIO_PKT_NO_L4_CSUM_SHIFT           0
+#define _NETIO_PKT_NO_L4_CSUM_RMASK           1
+#define _NETIO_PKT_NO_L4_CSUM_MASK \
+         (_NETIO_PKT_NO_L4_CSUM_RMASK << _NETIO_PKT_NO_L4_CSUM_SHIFT)
+
+/** The L3 checksum has not been calculated. */
+#define _NETIO_PKT_NO_L3_CSUM_SHIFT           1
+#define _NETIO_PKT_NO_L3_CSUM_RMASK           1
+#define _NETIO_PKT_NO_L3_CSUM_MASK \
+         (_NETIO_PKT_NO_L3_CSUM_RMASK << _NETIO_PKT_NO_L3_CSUM_SHIFT)
+
+/** The L3 checksum is incorrect (or perhaps has not been calculated). */
+#define _NETIO_PKT_BAD_L3_CSUM_SHIFT          2
+#define _NETIO_PKT_BAD_L3_CSUM_RMASK          1
+#define _NETIO_PKT_BAD_L3_CSUM_MASK \
+         (_NETIO_PKT_BAD_L3_CSUM_RMASK << _NETIO_PKT_BAD_L3_CSUM_SHIFT)
+
+/** The Ethernet packet type is unrecognized. */
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_SHIFT    3
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_RMASK    1
+#define _NETIO_PKT_TYPE_UNRECOGNIZED_MASK \
+         (_NETIO_PKT_TYPE_UNRECOGNIZED_RMASK << \
+          _NETIO_PKT_TYPE_UNRECOGNIZED_SHIFT)
+
+/* Metadata packet type flags. */
+
+/** Where the packet type bits are; this field is the index into
+ *  _netio_pkt_info. */
+#define _NETIO_PKT_TYPE_SHIFT        4
+#define _NETIO_PKT_TYPE_RMASK        0x3F
+
+/** How many VLAN tags the packet has, and, if we have two, which one we
+ *  actually grouped on.  A VLAN within a proprietary (Marvell or Broadcom)
+ *  tag is counted here. */
+#define _NETIO_PKT_VLAN_SHIFT        4
+#define _NETIO_PKT_VLAN_RMASK        0x3
+#define _NETIO_PKT_VLAN_MASK \
+         (_NETIO_PKT_VLAN_RMASK << _NETIO_PKT_VLAN_SHIFT)
+#define _NETIO_PKT_VLAN_NONE         0   /* No VLAN tag. */
+#define _NETIO_PKT_VLAN_ONE          1   /* One VLAN tag. */
+#define _NETIO_PKT_VLAN_TWO_OUTER    2   /* Two VLAN tags, outer one used. */
+#define _NETIO_PKT_VLAN_TWO_INNER    3   /* Two VLAN tags, inner one used. */
+
+/** Which proprietary tags the packet has. */
+#define _NETIO_PKT_TAG_SHIFT         6
+#define _NETIO_PKT_TAG_RMASK         0x3
+#define _NETIO_PKT_TAG_MASK \
+          (_NETIO_PKT_TAG_RMASK << _NETIO_PKT_TAG_SHIFT)
+#define _NETIO_PKT_TAG_NONE          0   /* No proprietary tags. */
+#define _NETIO_PKT_TAG_MRVL          1   /* Marvell HyperG.Stack tags. */
+#define _NETIO_PKT_TAG_MRVL_EXT      2   /* HyperG.Stack extended tags. */
+#define _NETIO_PKT_TAG_BRCM          3   /* Broadcom HiGig tags. */
+
+/** Whether a packet has an LLC + SNAP header. */
+#define _NETIO_PKT_SNAP_SHIFT        8
+#define _NETIO_PKT_SNAP_RMASK        0x1
+#define _NETIO_PKT_SNAP_MASK \
+          (_NETIO_PKT_SNAP_RMASK << _NETIO_PKT_SNAP_SHIFT)
+
+/* NOTE: Bits 9 and 10 are unused. */
+
+/** Length of any custom data before the L2 header, in words. */
+#define _NETIO_PKT_CUSTOM_LEN_SHIFT  11
+#define _NETIO_PKT_CUSTOM_LEN_RMASK  0x1F
+#define _NETIO_PKT_CUSTOM_LEN_MASK \
+          (_NETIO_PKT_CUSTOM_LEN_RMASK << _NETIO_PKT_CUSTOM_LEN_SHIFT)
+
+/** The L4 checksum is incorrect (or perhaps has not been calculated). */
+#define _NETIO_PKT_BAD_L4_CSUM_SHIFT 16
+#define _NETIO_PKT_BAD_L4_CSUM_RMASK 0x1
+#define _NETIO_PKT_BAD_L4_CSUM_MASK \
+          (_NETIO_PKT_BAD_L4_CSUM_RMASK << _NETIO_PKT_BAD_L4_CSUM_SHIFT)
+
+/** Length of the L2 header, in words. */
+#define _NETIO_PKT_L2_LEN_SHIFT  17
+#define _NETIO_PKT_L2_LEN_RMASK  0x1F
+#define _NETIO_PKT_L2_LEN_MASK \
+          (_NETIO_PKT_L2_LEN_RMASK << _NETIO_PKT_L2_LEN_SHIFT)
+
+
+/* Flags in minimal packet metadata. */
+
+/** We need an eDMA checksum on this packet. */
+#define _NETIO_PKT_NEED_EDMA_CSUM_SHIFT            0
+#define _NETIO_PKT_NEED_EDMA_CSUM_RMASK            1
+#define _NETIO_PKT_NEED_EDMA_CSUM_MASK \
+         (_NETIO_PKT_NEED_EDMA_CSUM_RMASK << _NETIO_PKT_NEED_EDMA_CSUM_SHIFT)
+
+/* Data within the packet information table. */
+
+/* Note that, for efficiency, code which uses these fields assumes that none
+ * of the shift values below are zero.  See uses below for an explanation. */
+
+/** Offset within the L2 header of the innermost ethertype (in halfwords). */
+#define _NETIO_PKT_INFO_ETYPE_SHIFT       6
+#define _NETIO_PKT_INFO_ETYPE_RMASK    0x1F
+
+/** Offset within the L2 header of the VLAN tag (in halfwords). */
+#define _NETIO_PKT_INFO_VLAN_SHIFT       11
+#define _NETIO_PKT_INFO_VLAN_RMASK     0x1F
+
+#endif
+
+
+/** The size of a memory buffer representing a small packet.
+ *  @ingroup egress */
+#define SMALL_PACKET_SIZE 256
+
+/** The size of a memory buffer representing a large packet.
+ *  @ingroup egress */
+#define LARGE_PACKET_SIZE 2048
+
+/** The size of a memory buffer representing a jumbo packet.
+ *  @ingroup egress */
+#define JUMBO_PACKET_SIZE (12 * 1024)
+
+
+/* Common ethertypes.
+ * @ingroup ingress */
+/** @{ */
+/** The ethertype of IPv4. */
+#define ETHERTYPE_IPv4 (0x0800)
+/** The ethertype of ARP. */
+#define ETHERTYPE_ARP (0x0806)
+/** The ethertype of VLANs. */
+#define ETHERTYPE_VLAN (0x8100)
+/** The ethertype of a Q-in-Q header. */
+#define ETHERTYPE_Q_IN_Q (0x9100)
+/** The ethertype of IPv6. */
+#define ETHERTYPE_IPv6 (0x86DD)
+/** The ethertype of MPLS. */
+#define ETHERTYPE_MPLS (0x8847)
+/** @} */
+
+
+/** The possible return values of NETIO_PKT_STATUS.
+ * @ingroup ingress
+ */
+typedef enum
+{
+  /** No problems were detected with this packet. */
+  NETIO_PKT_STATUS_OK,
+  /** The packet is undersized; this is expected behavior if the packet's
+    * ethertype is unrecognized, but otherwise the packet is likely corrupt. */
+  NETIO_PKT_STATUS_UNDERSIZE,
+  /** The packet is oversized and some trailing bytes have been discarded.
+      This is expected behavior for short packets, since it's impossible to
+      precisely determine the amount of padding which may have been added to
+      them to make them meet the minimum Ethernet packet size. */
+  NETIO_PKT_STATUS_OVERSIZE,
+  /** The packet was judged to be corrupt by hardware (for instance, it had
+      a bad CRC, or part of it was discarded due to lack of buffer space in
+      the I/O shim) and should be discarded. */
+  NETIO_PKT_STATUS_BAD
+} netio_pkt_status_t;
+
+
+/** Log2 of how many buckets we have. */
+#define NETIO_LOG2_NUM_BUCKETS (10)
+
+/** How many buckets we have.
+ * @ingroup ingress */
+#define NETIO_NUM_BUCKETS (1 << NETIO_LOG2_NUM_BUCKETS)
+
+
+/**
+ * @brief A group-to-bucket identifier.
+ *
+ * @ingroup setup
+ *
+ * This tells us what to do with a given group.
+ */
+typedef union {
+  /** The header broken down into bits. */
+  struct {
+    /** Whether we should balance on L4, if available */
+    unsigned int __balance_on_l4:1;
+    /** Whether we should balance on L3, if available */
+    unsigned int __balance_on_l3:1;
+    /** Whether we should balance on L2, if available */
+    unsigned int __balance_on_l2:1;
+    /** Reserved for future use */
+    unsigned int __reserved:1;
+    /** The base bucket to use to send traffic */
+    unsigned int __bucket_base:NETIO_LOG2_NUM_BUCKETS;
+    /** The mask to apply to the balancing value. This must be one less
+     * than a power of two, e.g. 0x3 or 0xFF.
+     */
+    unsigned int __bucket_mask:NETIO_LOG2_NUM_BUCKETS;
+    /** Pad to 32 bits */
+    unsigned int __padding:(32 - 4 - 2 * NETIO_LOG2_NUM_BUCKETS);
+  } bits;
+  /** To send out the IDN. */
+  unsigned int word;
+}
+netio_group_t;
+
+
+/**
+ * @brief A VLAN-to-bucket identifier.
+ *
+ * @ingroup setup
+ *
+ * This tells us what to do with a given VLAN.
+ */
+typedef netio_group_t netio_vlan_t;
+
+
+/**
+ * A bucket-to-queue mapping.
+ * @ingroup setup
+ */
+typedef unsigned char netio_bucket_t;
+
+
+/**
+ * A packet size can always fit in a netio_size_t.
+ * @ingroup setup
+ */
+typedef unsigned int netio_size_t;
+
+
+/**
+ * @brief Ethernet standard (ingress) packet metadata.
+ *
+ * @ingroup ingress
+ *
+ * This is additional data associated with each packet.
+ * This structure is opaque and accessed through the @ref ingress.
+ *
+ * Also, the buffer population operation currently assumes that standard
+ * metadata is at least as large as minimal metadata, and will need to be
+ * modified if that is no longer the case.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[24];
+#else
+  /** The overall ordinal of the packet */
+  unsigned int __packet_ordinal;
+  /** The ordinal of the packet within the group */
+  unsigned int __group_ordinal;
+  /** The best flow hash IPP could compute. */
+  unsigned int __flow_hash;
+  /** Flags pertaining to checksum calculation, packet type, etc. */
+  unsigned int __flags;
+  /** The first word of "user data". */
+  unsigned int __user_data_0;
+  /** The second word of "user data". */
+  unsigned int __user_data_1;
+#endif
+}
+netio_pkt_metadata_t;
+
+
+/** To ensure that the L3 header is aligned mod 4, the L2 header should be
+ * aligned mod 4 plus 2, since every supported L2 header is 4n + 2 bytes
+ * long.  The standard way to do this is to simply add 2 bytes of padding
+ * before the L2 header.
+ */
+#define NETIO_PACKET_PADDING 2
+
+
+
+/**
+ * @brief Ethernet minimal (egress) packet metadata.
+ *
+ * @ingroup egress
+ *
+ * This structure represents information about packets which have
+ * been processed by @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer().  This structure is opaque
+ * and accessed through the @ref egress.
+ *
+ * @internal This structure is actually copied into the memory used by
+ * standard metadata, which is assumed to be large enough.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[14];
+#else
+  /** The offset of the L2 header from the start of the packet data. */
+  unsigned short l2_offset;
+  /** The offset of the L3 header from the start of the packet data. */
+  unsigned short l3_offset;
+  /** Where to write the checksum. */
+  unsigned char csum_location;
+  /** Where to start checksumming from. */
+  unsigned char csum_start;
+  /** Flags pertaining to checksum calculation etc. */
+  unsigned short flags;
+  /** The L2 length of the packet. */
+  unsigned short l2_length;
+  /** The checksum with which to seed the checksum generator. */
+  unsigned short csum_seed;
+  /** How much to checksum. */
+  unsigned short csum_length;
+#endif
+}
+netio_pkt_minimal_metadata_t;
+
+
+#ifndef __DOXYGEN__
+
+/**
+ * @brief An I/O notification header.
+ *
+ * This is the first word of data received from an I/O shim in a notification
+ * packet. It contains framing and status information.
+ */
+typedef union
+{
+  unsigned int word; /**< The whole word. */
+  /** The various fields. */
+  struct
+  {
+    unsigned int __channel:7;    /**< Resource channel. */
+    unsigned int __type:4;       /**< Type. */
+    unsigned int __ack:1;        /**< Whether an acknowledgement is needed. */
+    unsigned int __reserved:1;   /**< Reserved. */
+    unsigned int __protocol:1;   /**< A protocol-specific word is added. */
+    unsigned int __status:2;     /**< Status of the transfer. */
+    unsigned int __framing:2;    /**< Framing of the transfer. */
+    unsigned int __transfer_size:14; /**< Transfer size in bytes (total). */
+  } bits;
+}
+__netio_pkt_notif_t;
+
+
+/**
+ * Returns the base address of the packet.
+ */
+#define _NETIO_PKT_HANDLE_BASE(p) \
+  ((unsigned char*)((p).word & 0xFFFFFFC0))
+
+/**
+ * Returns the base address of the packet.
+ */
+#define _NETIO_PKT_BASE(p) \
+  _NETIO_PKT_HANDLE_BASE(p->__packet)
+
+/**
+ * @brief An I/O notification packet (second word)
+ *
+ * This is the second word of data received from an I/O shim in a notification
+ * packet.  This is the virtual address of the packet buffer, plus some flag
+ * bits.  (The virtual address of the packet is always 256-byte aligned so we
+ * have room for 8 bits' worth of flags in the low 8 bits.)
+ *
+ * @internal
+ * NOTE: The low two bits must contain "__queue", so the "packet size"
+ * (SIZE_SMALL, SIZE_LARGE, or SIZE_JUMBO) can be determined quickly.
+ *
+ * If __addr or __offset are moved, _NETIO_PKT_BASE
+ * (defined right below this) must be changed.
+ */
+typedef union
+{
+  unsigned int word; /**< The whole word. */
+  /** The various fields. */
+  struct
+  {
+    /** Which queue the packet will be returned to once it is sent back to
+        the IPP.  This is one of the SIZE_xxx values. */
+    unsigned int __queue:2;
+
+    /** The IPP handle of the sending IPP. */
+    unsigned int __ipp_handle:2;
+
+    /** Reserved for future use. */
+    unsigned int __reserved:1;
+
+    /** If 1, this packet has minimal (egress) metadata; otherwise, it
+        has standard (ingress) metadata. */
+    unsigned int __minimal:1;
+
+    /** Offset of the metadata within the packet.  This value is multiplied
+     *  by 64 and added to the base packet address to get the metadata
+     *  address.  Note that this field is aligned within the word such that
+     *  you can easily extract the metadata address with a 26-bit mask. */
+    unsigned int __offset:2;
+
+    /** The top 24 bits of the packet's virtual address. */
+    unsigned int __addr:24;
+  } bits;
+}
+__netio_pkt_handle_t;
+
+#endif /* !__DOXYGEN__ */
+
+
+/**
+ * @brief A handle for an I/O packet's storage.
+ * @ingroup ingress
+ *
+ * netio_pkt_handle_t encodes the concept of a ::netio_pkt_t with its
+ * packet metadata removed.  It is a much smaller type that exists to
+ * facilitate applications where the full ::netio_pkt_t type is too
+ * large, such as those that cache enormous numbers of packets or wish
+ * to transmit packet descriptors over the UDN.
+ *
+ * Because there is no metadata, most ::netio_pkt_t operations cannot be
+ * performed on a netio_pkt_handle_t.  It supports only
+ * netio_free_handle() (to free the buffer) and
+ * NETIO_PKT_CUSTOM_DATA_H() (to access a pointer to its contents).
+ * The application must acquire any additional metadata it wants from the
+ * original ::netio_pkt_t and record it separately.
+ *
+ * A netio_pkt_handle_t can be extracted from a ::netio_pkt_t by calling
+ * NETIO_PKT_HANDLE().  An invalid handle (analogous to NULL) can be
+ * created by assigning the value ::NETIO_PKT_HANDLE_NONE. A handle can
+ * be tested for validity with NETIO_PKT_HANDLE_IS_VALID().
+ */
+typedef struct
+{
+  unsigned int word; /**< Opaque bits. */
+} netio_pkt_handle_t;
+
+/**
+ * @brief A packet descriptor.
+ *
+ * @ingroup ingress
+ * @ingroup egress
+ *
+ * This data structure represents a packet.  The structure is manipulated
+ * through the @ref ingress and the @ref egress.
+ *
+ * While the contents of a netio_pkt_t are opaque, the structure itself is
+ * portable.  This means that it may be shared between all tiles which have
+ * done a netio_input_register() call for the interface on which the pkt_t
+ * was initially received (via netio_get_packet()) or retrieved (via
+ * netio_get_buffer()).  The contents of a netio_pkt_t can be transmitted to
+ * another tile via shared memory, or via a UDN message, or by other means.
+ * The destination tile may then use the pkt_t as if it had originally been
+ * received locally; it may read or write the packet's data, read its
+ * metadata, free the packet, send the packet, transfer the netio_pkt_t to
+ * yet another tile, and so forth.
+ *
+ * Once a netio_pkt_t has been transferred to a second tile, the first tile
+ * should not reference the original copy; in particular, if more than one
+ * tile frees or sends the same netio_pkt_t, the IPP's packet free lists will
+ * become corrupted.  Note also that each tile which reads or modifies
+ * packet data must obey the memory coherency rules outlined in @ref input.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  /** This structure is opaque. */
+  unsigned char opaque[32];
+#else
+  /** For an ingress packet (one with standard metadata), this is the
+   *  notification header we got from the I/O shim.  For an egress packet
+   *  (one with minimal metadata), this word is zero if the packet has not
+   *  been populated, and nonzero if it has. */
+  __netio_pkt_notif_t __notif_header;
+
+  /** Virtual address of the packet buffer, plus state flags. */
+  __netio_pkt_handle_t __packet;
+
+  /** Metadata associated with the packet. */
+  netio_pkt_metadata_t __metadata;
+#endif
+}
+netio_pkt_t;
+
+
+#ifndef __DOXYGEN__
+
+#define __NETIO_PKT_NOTIF_HEADER(pkt) ((pkt)->__notif_header)
+#define __NETIO_PKT_IPP_HANDLE(pkt) ((pkt)->__packet.bits.__ipp_handle)
+#define __NETIO_PKT_QUEUE(pkt) ((pkt)->__packet.bits.__queue)
+#define __NETIO_PKT_NOTIF_HEADER_M(mda, pkt) ((pkt)->__notif_header)
+#define __NETIO_PKT_IPP_HANDLE_M(mda, pkt) ((pkt)->__packet.bits.__ipp_handle)
+#define __NETIO_PKT_MINIMAL(pkt) ((pkt)->__packet.bits.__minimal)
+#define __NETIO_PKT_QUEUE_M(mda, pkt) ((pkt)->__packet.bits.__queue)
+#define __NETIO_PKT_FLAGS_M(mda, pkt) ((mda)->__flags)
+
+/* Packet information table, used by the attribute access functions below. */
+extern const uint16_t _netio_pkt_info[];
+
+#endif /* __DOXYGEN__ */
+
+
+#ifndef __DOXYGEN__
+/* These macros are deprecated and will disappear in a future MDE release. */
+#define NETIO_PKT_GOOD_CHECKSUM(pkt) \
+  NETIO_PKT_L4_CSUM_CORRECT(pkt)
+#define NETIO_PKT_GOOD_CHECKSUM_M(mda, pkt) \
+  NETIO_PKT_L4_CSUM_CORRECT_M(mda, pkt)
+#endif /* __DOXYGEN__ */
+
+
+/* Packet attribute access functions. */
+
+/** Return a pointer to the metadata for a packet.
+ * @ingroup ingress
+ *
+ * Calling this function once and passing the result to other retrieval
+ * functions with a "_M" suffix usually improves performance.  This
+ * function must be called on an 'ingress' packet (i.e. one retrieved
+ * by @ref netio_get_packet(), on which @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer have not been called). Use of this
+ * function on an 'egress' packet will cause an assertion failure.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's standard metadata.
+ */
+static __inline netio_pkt_metadata_t*
+NETIO_PKT_METADATA(netio_pkt_t* pkt)
+{
+  netio_assert(!pkt->__packet.bits.__minimal);
+  return &pkt->__metadata;
+}
+
+
+/** Return a pointer to the minimal metadata for a packet.
+ * @ingroup egress
+ *
+ * Calling this function once and passing the result to other retrieval
+ * functions with a "_MM" suffix usually improves performance.  This
+ * function must be called on an 'egress' packet (i.e. one on which
+ * @ref netio_populate_buffer() or @ref netio_populate_prepend_buffer()
+ * have been called, or one retrieved by @ref netio_get_buffer()). Use of
+ * this function on an 'ingress' packet will cause an assertion failure.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's standard metadata.
+ */
+static __inline netio_pkt_minimal_metadata_t*
+NETIO_PKT_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+  netio_assert(pkt->__packet.bits.__minimal);
+  return (netio_pkt_minimal_metadata_t*) &pkt->__metadata;
+}
+
+
+/** Determine whether a packet has 'minimal' metadata.
+ * @ingroup pktfuncs
+ *
+ * This function will return nonzero if the packet is an 'egress'
+ * packet (i.e. one on which @ref netio_populate_buffer() or
+ * @ref netio_populate_prepend_buffer() have been called, or one
+ * retrieved by @ref netio_get_buffer()), and zero if the packet
+ * is an 'ingress' packet (i.e. one retrieved by @ref netio_get_packet(),
+ * which has not been converted into an 'egress' packet).
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet has minimal metadata.
+ */
+static __inline unsigned int
+NETIO_PKT_IS_MINIMAL(netio_pkt_t* pkt)
+{
+  return pkt->__packet.bits.__minimal;
+}
+
+
+/** Return a handle for a packet's storage.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A handle for the packet's storage.
+ */
+static __inline netio_pkt_handle_t
+NETIO_PKT_HANDLE(netio_pkt_t* pkt)
+{
+  netio_pkt_handle_t h;
+  h.word = pkt->__packet.word;
+  return h;
+}
+
+
+/** A special reserved value indicating the absence of a packet handle.
+ *
+ * @ingroup pktfuncs
+ */
+#define NETIO_PKT_HANDLE_NONE ((netio_pkt_handle_t) { 0 })
+
+
+/** Test whether a packet handle is valid.
+ *
+ * Applications may wish to use the reserved value NETIO_PKT_HANDLE_NONE
+ * to indicate no packet at all.  This function tests to see if a packet
+ * handle is a real handle, not this special reserved value.
+ *
+ * @ingroup pktfuncs
+ *
+ * @param[in] handle Handle on which to operate.
+ * @return One if the packet handle is valid, else zero.
+ */
+static __inline unsigned int
+NETIO_PKT_HANDLE_IS_VALID(netio_pkt_handle_t handle)
+{
+  return handle.word != 0;
+}
+
+
+
+/** Return a pointer to the start of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] handle Handle on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA_H(netio_pkt_handle_t handle)
+{
+  return _NETIO_PKT_HANDLE_BASE(handle) + NETIO_PACKET_PADDING;
+}
+
+
+/** Return the length of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's custom header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_HEADER_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  /*
+   * Note that we effectively need to extract a quantity from the flags word
+   * which is measured in words, and then turn it into bytes by shifting
+   * it left by 2.  We do this all at once by just shifting right two less
+   * bits, and shifting the mask up two bits.
+   */
+  return ((mda->__flags >> (_NETIO_PKT_CUSTOM_LEN_SHIFT - 2)) &
+          (_NETIO_PKT_CUSTOM_LEN_RMASK << 2));
+}
+
+
+/** Return the length of the packet, starting with the custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (__NETIO_PKT_NOTIF_HEADER(pkt).bits.__transfer_size -
+          NETIO_PACKET_PADDING);
+}
+
+
+/** Return a pointer to the start of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return NETIO_PKT_CUSTOM_DATA_H(NETIO_PKT_HANDLE(pkt));
+}
+
+
+/** Return the length of the packet's L2 (Ethernet plus VLAN or SNAP) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  /*
+   * Note that we effectively need to extract a quantity from the flags word
+   * which is measured in words, and then turn it into bytes by shifting
+   * it left by 2.  We do this all at once by just shifting right two less
+   * bits, and shifting the mask up two bits.  We then add two bytes.
+   */
+  return ((mda->__flags >> (_NETIO_PKT_L2_LEN_SHIFT - 2)) &
+          (_NETIO_PKT_L2_LEN_RMASK << 2)) + 2;
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_CUSTOM_LENGTH_M(mda, pkt) -
+          NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda,pkt));
+}
+
+
+/** Return a pointer to the start of the packet's L2 (Ethernet) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_CUSTOM_DATA_M(mda, pkt) +
+          NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt));
+}
+
+
+/** Retrieve the length of the packet, starting with the L3 (generally,
+ *  the IP) header.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_LENGTH_M(mda, pkt) -
+          NETIO_PKT_L2_HEADER_LENGTH_M(mda,pkt));
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup ingress
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_DATA_M(mda, pkt) +
+          NETIO_PKT_L2_HEADER_LENGTH_M(mda, pkt));
+}
+
+
+/** Return the ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given an ordinal number when it is delivered by the IPP.
+ * In the medium term, the ordinal is unique and monotonically increasing,
+ * being incremented by 1 for each packet; the ordinal of the first packet
+ * delivered after the IPP starts is zero.  (Since the ordinal is of finite
+ * size, given enough input packets, it will eventually wrap around to zero;
+ * in the long term, therefore, ordinals are not unique.)  The ordinals
+ * handed out by different IPPs are not disjoint, so two packets from
+ * different IPPs may have identical ordinals.  Packets dropped by the
+ * IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP packet ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_ORDINAL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__packet_ordinal;
+}
+
+
+/** Return the per-group ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given a per-group ordinal number when it is
+ * delivered by the IPP. By default, the group is the packet's VLAN,
+ * although IPP can be recompiled to use different values.  In
+ * the medium term, the ordinal is unique and monotonically
+ * increasing, being incremented by 1 for each packet; the ordinal of
+ * the first packet distributed to a particular group is zero.
+ * (Since the ordinal is of finite size, given enough input packets,
+ * it will eventually wrap around to zero; in the long term,
+ * therefore, ordinals are not unique.)  The ordinals handed out by
+ * different IPPs are not disjoint, so two packets from different IPPs
+ * may have identical ordinals; similarly, packets distributed to
+ * different groups may have identical ordinals.  Packets dropped by
+ * the IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP, per-group ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_GROUP_ORDINAL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__group_ordinal;
+}
+
+
+/** Return the VLAN ID assigned to the packet.
+ * @ingroup ingress
+ *
+ * This value is usually contained within the packet header.
+ *
+ * This value will be zero if the packet does not have a VLAN tag, or if
+ * this value was not extracted from the packet.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's VLAN ID.
+ */
+static __inline unsigned short
+NETIO_PKT_VLAN_ID_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  int vl = (mda->__flags >> _NETIO_PKT_VLAN_SHIFT) & _NETIO_PKT_VLAN_RMASK;
+  unsigned short* pkt_p;
+  int index;
+  unsigned short val;
+
+  if (vl == _NETIO_PKT_VLAN_NONE)
+    return 0;
+
+  pkt_p = (unsigned short*) NETIO_PKT_L2_DATA_M(mda, pkt);
+  index = (mda->__flags >> _NETIO_PKT_TYPE_SHIFT) & _NETIO_PKT_TYPE_RMASK;
+
+  val = pkt_p[(_netio_pkt_info[index] >> _NETIO_PKT_INFO_VLAN_SHIFT) &
+              _NETIO_PKT_INFO_VLAN_RMASK];
+
+#ifdef __TILECC__
+  return (__insn_bytex(val) >> 16) & 0xFFF;
+#else
+  return (__builtin_bswap32(val) >> 16) & 0xFFF;
+#endif
+}
+
+
+/** Return the ethertype of the packet.
+ * @ingroup ingress
+ *
+ * This value is usually contained within the packet header.
+ *
+ * This value is reliable if @ref NETIO_PKT_ETHERTYPE_RECOGNIZED_M()
+ * returns true, and otherwise, may not be well defined.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's ethertype.
+ */
+static __inline unsigned short
+NETIO_PKT_ETHERTYPE_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  unsigned short* pkt_p = (unsigned short*) NETIO_PKT_L2_DATA_M(mda, pkt);
+  int index = (mda->__flags >> _NETIO_PKT_TYPE_SHIFT) & _NETIO_PKT_TYPE_RMASK;
+
+  unsigned short val =
+    pkt_p[(_netio_pkt_info[index] >> _NETIO_PKT_INFO_ETYPE_SHIFT) &
+          _NETIO_PKT_INFO_ETYPE_RMASK];
+
+  return __builtin_bswap32(val) >> 16;
+}
+
+
+/** Return the flow hash computed on the packet.
+ * @ingroup ingress
+ *
+ * For TCP and UDP packets, this hash is calculated by hashing together
+ * the "5-tuple" values, specifically the source IP address, destination
+ * IP address, protocol type, source port and destination port.
+ * The hash value is intended to be helpful for millions of distinct
+ * flows.
+ *
+ * For IPv4 or IPv6 packets which are neither TCP nor UDP, the flow hash is
+ * derived by hashing together the source and destination IP addresses.
+ *
+ * For MPLS-encapsulated packets, the flow hash is derived by hashing
+ * the first MPLS label.
+ *
+ * For all other packets the flow hash is computed from the source
+ * and destination Ethernet addresses.
+ *
+ * The hash is symmetric, meaning it produces the same value if the
+ * source and destination are swapped. The only exceptions are
+ * tunneling protocols 0x04 (IP in IP Encapsulation), 0x29 (Simple
+ * Internet Protocol), 0x2F (General Routing Encapsulation) and 0x32
+ * (Encap Security Payload), which use only the destination address
+ * since the source address is not meaningful.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's 32-bit flow hash.
+ */
+static __inline unsigned int
+NETIO_PKT_FLOW_HASH_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__flow_hash;
+}
+
+
+/** Return the first word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the first
+ * word of user data contains the least significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_low()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's first word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_0_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__user_data_0;
+}
+
+
+/** Return the second word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the second
+ * word of user data contains the most significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_high()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's second word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_1_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return mda->__user_data_1;
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L4 checksum was calculated.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CALCULATED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_NO_L4_CSUM_MASK);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated and found to
+ *  be correct.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CORRECT_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags &
+           (_NETIO_PKT_BAD_L4_CSUM_MASK | _NETIO_PKT_NO_L4_CSUM_MASK));
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L3 (IP) checksum was calculated.
+*/
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CALCULATED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_NO_L3_CSUM_MASK);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated and found to be
+ *  correct.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CORRECT_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags &
+           (_NETIO_PKT_BAD_L3_CSUM_MASK | _NETIO_PKT_NO_L3_CSUM_MASK));
+}
+
+
+/** Determine whether the ethertype was recognized and L3 packet data was
+ *  processed.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the ethertype was recognized and L3 packet data was
+ *   processed.
+ */
+static __inline unsigned int
+NETIO_PKT_ETHERTYPE_RECOGNIZED_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return !(mda->__flags & _NETIO_PKT_TYPE_UNRECOGNIZED_MASK);
+}
+
+
+/** Retrieve the status of a packet and any errors that may have occurred
+ * during ingress processing (length mismatches, CRC errors, etc.).
+ * @ingroup ingress
+ *
+ * Note that packets for which @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns zero are always reported as underlength, as there is no a priori
+ * means to determine their length.  Normally, applications should use
+ * @ref NETIO_PKT_BAD_M() instead of explicitly checking status with this
+ * function.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's status.
+ */
+static __inline netio_pkt_status_t
+NETIO_PKT_STATUS_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (netio_pkt_status_t) __NETIO_PKT_NOTIF_HEADER(pkt).bits.__status;
+}
+
+
+/** Report whether a packet is bad (i.e., was shorter than expected based on
+ *  its headers, or had a bad CRC).
+ * @ingroup ingress
+ *
+ * Note that this function does not verify L3 or L4 checksums.
+ *
+ * @param[in] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet is bad and should be discarded.
+ */
+static __inline unsigned int
+NETIO_PKT_BAD_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return ((NETIO_PKT_STATUS_M(mda, pkt) & 1) &&
+          (NETIO_PKT_ETHERTYPE_RECOGNIZED_M(mda, pkt) ||
+           NETIO_PKT_STATUS_M(mda, pkt) == NETIO_PKT_STATUS_BAD));
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return mmd->l2_length;
+}
+
+
+/** Return the length of the L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd,
+                              netio_pkt_t* pkt)
+{
+  return mmd->l3_offset - mmd->l2_offset;
+}
+
+
+/** Return the length of the packet, starting with the L3 (IP) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return (NETIO_PKT_L2_LENGTH_MM(mmd, pkt) -
+          NETIO_PKT_L2_HEADER_LENGTH_MM(mmd, pkt));
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup egress
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return _NETIO_PKT_BASE(pkt) + mmd->l3_offset;
+}
+
+
+/** Return a pointer to the packet's L2 (Ethernet) header.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return _NETIO_PKT_BASE(pkt) + mmd->l2_offset;
+}
+
+
+/** Retrieve the status of a packet and any errors that may have occurred
+ * during ingress processing (length mismatches, CRC errors, etc.).
+ * @ingroup ingress
+ *
+ * Note that packets for which @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns zero are always reported as underlength, as there is no a priori
+ * means to determine their length.  Normally, applications should use
+ * @ref NETIO_PKT_BAD() instead of explicitly checking status with this
+ * function.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's status.
+ */
+static __inline netio_pkt_status_t
+NETIO_PKT_STATUS(netio_pkt_t* pkt)
+{
+  netio_assert(!pkt->__packet.bits.__minimal);
+
+  return (netio_pkt_status_t) __NETIO_PKT_NOTIF_HEADER(pkt).bits.__status;
+}
+
+
+/** Report whether a packet is bad (i.e., was shorter than expected based on
+ *  its headers, or had a bad CRC).
+ * @ingroup ingress
+ *
+ * Note that this function does not verify L3 or L4 checksums.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the packet is bad and should be discarded.
+ */
+static __inline unsigned int
+NETIO_PKT_BAD(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_BAD_M(mda, pkt);
+}
+
+
+/** Return the length of the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's custom header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_HEADER_LENGTH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt);
+}
+
+
+/** Return the length of the packet, starting with the custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return  The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_CUSTOM_LENGTH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_LENGTH_M(mda, pkt);
+}
+
+
+/** Return a pointer to the packet's custom header.
+ *  A custom header may or may not be present, depending upon the IPP; its
+ *  contents and alignment are also IPP-dependent.  Currently, none of the
+ *  standard IPPs supplied by Tilera produce a custom header.  If present,
+ *  the custom header precedes the L2 header in the packet buffer.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_CUSTOM_DATA(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_CUSTOM_DATA_M(mda, pkt);
+}
+
+
+/** Return the length of the packet's L2 (Ethernet plus VLAN or SNAP) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The length of the packet's L2 header, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_HEADER_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_HEADER_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_HEADER_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return the length of the packet, starting with the L2 (Ethernet) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return  The length of the packet, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L2_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return a pointer to the packet's L2 (Ethernet) header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to start of the packet.
+ */
+static __inline unsigned char*
+NETIO_PKT_L2_DATA(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L2_DATA_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L2_DATA_M(mda, pkt);
+  }
+}
+
+
+/** Retrieve the length of the packet, starting with the L3 (generally, the IP)
+ * header.
+ * @ingroup pktfuncs
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Length of the packet's L3 header and data, in bytes.
+ */
+static __inline netio_size_t
+NETIO_PKT_L3_LENGTH(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L3_LENGTH_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L3_LENGTH_M(mda, pkt);
+  }
+}
+
+
+/** Return a pointer to the packet's L3 (generally, the IP) header.
+ * @ingroup pktfuncs
+ *
+ * Note that we guarantee word alignment of the L3 header.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return A pointer to the packet's L3 header.
+ */
+static __inline unsigned char*
+NETIO_PKT_L3_DATA(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_L3_DATA_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_L3_DATA_M(mda, pkt);
+  }
+}
+
+
+/** Return the ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given an ordinal number when it is delivered by the IPP.
+ * In the medium term, the ordinal is unique and monotonically increasing,
+ * being incremented by 1 for each packet; the ordinal of the first packet
+ * delivered after the IPP starts is zero.  (Since the ordinal is of finite
+ * size, given enough input packets, it will eventually wrap around to zero;
+ * in the long term, therefore, ordinals are not unique.)  The ordinals
+ * handed out by different IPPs are not disjoint, so two packets from
+ * different IPPs may have identical ordinals.  Packets dropped by the
+ * IPP or by the I/O shim are not assigned ordinals.
+ *
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP packet ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_ORDINAL(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ORDINAL_M(mda, pkt);
+}
+
+
+/** Return the per-group ordinal of the packet.
+ * @ingroup ingress
+ *
+ * Each packet is given a per-group ordinal number when it is
+ * delivered by the IPP. By default, the group is the packet's VLAN,
+ * although IPP can be recompiled to use different values.  In
+ * the medium term, the ordinal is unique and monotonically
+ * increasing, being incremented by 1 for each packet; the ordinal of
+ * the first packet distributed to a particular group is zero.
+ * (Since the ordinal is of finite size, given enough input packets,
+ * it will eventually wrap around to zero; in the long term,
+ * therefore, ordinals are not unique.)  The ordinals handed out by
+ * different IPPs are not disjoint, so two packets from different IPPs
+ * may have identical ordinals; similarly, packets distributed to
+ * different groups may have identical ordinals.  Packets dropped by
+ * the IPP or by the I/O shim are not assigned ordinals.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's per-IPP, per-group ordinal.
+ */
+static __inline unsigned int
+NETIO_PKT_GROUP_ORDINAL(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_GROUP_ORDINAL_M(mda, pkt);
+}
+
+
+/** Return the VLAN ID assigned to the packet.
+ * @ingroup ingress
+ *
+ * This is usually also contained within the packet header.  If the packet
+ * does not have a VLAN tag, the VLAN ID returned by this function is zero.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's VLAN ID.
+ */
+static __inline unsigned short
+NETIO_PKT_VLAN_ID(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_VLAN_ID_M(mda, pkt);
+}
+
+
+/** Return the ethertype of the packet.
+ * @ingroup ingress
+ *
+ * This value is reliable if @ref NETIO_PKT_ETHERTYPE_RECOGNIZED()
+ * returns true, and otherwise, may not be well defined.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's ethertype.
+ */
+static __inline unsigned short
+NETIO_PKT_ETHERTYPE(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ETHERTYPE_M(mda, pkt);
+}
+
+
+/** Return the flow hash computed on the packet.
+ * @ingroup ingress
+ *
+ * For TCP and UDP packets, this hash is calculated by hashing together
+ * the "5-tuple" values, specifically the source IP address, destination
+ * IP address, protocol type, source port and destination port.
+ * The hash value is intended to be helpful for millions of distinct
+ * flows.
+ *
+ * For IPv4 or IPv6 packets which are neither TCP nor UDP, the flow hash is
+ * derived by hashing together the source and destination IP addresses.
+ *
+ * For MPLS-encapsulated packets, the flow hash is derived by hashing
+ * the first MPLS label.
+ *
+ * For all other packets the flow hash is computed from the source
+ * and destination Ethernet addresses.
+ *
+ * The hash is symmetric, meaning it produces the same value if the
+ * source and destination are swapped. The only exceptions are
+ * tunneling protocols 0x04 (IP in IP Encapsulation), 0x29 (Simple
+ * Internet Protocol), 0x2F (General Routing Encapsulation) and 0x32
+ * (Encap Security Payload), which use only the destination address
+ * since the source address is not meaningful.
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's 32-bit flow hash.
+ */
+static __inline unsigned int
+NETIO_PKT_FLOW_HASH(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_FLOW_HASH_M(mda, pkt);
+}
+
+
+/** Return the first word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the first
+ * word of user data contains the least significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_low()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's first word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_0(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_USER_DATA_0_M(mda, pkt);
+}
+
+
+/** Return the second word of "user data" for the packet.
+ *
+ * The contents of the user data words depend on the IPP.
+ *
+ * When using the standard ipp1, ipp2, or ipp4 sub-drivers, the second
+ * word of user data contains the most significant bits of the 64-bit
+ * arrival cycle count (see @c get_cycle_count_high()).
+ *
+ * See the <em>System Programmer's Guide</em> for details.
+ *
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return The packet's second word of "user data".
+ */
+static __inline unsigned int
+NETIO_PKT_USER_DATA_1(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_USER_DATA_1_M(mda, pkt);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L4 checksum was calculated.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CALCULATED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L4_CSUM_CALCULATED_M(mda, pkt);
+}
+
+
+/** Determine whether the L4 (TCP/UDP) checksum was calculated and found to
+ *  be correct.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L4_CSUM_CORRECT(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L4_CSUM_CORRECT_M(mda, pkt);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the L3 (IP) checksum was calculated.
+*/
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CALCULATED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L3_CSUM_CALCULATED_M(mda, pkt);
+}
+
+
+/** Determine whether the L3 (IP) checksum was calculated and found to be
+ *  correct.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the checksum was calculated and is correct.
+ */
+static __inline unsigned int
+NETIO_PKT_L3_CSUM_CORRECT(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_L3_CSUM_CORRECT_M(mda, pkt);
+}
+
+
+/** Determine whether the Ethertype was recognized and L3 packet data was
+ *  processed.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ * @return Nonzero if the Ethertype was recognized and L3 packet data was
+ *   processed.
+ */
+static __inline unsigned int
+NETIO_PKT_ETHERTYPE_RECOGNIZED(netio_pkt_t* pkt)
+{
+  netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+  return NETIO_PKT_ETHERTYPE_RECOGNIZED_M(mda, pkt);
+}
+
+
+/** Set an egress packet's L2 length, using a metadata pointer to speed the
+ * computation.
+ * @ingroup egress
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] len Packet L2 length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt,
+                           int len)
+{
+  mmd->l2_length = len;
+}
+
+
+/** Set an egress packet's L2 length.
+ * @ingroup egress
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] len Packet L2 length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_LENGTH(netio_pkt_t* pkt, int len)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_SET_L2_LENGTH_MM(mmd, pkt, len);
+}
+
+
+/** Set an egress packet's L2 header length, using a metadata pointer to
+ *  speed the computation.
+ * @ingroup egress
+ *
+ * It is not normally necessary to call this routine; only the L2 length,
+ * not the header length, is needed to transmit a packet.  It may be useful if
+ * the egress packet will later be processed by code which expects to use
+ * functions like @ref NETIO_PKT_L3_DATA() to get a pointer to the L3 payload.
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] len Packet L2 header length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_HEADER_LENGTH_MM(netio_pkt_minimal_metadata_t* mmd,
+                                  netio_pkt_t* pkt, int len)
+{
+  mmd->l3_offset = mmd->l2_offset + len;
+}
+
+
+/** Set an egress packet's L2 header length.
+ * @ingroup egress
+ *
+ * It is not normally necessary to call this routine; only the L2 length,
+ * not the header length, is needed to transmit a packet.  It may be useful if
+ * the egress packet will later be processed by code which expects to use
+ * functions like @ref NETIO_PKT_L3_DATA() to get a pointer to the L3 payload.
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] len Packet L2 header length, in bytes.
+ */
+static __inline void
+NETIO_PKT_SET_L2_HEADER_LENGTH(netio_pkt_t* pkt, int len)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_SET_L2_HEADER_LENGTH_MM(mmd, pkt, len);
+}
+
+
+/** Set up an egress packet for hardware checksum computation, using a
+ *  metadata pointer to speed the operation.
+ * @ingroup egress
+ *
+ *  NetIO provides the ability to automatically calculate a standard
+ *  16-bit Internet checksum on transmitted packets.  The application
+ *  may specify the point in the packet where the checksum starts, the
+ *  number of bytes to be checksummed, and the two bytes in the packet
+ *  which will be replaced with the completed checksum.  (If the range
+ *  of bytes to be checksummed includes the bytes to be replaced, the
+ *  initial values of those bytes will be included in the checksum.)
+ *
+ *  For some protocols, the packet checksum covers data which is not present
+ *  in the packet, or is at least not contiguous to the main data payload.
+ *  For instance, the TCP checksum includes a "pseudo-header" which includes
+ *  the source and destination IP addresses of the packet.  To accommodate
+ *  this, the checksum engine may be "seeded" with an initial value, which
+ *  the application would need to compute based on the specific protocol's
+ *  requirements.  Note that the seed is given in host byte order (little-
+ *  endian), not network byte order (big-endian); code written to compute a
+ *  pseudo-header checksum in network byte order will need to byte-swap it
+ *  before use as the seed.
+ *
+ *  Note that the checksum is computed as part of the transmission process,
+ *  so it will not be present in the packet upon completion of this routine.
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ * @param[in] start Offset within L2 packet of the first byte to include in
+ *   the checksum.
+ * @param[in] length Number of bytes to include in the checksum.
+ *   the checksum.
+ * @param[in] location Offset within L2 packet of the first of the two bytes
+ *   to be replaced with the calculated checksum.
+ * @param[in] seed Initial value of the running checksum before any of the
+ *   packet data is added.
+ */
+static __inline void
+NETIO_PKT_DO_EGRESS_CSUM_MM(netio_pkt_minimal_metadata_t* mmd,
+                            netio_pkt_t* pkt, int start, int length,
+                            int location, uint16_t seed)
+{
+  mmd->csum_start = start;
+  mmd->csum_length = length;
+  mmd->csum_location = location;
+  mmd->csum_seed = seed;
+  mmd->flags |= _NETIO_PKT_NEED_EDMA_CSUM_MASK;
+}
+
+
+/** Set up an egress packet for hardware checksum computation.
+ * @ingroup egress
+ *
+ *  NetIO provides the ability to automatically calculate a standard
+ *  16-bit Internet checksum on transmitted packets.  The application
+ *  may specify the point in the packet where the checksum starts, the
+ *  number of bytes to be checksummed, and the two bytes in the packet
+ *  which will be replaced with the completed checksum.  (If the range
+ *  of bytes to be checksummed includes the bytes to be replaced, the
+ *  initial values of those bytes will be included in the checksum.)
+ *
+ *  For some protocols, the packet checksum covers data which is not present
+ *  in the packet, or is at least not contiguous to the main data payload.
+ *  For instance, the TCP checksum includes a "pseudo-header" which includes
+ *  the source and destination IP addresses of the packet.  To accommodate
+ *  this, the checksum engine may be "seeded" with an initial value, which
+ *  the application would need to compute based on the specific protocol's
+ *  requirements.  Note that the seed is given in host byte order (little-
+ *  endian), not network byte order (big-endian); code written to compute a
+ *  pseudo-header checksum in network byte order will need to byte-swap it
+ *  before use as the seed.
+ *
+ *  Note that the checksum is computed as part of the transmission process,
+ *  so it will not be present in the packet upon completion of this routine.
+ *
+ * @param[in,out] pkt Packet on which to operate.
+ * @param[in] start Offset within L2 packet of the first byte to include in
+ *   the checksum.
+ * @param[in] length Number of bytes to include in the checksum.
+ *   the checksum.
+ * @param[in] location Offset within L2 packet of the first of the two bytes
+ *   to be replaced with the calculated checksum.
+ * @param[in] seed Initial value of the running checksum before any of the
+ *   packet data is added.
+ */
+static __inline void
+NETIO_PKT_DO_EGRESS_CSUM(netio_pkt_t* pkt, int start, int length,
+                         int location, uint16_t seed)
+{
+  netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+  NETIO_PKT_DO_EGRESS_CSUM_MM(mmd, pkt, start, length, location, seed);
+}
+
+
+/** Return the number of bytes which could be prepended to a packet, using a
+ *  metadata pointer to speed the operation.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ *
+ * @param[in,out] mda Pointer to packet's standard metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+  return (pkt->__packet.bits.__offset << 6) +
+         NETIO_PKT_CUSTOM_HEADER_LENGTH_M(mda, pkt);
+}
+
+
+/** Return the number of bytes which could be prepended to a packet, using a
+ *  metadata pointer to speed the operation.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ * @ingroup egress
+ *
+ * @param[in,out] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL_MM(netio_pkt_minimal_metadata_t* mmd, netio_pkt_t* pkt)
+{
+  return (pkt->__packet.bits.__offset << 6) + mmd->l2_offset;
+}
+
+
+/** Return the number of bytes which could be prepended to a packet.
+ *  See @ref netio_populate_prepend_buffer() to get a full description of
+ *  prepending.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline int
+NETIO_PKT_PREPEND_AVAIL(netio_pkt_t* pkt)
+{
+  if (NETIO_PKT_IS_MINIMAL(pkt))
+  {
+    netio_pkt_minimal_metadata_t* mmd = NETIO_PKT_MINIMAL_METADATA(pkt);
+
+    return NETIO_PKT_PREPEND_AVAIL_MM(mmd, pkt);
+  }
+  else
+  {
+    netio_pkt_metadata_t* mda = NETIO_PKT_METADATA(pkt);
+
+    return NETIO_PKT_PREPEND_AVAIL_M(mda, pkt);
+  }
+}
+
+
+/** Flush a packet's minimal metadata from the cache, using a metadata pointer
+ *  to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                    netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's minimal metadata from the cache, using a metadata
+ *  pointer to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                  netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's minimal metadata from the cache,
+ *  using a metadata pointer to speed the operation.
+ * @ingroup egress
+ *
+ * @param[in] mmd Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_MINIMAL_METADATA_MM(netio_pkt_minimal_metadata_t* mmd,
+                                        netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's metadata from the cache, using a metadata pointer
+ *  to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's minimal metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's metadata from the cache, using a metadata
+ *  pointer to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's metadata from the cache,
+ *  using a metadata pointer to speed the operation.
+ * @ingroup ingress
+ *
+ * @param[in] mda Pointer to packet's metadata.
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_METADATA_M(netio_pkt_metadata_t* mda, netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's minimal metadata from the cache.
+ * @ingroup egress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_MINIMAL_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Invalidate a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_INV_METADATA(netio_pkt_t* pkt)
+{
+}
+
+
+/** Flush and then invalidate a packet's metadata from the cache.
+ * @ingroup ingress
+ *
+ * @param[in] pkt Packet on which to operate.
+ */
+static __inline void
+NETIO_PKT_FLUSH_INV_METADATA(netio_pkt_t* pkt)
+{
+}
+
+/** Number of NUMA nodes we can distribute buffers to.
+ * @ingroup setup */
+#define NETIO_NUM_NODE_WEIGHTS  16
+
+/**
+ * @brief An object for specifying the characteristics of NetIO communication
+ * endpoint.
+ *
+ * @ingroup setup
+ *
+ * The @ref netio_input_register() function uses this structure to define
+ * how an application tile will communicate with an IPP.
+ *
+ *
+ * Future updates to NetIO may add new members to this structure,
+ * which can affect the success of the registration operation.  Thus,
+ * if dynamically initializing the structure, applications are urged to
+ * zero it out first, for example:
+ *
+ * @code
+ * netio_input_config_t config;
+ * memset(&config, 0, sizeof (config));
+ * config.flags = NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE;
+ * config.num_receive_packets = NETIO_MAX_RECEIVE_PKTS;
+ * config.queue_id = 0;
+ *     .
+ *     .
+ *     .
+ * @endcode
+ *
+ * since that guarantees that any unused structure members, including
+ * members which did not exist when the application was first developed,
+ * will not have unexpected values.
+ *
+ * If statically initializing the structure, we strongly recommend use of
+ * C99-style named initializers, for example:
+ *
+ * @code
+ * netio_input_config_t config = {
+ *    .flags = NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE,
+ *    .num_receive_packets = NETIO_MAX_RECEIVE_PKTS,
+ *    .queue_id = 0,
+ * },
+ * @endcode
+ *
+ * instead of the old-style structure initialization:
+ *
+ * @code
+ * // Bad example! Currently equivalent to the above, but don't do this.
+ * netio_input_config_t config = {
+ *    NETIO_RECV | NETIO_XMIT_CSUM | NETIO_TAG_NONE, NETIO_MAX_RECEIVE_PKTS, 0
+ * },
+ * @endcode
+ *
+ * since the C99 style requires no changes to the code if elements of the
+ * config structure are rearranged.  (It also makes the initialization much
+ * easier to understand.)
+ *
+ * Except for items which address a particular tile's transmit or receive
+ * characteristics, such as the ::NETIO_RECV flag, applications are advised
+ * to specify the same set of configuration data on all registrations.
+ * This prevents differing results if multiple tiles happen to do their
+ * registration operations in a different order on different invocations of
+ * the application.  This is particularly important for things like link
+ * management flags, and buffer size and homing specifications.
+ *
+ * Unless the ::NETIO_FIXED_BUFFER_VA flag is specified in flags, the NetIO
+ * buffer pool is automatically created and mapped into the application's
+ * virtual address space at an address chosen by the operating system,
+ * using the common memory (cmem) facility in the Tilera Multicore
+ * Components library.  The cmem facility allows multiple processes to gain
+ * access to shared memory which is mapped into each process at an
+ * identical virtual address.  In order for this to work, the processes
+ * must have a common ancestor, which must create the common memory using
+ * tmc_cmem_init().
+ *
+ * In programs using the iLib process creation API, or in programs which use
+ * only one process (which include programs using the pthreads library),
+ * tmc_cmem_init() is called automatically.  All other applications
+ * must call it explicitly, before any child processes which might call
+ * netio_input_register() are created.
+ */
+typedef struct
+{
+  /** Registration characteristics.
+
+      This value determines several characteristics of the registration;
+      flags for different types of behavior are ORed together to make the
+      final flag value.  Generally applications should specify exactly
+      one flag from each of the following categories:
+
+      - Whether the application will be receiving packets on this queue
+        (::NETIO_RECV or ::NETIO_NO_RECV).
+
+      - Whether the application will be transmitting packets on this queue,
+        and if so, whether it will request egress checksum calculation
+        (::NETIO_XMIT, ::NETIO_XMIT_CSUM, or ::NETIO_NO_XMIT).  It is
+        legal to call netio_get_buffer() without one of the XMIT flags,
+        as long as ::NETIO_RECV is specified; in this case, the retrieved
+        buffers must be passed to another tile for transmission.
+
+      - Whether the application expects any vendor-specific tags in
+        its packets' L2 headers (::NETIO_TAG_NONE, ::NETIO_TAG_BRCM,
+        or ::NETIO_TAG_MRVL).  This must match the configuration of the
+        target IPP.
+
+      To accommodate applications written to previous versions of the NetIO
+      interface, none of the flags above are currently required; if omitted,
+      NetIO behaves more or less as if ::NETIO_RECV | ::NETIO_XMIT_CSUM |
+      ::NETIO_TAG_NONE were used.  However, explicit specification of
+      the relevant flags allows NetIO to do a better job of resource
+      allocation, allows earlier detection of certain configuration errors,
+      and may enable advanced features or higher performance in the future,
+      so their use is strongly recommended.
+
+      Note that specifying ::NETIO_NO_RECV along with ::NETIO_NO_XMIT
+      is a special case, intended primarily for use by programs which
+      retrieve network statistics or do link management operations.
+      When these flags are both specified, the resulting queue may not
+      be used with NetIO routines other than netio_get(), netio_set(),
+      and netio_input_unregister().  See @ref link for more information
+      on link management.
+
+      Other flags are optional; their use is described below.
+  */
+  int flags;
+
+  /** Interface name.  This is a string which identifies the specific
+      Ethernet controller hardware to be used.  The format of the string
+      is a device type and a device index, separated by a slash; so,
+      the first 10 Gigabit Ethernet controller is named "xgbe/0", while
+      the second 10/100/1000 Megabit Ethernet controller is named "gbe/1".
+   */
+  const char* interface;
+
+  /** Receive packet queue size.  This specifies the maximum number
+      of ingress packets that can be received on this queue without
+      being retrieved by @ref netio_get_packet().  If the IPP's distribution
+      algorithm calls for a packet to be sent to this queue, and this
+      number of packets are already pending there, the new packet
+      will either be discarded, or sent to another tile registered
+      for the same queue_id (see @ref drops).  This value must
+      be at least ::NETIO_MIN_RECEIVE_PKTS, can always be at least
+      ::NETIO_MAX_RECEIVE_PKTS, and may be larger than that on certain
+      interfaces.
+   */
+  int num_receive_packets;
+
+  /** The queue ID being requested.  Legal values for this range from 0
+      to ::NETIO_MAX_QUEUE_ID, inclusive.  ::NETIO_MAX_QUEUE_ID is always
+      greater than or equal to the number of tiles; this allows one queue
+      for each tile, plus at least one additional queue.  Some applications
+      may wish to use the additional queue as a destination for unwanted
+      packets, since packets delivered to queues for which no tiles have
+      registered are discarded.
+   */
+  unsigned int queue_id;
+
+  /** Maximum number of small send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds
+      empty small egress buffers requested from the IPP but not yet
+      retrieved via @ref netio_get_buffer().  This value must be greater
+      than zero if the application will ever use @ref netio_get_buffer()
+      to allocate empty small egress buffers; it may be no larger than
+      ::NETIO_MAX_SEND_BUFFERS.  See @ref epp for more details on empty
+      buffer caching.
+   */
+  int num_send_buffers_small_total;
+
+  /** Number of small send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty small egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_small_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_small_prealloc;
+
+  /** Maximum number of large send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds empty
+      large egress buffers requested from the IPP but not yet retrieved via
+      @ref netio_get_buffer().  This value must be greater than zero if the
+      application will ever use @ref netio_get_buffer() to allocate empty
+      large egress buffers; it may be no larger than ::NETIO_MAX_SEND_BUFFERS.
+      See @ref epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_large_total;
+
+  /** Number of large send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty large egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_large_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_large_prealloc;
+
+  /** Maximum number of jumbo send buffers to be held in the local empty
+      buffer cache.  This specifies the size of the area which holds empty
+      jumbo egress buffers requested from the IPP but not yet retrieved via
+      @ref netio_get_buffer().  This value must be greater than zero if the
+      application will ever use @ref netio_get_buffer() to allocate empty
+      jumbo egress buffers; it may be no larger than ::NETIO_MAX_SEND_BUFFERS.
+      See @ref epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_jumbo_total;
+
+  /** Number of jumbo send buffers to be preallocated at registration.
+      If this value is nonzero, the specified number of empty jumbo egress
+      buffers will be requested from the IPP during the netio_input_register
+      operation; this may speed the execution of @ref netio_get_buffer().
+      This may be no larger than @ref num_send_buffers_jumbo_total.  See @ref
+      epp for more details on empty buffer caching.
+   */
+  int num_send_buffers_jumbo_prealloc;
+
+  /** Total packet buffer size.  This determines the total size, in bytes,
+      of the NetIO buffer pool.  Note that the maximum number of available
+      buffers of each size is determined during hypervisor configuration
+      (see the <em>System Programmer's Guide</em> for details); this just
+      influences how much host memory is allocated for those buffers.
+
+      The buffer pool is allocated from common memory, which will be
+      automatically initialized if needed.  If your buffer pool is larger
+      than 240 MB, you might need to explicitly call @c tmc_cmem_init(),
+      as described in the Application Libraries Reference Manual (UG227).
+
+      Packet buffers are currently allocated in chunks of 16 MB; this
+      value will be rounded up to the next larger multiple of 16 MB.
+      If this value is zero, a default of 32 MB will be used; this was
+      the value used by previous versions of NetIO.  Note that taking this
+      default also affects the placement of buffers on Linux NUMA nodes.
+      See @ref buffer_node_weights for an explanation of buffer placement.
+
+      In order to successfully allocate packet buffers, Linux must have
+      available huge pages on the relevant Linux NUMA nodes.  See the
+      <em>System Programmer's Guide</em> for information on configuring
+      huge page support in Linux.
+   */
+  uint64_t total_buffer_size;
+
+  /** Buffer placement weighting factors.
+
+      This array specifies the relative amount of buffering to place
+      on each of the available Linux NUMA nodes.  This array is
+      indexed by the NUMA node, and the values in the array are
+      proportional to the amount of buffer space to allocate on that
+      node.
+
+      If memory striping is enabled in the Hypervisor, then there is
+      only one logical NUMA node (node 0). In that case, NetIO will by
+      default ignore the suggested buffer node weights, and buffers
+      will be striped across the physical memory controllers. See
+      UG209 System Programmer's Guide for a description of the
+      hypervisor option that controls memory striping.
+
+      If memory striping is disabled, then there are up to four NUMA
+      nodes, corresponding to the four DDRAM controllers in the TILE
+      processor architecture.  See UG100 Tile Processor Architecture
+      Overview for a diagram showing the location of each of the DDRAM
+      controllers relative to the tile array.
+
+      For instance, if memory striping is disabled, the following
+      configuration strucure:
+
+      @code
+      netio_input_config_t config = {
+            .
+            .
+            .
+        .total_buffer_size = 4 * 16 * 1024 * 1024;
+        .buffer_node_weights = { 1, 0, 1, 0 },
+      },
+      @endcode
+
+      would result in 32 MB of buffers being placed on controller 0, and
+      32 MB on controller 2.  (Since buffers are allocated in units of
+      16 MB, some sets of weights will not be able to be matched exactly.)
+
+      For the weights to be effective, @ref total_buffer_size must be
+      nonzero.  If @ref total_buffer_size is zero, causing the default
+      32 MB of buffer space to be used, then any specified weights will
+      be ignored, and buffers will positioned as they were in previous
+      versions of NetIO:
+
+      - For xgbe/0 and gbe/0, 16 MB of buffers will be placed on controller 1,
+        and the other 16 MB will be placed on controller 2.
+
+      - For xgbe/1 and gbe/1, 16 MB of buffers will be placed on controller 2,
+        and the other 16 MB will be placed on controller 3.
+
+      If @ref total_buffer_size is nonzero, but all weights are zero,
+      then all buffer space will be allocated on Linux NUMA node zero.
+
+      By default, the specified buffer placement is treated as a hint;
+      if sufficient free memory is not available on the specified
+      controllers, the buffers will be allocated elsewhere.  However,
+      if the ::NETIO_STRICT_HOMING flag is specified in @ref flags, then a
+      failure to allocate buffer space exactly as requested will cause the
+      registration operation to fail with an error of ::NETIO_CANNOT_HOME.
+
+      Note that maximal network performance cannot be achieved with
+      only one memory controller.
+   */
+  uint8_t buffer_node_weights[NETIO_NUM_NODE_WEIGHTS];
+
+  /** Fixed virtual address for packet buffers.  Only valid when
+      ::NETIO_FIXED_BUFFER_VA is specified in @ref flags; see the
+      description of that flag for details.
+   */
+  void* fixed_buffer_va;
+
+  /**
+      Maximum number of outstanding send packet requests.  This value is
+      only relevant when an EPP is in use; it determines the number of
+      slots in the EPP's outgoing packet queue which this tile is allowed
+      to consume, and thus the number of packets which may be sent before
+      the sending tile must wait for an acknowledgment from the EPP.
+      Modifying this value is generally only helpful when using @ref
+      netio_send_packet_vector(), where it can help improve performance by
+      allowing a single vector send operation to process more packets.
+      Typically it is not specified, and the default, which divides the
+      outgoing packet slots evenly between all tiles on the chip, is used.
+
+      If a registration asks for more outgoing packet queue slots than are
+      available, ::NETIO_TOOMANY_XMIT will be returned.  The total number
+      of packet queue slots which are available for all tiles for each EPP
+      is subject to change, but is currently ::NETIO_TOTAL_SENDS_OUTSTANDING.
+
+
+      This value is ignored if ::NETIO_XMIT is not specified in flags.
+      If you want to specify a large value here for a specific tile, you are
+      advised to specify NETIO_NO_XMIT on other, non-transmitting tiles so
+      that they do not consume a default number of packet slots.  Any tile
+      transmitting is required to have at least ::NETIO_MIN_SENDS_OUTSTANDING
+      slots allocated to it; values less than that will be silently
+      increased by the NetIO library.
+   */
+  int num_sends_outstanding;
+}
+netio_input_config_t;
+
+
+/** Registration flags; used in the @ref netio_input_config_t structure.
+ * @addtogroup setup
+ */
+/** @{ */
+
+/** Fail a registration request if we can't put packet buffers
+    on the specified memory controllers. */
+#define NETIO_STRICT_HOMING   0x00000002
+
+/** This application expects no tags on its L2 headers. */
+#define NETIO_TAG_NONE        0x00000004
+
+/** This application expects Marvell extended tags on its L2 headers. */
+#define NETIO_TAG_MRVL        0x00000008
+
+/** This application expects Broadcom tags on its L2 headers. */
+#define NETIO_TAG_BRCM        0x00000010
+
+/** This registration may call routines which receive packets. */
+#define NETIO_RECV            0x00000020
+
+/** This registration may not call routines which receive packets. */
+#define NETIO_NO_RECV         0x00000040
+
+/** This registration may call routines which transmit packets. */
+#define NETIO_XMIT            0x00000080
+
+/** This registration may call routines which transmit packets with
+    checksum acceleration. */
+#define NETIO_XMIT_CSUM       0x00000100
+
+/** This registration may not call routines which transmit packets. */
+#define NETIO_NO_XMIT         0x00000200
+
+/** This registration wants NetIO buffers mapped at an application-specified
+    virtual address.
+
+    NetIO buffers are by default created by the TMC common memory facility,
+    which must be configured by a common ancestor of all processes sharing
+    a network interface.  When this flag is specified, NetIO buffers are
+    instead mapped at an address chosen by the application (and specified
+    in @ref netio_input_config_t::fixed_buffer_va).  This allows multiple
+    unrelated but cooperating processes to share a NetIO interface.
+    All processes sharing the same interface must specify this flag,
+    and all must specify the same fixed virtual address.
+
+    @ref netio_input_config_t::fixed_buffer_va must be a
+    multiple of 16 MB, and the packet buffers will occupy @ref
+    netio_input_config_t::total_buffer_size bytes of virtual address
+    space, beginning at that address.  If any of those virtual addresses
+    are currently occupied by other memory objects, like application or
+    shared library code or data, @ref netio_input_register() will return
+    ::NETIO_FAULT.  While it is impossible to provide a fixed_buffer_va
+    which will work for all applications, a good first guess might be to
+    use 0xb0000000 minus @ref netio_input_config_t::total_buffer_size.
+    If that fails, it might be helpful to consult the running application's
+    virtual address description file (/proc/<em>pid</em>/maps) to see
+    which regions of virtual address space are available.
+ */
+#define NETIO_FIXED_BUFFER_VA 0x00000400
+
+/** This registration call will not complete unless the network link
+    is up.  The process will wait several seconds for this to happen (the
+    precise interval is link-dependent), but if the link does not come up,
+    ::NETIO_LINK_DOWN will be returned.  This flag is the default if
+    ::NETIO_NOREQUIRE_LINK_UP is not specified.  Note that this flag by
+    itself does not request that the link be brought up; that can be done
+    with the ::NETIO_AUTO_LINK_UPDN or ::NETIO_AUTO_LINK_UP flags (the
+    latter is the default if no NETIO_AUTO_LINK_xxx flags are specified),
+    or by explicitly setting the link's desired state via netio_set().
+    If the link is not brought up by one of those methods, and this flag
+    is specified, the registration operation will return ::NETIO_LINK_DOWN.
+    This flag is ignored if it is specified along with ::NETIO_NO_XMIT and
+    ::NETIO_NO_RECV.  See @ref link for more information on link
+    management.
+ */
+#define NETIO_REQUIRE_LINK_UP    0x00000800
+
+/** This registration call will complete even if the network link is not up.
+    Whenever the link is not up, packets will not be sent or received:
+    netio_get_packet() will return ::NETIO_NOPKT once all queued packets
+    have been drained, and netio_send_packet() and similar routines will
+    return NETIO_QUEUE_FULL once the outgoing packet queue in the EPP
+    or the I/O shim is full.  See @ref link for more information on link
+    management.
+ */
+#define NETIO_NOREQUIRE_LINK_UP  0x00001000
+
+#ifndef __DOXYGEN__
+/*
+ * These are part of the implementation of the NETIO_AUTO_LINK_xxx flags,
+ * but should not be used directly by applications, and are thus not
+ * documented.
+ */
+#define _NETIO_AUTO_UP        0x00002000
+#define _NETIO_AUTO_DN        0x00004000
+#define _NETIO_AUTO_PRESENT   0x00008000
+#endif
+
+/** Set the desired state of the link to up, allowing any speeds which are
+    supported by the link hardware, as part of this registration operation.
+    Do not take down the link automatically.  This is the default if
+    no other NETIO_AUTO_LINK_xxx flags are specified.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_UP     (_NETIO_AUTO_PRESENT | _NETIO_AUTO_UP)
+
+/** Set the desired state of the link to up, allowing any speeds which are
+    supported by the link hardware, as part of this registration operation.
+    Set the desired state of the link to down the next time no tiles are
+    registered for packet reception or transmission.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_UPDN   (_NETIO_AUTO_PRESENT | _NETIO_AUTO_UP | \
+                                _NETIO_AUTO_DN)
+
+/** Set the desired state of the link to down the next time no tiles are
+    registered for packet reception or transmission.  This flag is ignored
+    if it is specified along with ::NETIO_NO_XMIT and ::NETIO_NO_RECV.
+    See @ref link for more information on link management.
+ */
+#define NETIO_AUTO_LINK_DN     (_NETIO_AUTO_PRESENT | _NETIO_AUTO_DN)
+
+/** Do not bring up the link automatically as part of this registration
+    operation.  Do not take down the link automatically.  This flag
+    is ignored if it is specified along with ::NETIO_NO_XMIT and
+    ::NETIO_NO_RECV.  See @ref link for more information on link management.
+  */
+#define NETIO_AUTO_LINK_NONE   _NETIO_AUTO_PRESENT
+
+
+/** Minimum number of receive packets. */
+#define NETIO_MIN_RECEIVE_PKTS            16
+
+/** Lower bound on the maximum number of receive packets; may be higher
+    than this on some interfaces. */
+#define NETIO_MAX_RECEIVE_PKTS           128
+
+/** Maximum number of send buffers, per packet size. */
+#define NETIO_MAX_SEND_BUFFERS            16
+
+/** Number of EPP queue slots, and thus outstanding sends, per EPP. */
+#define NETIO_TOTAL_SENDS_OUTSTANDING   2015
+
+/** Minimum number of EPP queue slots, and thus outstanding sends, per
+ *  transmitting tile. */
+#define NETIO_MIN_SENDS_OUTSTANDING       16
+
+
+/**@}*/
+
+#ifndef __DOXYGEN__
+
+/**
+ * An object for providing Ethernet packets to a process.
+ */
+struct __netio_queue_impl_t;
+
+/**
+ * An object for managing the user end of a NetIO queue.
+ */
+struct __netio_queue_user_impl_t;
+
+#endif /* !__DOXYGEN__ */
+
+
+/** A netio_queue_t describes a NetIO communications endpoint.
+ * @ingroup setup
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  uint8_t opaque[8];                 /**< This is an opaque structure. */
+#else
+  struct __netio_queue_impl_t* __system_part;    /**< The system part. */
+  struct __netio_queue_user_impl_t* __user_part; /**< The user part. */
+#ifdef _NETIO_PTHREAD
+  _netio_percpu_mutex_t lock;                    /**< Queue lock. */
+#endif
+#endif
+}
+netio_queue_t;
+
+
+/**
+ * @brief Packet send context.
+ *
+ * @ingroup egress
+ *
+ * Packet send context for use with netio_send_packet_prepare and _commit.
+ */
+typedef struct
+{
+#ifdef __DOXYGEN__
+  uint8_t opaque[44];   /**< This is an opaque structure. */
+#else
+  uint8_t flags;        /**< Defined below */
+  uint8_t datalen;      /**< Number of valid words pointed to by data. */
+  uint32_t request[9];  /**< Request to be sent to the EPP or shim.  Note
+                             that this is smaller than the 11-word maximum
+                             request size, since some constant values are
+                             not saved in the context. */
+  uint32_t *data;       /**< Data to be sent to the EPP or shim via IDN. */
+#endif
+}
+netio_send_pkt_context_t;
+
+
+#ifndef __DOXYGEN__
+#define SEND_PKT_CTX_USE_EPP   1  /**< We're sending to an EPP. */
+#define SEND_PKT_CTX_SEND_CSUM 2  /**< Request includes a checksum. */
+#endif
+
+/**
+ * @brief Packet vector entry.
+ *
+ * @ingroup egress
+ *
+ * This data structure is used with netio_send_packet_vector() to send multiple
+ * packets with one NetIO call.  The structure should be initialized by
+ * calling netio_pkt_vector_set(), rather than by setting the fields
+ * directly.
+ *
+ * This structure is guaranteed to be a power of two in size, no
+ * bigger than one L2 cache line, and to be aligned modulo its size.
+ */
+typedef struct
+#ifndef __DOXYGEN__
+__attribute__((aligned(8)))
+#endif
+{
+  /** Reserved for use by the user application.  When initialized with
+   *  the netio_set_pkt_vector_entry() function, this field is guaranteed
+   *  to be visible to readers only after all other fields are already
+   *  visible.  This way it can be used as a valid flag or generation
+   *  counter. */
+  uint8_t user_data;
+
+  /* Structure members below this point should not be accessed directly by
+   * applications, as they may change in the future. */
+
+  /** Low 8 bits of the packet address to send.  The high bits are
+   *  acquired from the 'handle' field. */
+  uint8_t buffer_address_low;
+
+  /** Number of bytes to transmit. */
+  uint16_t size;
+
+  /** The raw handle from a netio_pkt_t.  If this is NETIO_PKT_HANDLE_NONE,
+   *  this vector entry will be skipped and no packet will be transmitted. */
+  netio_pkt_handle_t handle;
+}
+netio_pkt_vector_entry_t;
+
+
+/**
+ * @brief Initialize fields in a packet vector entry.
+ *
+ * @ingroup egress
+ *
+ * @param[out] v Pointer to the vector entry to be initialized.
+ * @param[in] pkt Packet to be transmitted when the vector entry is passed to
+ *        netio_send_packet_vector().  Note that the packet's attributes
+ *        (e.g., its L2 offset and length) are captured at the time this
+ *        routine is called; subsequent changes in those attributes will not
+ *        be reflected in the packet which is actually transmitted.
+ *        Changes in the packet's contents, however, will be so reflected.
+ *        If this is NULL, no packet will be transmitted.
+ * @param[in] user_data User data to be set in the vector entry.
+ *        This function guarantees that the "user_data" field will become
+ *        visible to a reader only after all other fields have become visible.
+ *        This allows a structure in a ring buffer to be written and read
+ *        by a polling reader without any locks or other synchronization.
+ */
+static __inline void
+netio_pkt_vector_set(volatile netio_pkt_vector_entry_t* v, netio_pkt_t* pkt,
+                     uint8_t user_data)
+{
+  if (pkt)
+  {
+    if (NETIO_PKT_IS_MINIMAL(pkt))
+    {
+      netio_pkt_minimal_metadata_t* mmd =
+        (netio_pkt_minimal_metadata_t*) &pkt->__metadata;
+      v->buffer_address_low = (uintptr_t) NETIO_PKT_L2_DATA_MM(mmd, pkt) & 0xFF;
+      v->size = NETIO_PKT_L2_LENGTH_MM(mmd, pkt);
+    }
+    else
+    {
+      netio_pkt_metadata_t* mda = &pkt->__metadata;
+      v->buffer_address_low = (uintptr_t) NETIO_PKT_L2_DATA_M(mda, pkt) & 0xFF;
+      v->size = NETIO_PKT_L2_LENGTH_M(mda, pkt);
+    }
+    v->handle.word = pkt->__packet.word;
+  }
+  else
+  {
+    v->handle.word = 0;   /* Set handle to NETIO_PKT_HANDLE_NONE. */
+  }
+
+  __asm__("" : : : "memory");
+
+  v->user_data = user_data;
+}
+
+
+/**
+ * Flags and structures for @ref netio_get() and @ref netio_set().
+ * @ingroup config
+ */
+
+/** @{ */
+/** Parameter class; addr is a NETIO_PARAM_xxx value. */
+#define NETIO_PARAM       0
+/** Interface MAC address. This address is only valid with @ref netio_get().
+ *  The value is a 6-byte MAC address.  Depending upon the overall system
+ *  design, a MAC address may or may not be available for each interface. */
+#define NETIO_PARAM_MAC        0
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, the I/O shim will suspend output when a pause
+ *  frame is received.  If the value is zero, pause frames will be ignored. */
+#define NETIO_PARAM_PAUSE_IN   1
+
+/** Determine whether to send pause frames if the I/O shim packet FIFOs are
+ *  nearly full.  If the value is zero, pause frames are not sent.  If
+ *  the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times. */
+#define NETIO_PARAM_PAUSE_OUT  2
+
+/** Jumbo frame support.  The value is a 4-byte integer.  If the value is
+ *  nonzero, the MAC will accept frames of up to 10240 bytes.  If the value
+ *  is zero, the MAC will only accept frames of up to 1544 bytes. */
+#define NETIO_PARAM_JUMBO      3
+
+/** I/O shim's overflow statistics register.  The value is two 16-bit integers.
+ *  The first 16-bit value (or the low 16 bits, if the value is treated as a
+ *  32-bit number) is the count of packets which were completely dropped and
+ *  not delivered by the shim.  The second 16-bit value (or the high 16 bits,
+ *  if the value is treated as a 32-bit number) is the count of packets
+ *  which were truncated and thus only partially delivered by the shim.  This
+ *  register is automatically reset to zero after it has been read.
+ */
+#define NETIO_PARAM_OVERFLOW   4
+
+/** IPP statistics.  This address is only valid with @ref netio_get().  The
+ *  value is a netio_stat_t structure.  Unlike the I/O shim statistics, the
+ *  IPP statistics are not all reset to zero on read; see the description
+ *  of the netio_stat_t for details. */
+#define NETIO_PARAM_STAT 5
+
+/** Possible link state.  The value is a combination of "NETIO_LINK_xxx"
+ *  flags.  With @ref netio_get(), this will indicate which flags are
+ *  actually supported by the hardware.
+ *
+ *  For historical reasons, specifying this value to netio_set() will have
+ *  the same behavior as using ::NETIO_PARAM_LINK_CONFIG, but this usage is
+ *  discouraged.
+ */
+#define NETIO_PARAM_LINK_POSSIBLE_STATE 6
+
+/** Link configuration. The value is a combination of "NETIO_LINK_xxx" flags.
+ *  With @ref netio_set(), this will attempt to immediately bring up the
+ *  link using whichever of the requested flags are supported by the
+ *  hardware, or take down the link if the flags are zero; if this is
+ *  not possible, an error will be returned.  Many programs will want
+ *  to use ::NETIO_PARAM_LINK_DESIRED_STATE instead.
+ *
+ *  For historical reasons, specifying this value to netio_get() will
+ *  have the same behavior as using ::NETIO_PARAM_LINK_POSSIBLE_STATE,
+ *  but this usage is discouraged.
+ */
+#define NETIO_PARAM_LINK_CONFIG NETIO_PARAM_LINK_POSSIBLE_STATE
+
+/** Current link state. This address is only valid with @ref netio_get().
+ *  The value is zero or more of the "NETIO_LINK_xxx" flags, ORed together.
+ *  If the link is down, the value ANDed with NETIO_LINK_SPEED will be
+ *  zero; if the link is up, the value ANDed with NETIO_LINK_SPEED will
+ *  result in exactly one of the NETIO_LINK_xxx values, indicating the
+ *  current speed. */
+#define NETIO_PARAM_LINK_CURRENT_STATE 7
+
+/** Variant symbol for current state, retained for compatibility with
+ *  pre-MDE-2.1 programs. */
+#define NETIO_PARAM_LINK_STATUS NETIO_PARAM_LINK_CURRENT_STATE
+
+/** Packet Coherence protocol. This address is only valid with @ref netio_get().
+ *  The value is nonzero if the interface is configured for cache-coherent DMA.
+ */
+#define NETIO_PARAM_COHERENT 8
+
+/** Desired link state. The value is a conbination of "NETIO_LINK_xxx"
+ *  flags, which specify the desired state for the link.  With @ref
+ *  netio_set(), this will, in the background, attempt to bring up the link
+ *  using whichever of the requested flags are reasonable, or take down the
+ *  link if the flags are zero.  The actual link up or down operation may
+ *  happen after this call completes.  If the link state changes in the
+ *  future, the system will continue to try to get back to the desired link
+ *  state; for instance, if the link is brought up successfully, and then
+ *  the network cable is disconnected, the link will go down.  However, the
+ *  desired state of the link is still up, so if the cable is reconnected,
+ *  the link will be brought up again.
+ *
+ *  With @ref netio_get(), this will indicate the desired state for the
+ *  link, as set with a previous netio_set() call, or implicitly by a
+ *  netio_input_register() or netio_input_unregister() operation.  This may
+ *  not reflect the current state of the link; to get that, use
+ *  ::NETIO_PARAM_LINK_CURRENT_STATE. */
+#define NETIO_PARAM_LINK_DESIRED_STATE 9
+
+/** NetIO statistics structure.  Retrieved using the ::NETIO_PARAM_STAT
+ *  address passed to @ref netio_get(). */
+typedef struct
+{
+  /** Number of packets which have been received by the IPP and forwarded
+   *  to a tile's receive queue for processing.  This value wraps at its
+   *  maximum, and is not cleared upon read. */
+  uint32_t packets_received;
+
+  /** Number of packets which have been dropped by the IPP, because they could
+   *  not be received, or could not be forwarded to a tile.  The former happens
+   *  when the IPP does not have a free packet buffer of suitable size for an
+   *  incoming frame.  The latter happens when all potential destination tiles
+   *  for a packet, as defined by the group, bucket, and queue configuration,
+   *  have full receive queues.   This value wraps at its maximum, and is not
+   *  cleared upon read. */
+  uint32_t packets_dropped;
+
+  /*
+   * Note: the #defines after each of the following four one-byte values
+   * denote their location within the third word of the netio_stat_t.  They
+   * are intended for use only by the IPP implementation and are thus omitted
+   * from the Doxygen output.
+   */
+
+  /** Number of packets dropped because no worker was able to accept a new
+   *  packet.  This value saturates at its maximum, and is cleared upon
+   *  read. */
+  uint8_t drops_no_worker;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_WORKER   0
+#endif
+
+  /** Number of packets dropped because no small buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_smallbuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_SMALLBUF 1
+#endif
+
+  /** Number of packets dropped because no large buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_largebuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_LARGEBUF 2
+#endif
+
+  /** Number of packets dropped because no jumbo buffers were available.
+   *  This value saturates at its maximum, and is cleared upon read. */
+  uint8_t drops_no_jumbobuf;
+#ifndef __DOXYGEN__
+#define NETIO_STAT_DROPS_NO_JUMBOBUF 3
+#endif
+}
+netio_stat_t;
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define NETIO_LINK_10M         0x01
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define NETIO_LINK_100M        0x02
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define NETIO_LINK_1G          0x04
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define NETIO_LINK_10G         0x08
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define NETIO_LINK_ANYSPEED    0x10
+
+/** All legal link speeds. */
+#define NETIO_LINK_SPEED  (NETIO_LINK_10M  | \
+                           NETIO_LINK_100M | \
+                           NETIO_LINK_1G   | \
+                           NETIO_LINK_10G  | \
+                           NETIO_LINK_ANYSPEED)
+
+
+/** MAC register class.  Addr is a register offset within the MAC.
+ *  Registers within the XGbE and GbE MACs are documented in the Tile
+ *  Processor I/O Device Guide (UG104). MAC registers start at address
+ *  0x4000, and do not include the MAC_INTERFACE registers. */
+#define NETIO_MAC             1
+
+/** MDIO register class (IEEE 802.3 clause 22 format).  Addr is the "addr"
+ *  member of a netio_mdio_addr_t structure. */
+#define NETIO_MDIO            2
+
+/** MDIO register class (IEEE 802.3 clause 45 format).  Addr is the "addr"
+ *  member of a netio_mdio_addr_t structure. */
+#define NETIO_MDIO_CLAUSE45   3
+
+/** NetIO MDIO address type.  Retrieved or provided using the ::NETIO_MDIO
+ *  address passed to @ref netio_get() or @ref netio_set(). */
+typedef union
+{
+  struct
+  {
+    unsigned int reg:16;  /**< MDIO register offset.  For clause 22 access,
+                               must be less than 32. */
+    unsigned int phy:5;   /**< Which MDIO PHY to access. */
+    unsigned int dev:5;   /**< Which MDIO device to access within that PHY.
+                               Applicable for clause 45 access only; ignored
+                               for clause 22 access. */
+  }
+  bits;                   /**< Container for bitfields. */
+  uint64_t addr;          /**< Value to pass to @ref netio_get() or
+                           *   @ref netio_set(). */
+}
+netio_mdio_addr_t;
+
+/** @} */
+
+#endif /* __NETIO_INTF_H__ */
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index 112b1e2..b4c8e8e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_SMP)		+= smpboot.o smp.o tlb.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+obj-$(CONFIG_PCI)		+= pci.o
diff --git a/arch/tile/kernel/backtrace.c b/arch/tile/kernel/backtrace.c
index d3c41c1..55a6a74 100644
--- a/arch/tile/kernel/backtrace.c
+++ b/arch/tile/kernel/backtrace.c
@@ -369,6 +369,10 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location,
 					/* Weird; reserved value, ignore it. */
 					continue;
 				}
+				if (info_operand & ENTRY_POINT_INFO_OP)	{
+					/* This info op is ignored by the backtracer. */
+					continue;
+				}
 
 				/* Skip info ops which are not in the
 				 * "one_ago" mode we want right now.
diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c
index b1e06d0..dbc213a 100644
--- a/arch/tile/kernel/compat.c
+++ b/arch/tile/kernel/compat.c
@@ -21,7 +21,6 @@
 #include <linux/kdev_t.h>
 #include <linux/fs.h>
 #include <linux/fcntl.h>
-#include <linux/smp_lock.h>
 #include <linux/uaccess.h>
 #include <linux/signal.h>
 #include <asm/syscalls.h>
@@ -148,14 +147,20 @@ long tile_compat_sys_msgrcv(int msqid,
 #define compat_sys_readahead sys32_readahead
 #define compat_sys_sync_file_range compat_sys_sync_file_range2
 
-/* The native 64-bit "struct stat" matches the 32-bit "struct stat64". */
-#define compat_sys_stat64 sys_newstat
-#define compat_sys_lstat64 sys_newlstat
-#define compat_sys_fstat64 sys_newfstat
-#define compat_sys_fstatat64 sys_newfstatat
+/* We leverage the "struct stat64" type for 32-bit time_t/nsec. */
+#define compat_sys_stat64 sys_stat64
+#define compat_sys_lstat64 sys_lstat64
+#define compat_sys_fstat64 sys_fstat64
+#define compat_sys_fstatat64 sys_fstatat64
 
-/* Pass full 64-bit values through ptrace. */
-#define compat_sys_ptrace tile_compat_sys_ptrace
+/* The native sys_ptrace dynamically handles compat binaries. */
+#define compat_sys_ptrace sys_ptrace
+
+/* Call the trampolines to manage pt_regs where necessary. */
+#define compat_sys_execve _compat_sys_execve
+#define compat_sys_sigaltstack _compat_sys_sigaltstack
+#define compat_sys_rt_sigreturn _compat_sys_rt_sigreturn
+#define sys_clone _sys_clone
 
 /*
  * Note that we can't include <linux/unistd.h> here since the header
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index 9c710db..dbb0dfc 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -15,7 +15,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/errno.h>
@@ -256,9 +255,9 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
 	return err;
 }
 
-long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			     struct compat_sigaltstack __user *uoss_ptr,
-			     struct pt_regs *regs)
+long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
+			    struct compat_sigaltstack __user *uoss_ptr,
+			    struct pt_regs *regs)
 {
 	stack_t uss, uoss;
 	int ret;
@@ -291,12 +290,12 @@ long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
 	return ret;
 }
 
-long _compat_sys_rt_sigreturn(struct pt_regs *regs)
+/* The assembly shim for this function arranges to ignore the return value. */
+long compat_sys_rt_sigreturn(struct pt_regs *regs)
 {
 	struct compat_rt_sigframe __user *frame =
 		(struct compat_rt_sigframe __user *) compat_ptr(regs->sp);
 	sigset_t set;
-	long r0;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -309,13 +308,13 @@ long _compat_sys_rt_sigreturn(struct pt_regs *regs)
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0))
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
-	if (_compat_sys_sigaltstack(&frame->uc.uc_stack, NULL, regs) != 0)
+	if (compat_sys_sigaltstack(&frame->uc.uc_stack, NULL, regs) != 0)
 		goto badframe;
 
-	return r0;
+	return 0;
 
 badframe:
 	force_sig(SIGSEGV, current);
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index 2c54fd4..493a0e6 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -54,7 +54,7 @@ void early_printk(const char *fmt, ...)
 void early_panic(const char *fmt, ...)
 {
 	va_list ap;
-	raw_local_irq_disable_all();
+	arch_local_irq_disable_all();
 	va_start(ap, fmt);
 	early_printk("Kernel panic - not syncing: ");
 	early_vprintk(fmt, ap);
diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S
index 3d01383..fd8dc42 100644
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -15,7 +15,9 @@
 #include <linux/linkage.h>
 #include <linux/unistd.h>
 #include <asm/irqflags.h>
+#include <asm/processor.h>
 #include <arch/abi.h>
+#include <arch/spr_def.h>
 
 #ifdef __tilegx__
 #define bnzt bnezt
@@ -25,28 +27,6 @@ STD_ENTRY(current_text_addr)
 	{ move r0, lr; jrp lr }
 	STD_ENDPROC(current_text_addr)
 
-STD_ENTRY(_sim_syscall)
-	/*
-	 * Wait for r0-r9 to be ready (and lr on the off chance we
-	 * want the syscall to locate its caller), then make a magic
-	 * simulator syscall.
-	 *
-	 * We carefully stall until the registers are readable in case they
-	 * are the target of a slow load, etc. so that tile-sim will
-	 * definitely be able to read all of them inside the magic syscall.
-	 *
-	 * Technically this is wrong for r3-r9 and lr, since an interrupt
-	 * could come in and restore the registers with a slow load right
-	 * before executing the mtspr. We may need to modify tile-sim to
-	 * explicitly stall for this case, but we do not yet have
-	 * a way to implement such a stall.
-	 */
-	{ and zero, lr, r9 ; and zero, r8, r7 }
-	{ and zero, r6, r5 ; and zero, r4, r3 }
-	{ and zero, r2, r1 ; mtspr SIM_CONTROL, r0 }
-	{ jrp lr }
-	STD_ENDPROC(_sim_syscall)
-
 /*
  * Implement execve().  The i386 code has a note that forking from kernel
  * space results in no copy on write until the execve, so we should be
@@ -102,7 +82,7 @@ STD_ENTRY(KBacktraceIterator_init_current)
 STD_ENTRY(cpu_idle_on_new_stack)
 	{
 	 move sp, r1
-	 mtspr SYSTEM_SAVE_1_0, r2
+	 mtspr SPR_SYSTEM_SAVE_K_0, r2
 	}
 	jal free_thread_info
 	j cpu_idle
@@ -124,15 +104,15 @@ STD_ENTRY(smp_nap)
 STD_ENTRY(_cpu_idle)
 	{
 	 lnk r0
-	 movei r1, 1
+	 movei r1, KERNEL_PL
 	}
 	{
 	 addli r0, r0, _cpu_idle_nap - .
 	 mtspr INTERRUPT_CRITICAL_SECTION, r1
 	}
-	IRQ_ENABLE(r2, r3)         /* unmask, but still with ICS set */
-	mtspr EX_CONTEXT_1_1, r1   /* PL1, ICS clear */
-	mtspr EX_CONTEXT_1_0, r0
+	IRQ_ENABLE(r2, r3)             /* unmask, but still with ICS set */
+	mtspr SPR_EX_CONTEXT_K_1, r1   /* Kernel PL, ICS clear */
+	mtspr SPR_EX_CONTEXT_K_0, r0
 	iret
 	.global _cpu_idle_nap
 _cpu_idle_nap:
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 584b965..e910530 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -151,12 +151,12 @@ enum direction_protect {
 
 static void enable_firewall_interrupts(void)
 {
-	raw_local_irq_unmask_now(INT_UDN_FIREWALL);
+	arch_local_irq_unmask_now(INT_UDN_FIREWALL);
 }
 
 static void disable_firewall_interrupts(void)
 {
-	raw_local_irq_mask_now(INT_UDN_FIREWALL);
+	arch_local_irq_mask_now(INT_UDN_FIREWALL);
 }
 
 /* Set up hardwall on this cpu based on the passed hardwall_info. */
@@ -768,6 +768,7 @@ static int hardwall_release(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations dev_hardwall_fops = {
+	.open           = nonseekable_open,
 	.unlocked_ioctl = hardwall_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = hardwall_compat_ioctl,
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index 2b4f6c0..90e7c44 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -23,6 +23,7 @@
 #include <asm/asm-offsets.h>
 #include <hv/hypervisor.h>
 #include <arch/chip.h>
+#include <arch/spr_def.h>
 
 /*
  * This module contains the entry code for kernel images. It performs the
@@ -76,7 +77,7 @@ ENTRY(_start)
 	}
 1:
 
-	/* Get our processor number and save it away in SAVE_1_0. */
+	/* Get our processor number and save it away in SAVE_K_0. */
 	jal hv_inquire_topology
 	mulll_uu r4, r1, r2        /* r1 == y, r2 == width */
 	add r4, r4, r0             /* r0 == x, so r4 == cpu == y*width + x */
@@ -124,7 +125,7 @@ ENTRY(_start)
 	lw r0, r0
 	lw sp, r1
 	or r4, sp, r4
-	mtspr SYSTEM_SAVE_1_0, r4  /* save ksp0 + cpu */
+	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
 	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index 8f58bdf..5eed4a0 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -32,8 +32,8 @@
 # error "No support for kernel preemption currently"
 #endif
 
-#if INT_INTCTRL_1 < 32 || INT_INTCTRL_1 >= 48
-# error INT_INTCTRL_1 coded to set high interrupt mask
+#if INT_INTCTRL_K < 32 || INT_INTCTRL_K >= 48
+# error INT_INTCTRL_K coded to set high interrupt mask
 #endif
 
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
@@ -132,8 +132,8 @@ intvec_\vecname:
 
 	/* Temporarily save a register so we have somewhere to work. */
 
-	mtspr   SYSTEM_SAVE_1_1, r0
-	mfspr   r0, EX_CONTEXT_1_1
+	mtspr   SPR_SYSTEM_SAVE_K_1, r0
+	mfspr   r0, SPR_EX_CONTEXT_K_1
 
 	/* The cmpxchg code clears sp to force us to reset it here on fault. */
 	{
@@ -167,18 +167,18 @@ intvec_\vecname:
 	 * The page_fault handler may be downcalled directly by the
 	 * hypervisor even when Linux is running and has ICS set.
 	 *
-	 * In this case the contents of EX_CONTEXT_1_1 reflect the
+	 * In this case the contents of EX_CONTEXT_K_1 reflect the
 	 * previous fault and can't be relied on to choose whether or
 	 * not to reinitialize the stack pointer.  So we add a test
-	 * to see whether SYSTEM_SAVE_1_2 has the high bit set,
+	 * to see whether SYSTEM_SAVE_K_2 has the high bit set,
 	 * and if so we don't reinitialize sp, since we must be coming
 	 * from Linux.  (In fact the precise case is !(val & ~1),
 	 * but any Linux PC has to have the high bit set.)
 	 *
-	 * Note that the hypervisor *always* sets SYSTEM_SAVE_1_2 for
+	 * Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
 	 * any path that turns into a downcall to one of our TLB handlers.
 	 */
-	mfspr   r0, SYSTEM_SAVE_1_2
+	mfspr   r0, SPR_SYSTEM_SAVE_K_2
 	{
 	 blz    r0, 0f    /* high bit in S_S_1_2 is for a PC to use */
 	 move   r0, sp
@@ -187,12 +187,12 @@ intvec_\vecname:
 
 2:
 	/*
-	 * SYSTEM_SAVE_1_0 holds the cpu number in the low bits, and
+	 * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and
 	 * the current stack top in the higher bits.  So we recover
 	 * our stack top by just masking off the low bits, then
 	 * point sp at the top aligned address on the actual stack page.
 	 */
-	mfspr   r0, SYSTEM_SAVE_1_0
+	mfspr   r0, SPR_SYSTEM_SAVE_K_0
 	mm      r0, r0, zero, LOG2_THREAD_SIZE, 31
 
 0:
@@ -254,7 +254,7 @@ intvec_\vecname:
 	 sw     sp, r3
 	 addli  sp, sp, PTREGS_OFFSET_PC - PTREGS_OFFSET_REG(3)
 	}
-	mfspr   r0, EX_CONTEXT_1_0
+	mfspr   r0, SPR_EX_CONTEXT_K_0
 	.ifc \processing,handle_syscall
 	/*
 	 * Bump the saved PC by one bundle so that when we return, we won't
@@ -267,7 +267,7 @@ intvec_\vecname:
 	 sw     sp, r0
 	 addli  sp, sp, PTREGS_OFFSET_EX1 - PTREGS_OFFSET_PC
 	}
-	mfspr   r0, EX_CONTEXT_1_1
+	mfspr   r0, SPR_EX_CONTEXT_K_1
 	{
 	 sw     sp, r0
 	 addi   sp, sp, PTREGS_OFFSET_FAULTNUM - PTREGS_OFFSET_EX1
@@ -289,7 +289,7 @@ intvec_\vecname:
 	 .endif
 	 addli  sp, sp, PTREGS_OFFSET_REG(0) - PTREGS_OFFSET_FAULTNUM
 	}
-	mfspr   r0, SYSTEM_SAVE_1_1    /* Original r0 */
+	mfspr   r0, SPR_SYSTEM_SAVE_K_1    /* Original r0 */
 	{
 	 sw     sp, r0
 	 addi   sp, sp, -PTREGS_OFFSET_REG(0) - 4
@@ -309,12 +309,12 @@ intvec_\vecname:
 	 * See discussion below at "finish_interrupt_save".
 	 */
 	.ifc \c_routine, do_page_fault
-	mfspr   r2, SYSTEM_SAVE_1_3   /* address of page fault */
-	mfspr   r3, SYSTEM_SAVE_1_2   /* info about page fault */
+	mfspr   r2, SPR_SYSTEM_SAVE_K_3   /* address of page fault */
+	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
 	.else
 	.ifc \vecnum, INT_DOUBLE_FAULT
 	{
-	 mfspr  r2, SYSTEM_SAVE_1_2   /* double fault info from HV */
+	 mfspr  r2, SPR_SYSTEM_SAVE_K_2   /* double fault info from HV */
 	 movei  r3, 0
 	}
 	.else
@@ -467,7 +467,7 @@ intvec_\vecname:
 	/* Load tp with our per-cpu offset. */
 #ifdef CONFIG_SMP
 	{
-	 mfspr  r20, SYSTEM_SAVE_1_0
+	 mfspr  r20, SPR_SYSTEM_SAVE_K_0
 	 moveli r21, lo16(__per_cpu_offset)
 	}
 	{
@@ -487,7 +487,7 @@ intvec_\vecname:
 	 * We load flags in r32 here so we can jump to .Lrestore_regs
 	 * directly after do_page_fault_ics() if necessary.
 	 */
-	mfspr   r32, EX_CONTEXT_1_1
+	mfspr   r32, SPR_EX_CONTEXT_K_1
 	{
 	 andi   r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
 	 PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
@@ -957,11 +957,11 @@ STD_ENTRY(interrupt_return)
 	pop_reg_zero r21, r3, sp, PTREGS_OFFSET_EX1 - PTREGS_OFFSET_PC
 	pop_reg_zero lr, r4, sp, PTREGS_OFFSET_REG(52) - PTREGS_OFFSET_EX1
 	{
-	 mtspr  EX_CONTEXT_1_0, r21
+	 mtspr  SPR_EX_CONTEXT_K_0, r21
 	 move   r5, zero
 	}
 	{
-	 mtspr  EX_CONTEXT_1_1, lr
+	 mtspr  SPR_EX_CONTEXT_K_1, lr
 	 andi   lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
 	}
 
@@ -1020,7 +1020,7 @@ STD_ENTRY(interrupt_return)
 
 	/* Set r1 to errno if we are returning an error, otherwise zero. */
 	{
-	 moveli r29, 1024
+	 moveli r29, 4096
 	 sub    r1, zero, r0
 	}
 	slt_u   r29, r1, r29
@@ -1199,7 +1199,7 @@ STD_ENTRY(interrupt_return)
 	STD_ENDPROC(interrupt_return)
 
 	/*
-	 * This interrupt variant clears the INT_INTCTRL_1 interrupt mask bit
+	 * This interrupt variant clears the INT_INTCTRL_K interrupt mask bit
 	 * before returning, so we can properly get more downcalls.
 	 */
 	.pushsection .text.handle_interrupt_downcall,"ax"
@@ -1208,11 +1208,11 @@ handle_interrupt_downcall:
 	check_single_stepping normal, .Ldispatch_downcall
 .Ldispatch_downcall:
 
-	/* Clear INTCTRL_1 from the set of interrupts we ever enable. */
+	/* Clear INTCTRL_K from the set of interrupts we ever enable. */
 	GET_INTERRUPTS_ENABLED_MASK_PTR(r30)
 	{
 	 addi   r30, r30, 4
-	 movei  r31, INT_MASK(INT_INTCTRL_1)
+	 movei  r31, INT_MASK(INT_INTCTRL_K)
 	}
 	{
 	 lw     r20, r30
@@ -1227,7 +1227,7 @@ handle_interrupt_downcall:
 	}
 	FEEDBACK_REENTER(handle_interrupt_downcall)
 
-	/* Allow INTCTRL_1 to be enabled next time we enable interrupts. */
+	/* Allow INTCTRL_K to be enabled next time we enable interrupts. */
 	lw      r20, r30
 	or      r20, r20, r31
 	sw      r30, r20
@@ -1342,8 +1342,8 @@ handle_syscall:
 	lw      r20, r20
 
 	/* Jump to syscall handler. */
-	jalr    r20; .Lhandle_syscall_link:
-	FEEDBACK_REENTER(handle_syscall)
+	jalr    r20
+.Lhandle_syscall_link: /* value of "lr" after "jalr r20" above */
 
 	/*
 	 * Write our r0 onto the stack so it gets restored instead
@@ -1352,6 +1352,9 @@ handle_syscall:
 	PTREGS_PTR(r29, PTREGS_OFFSET_REG(0))
 	sw      r29, r0
 
+.Lsyscall_sigreturn_skip:
+	FEEDBACK_REENTER(handle_syscall)
+
 	/* Do syscall trace again, if requested. */
 	lw	r30, r31
 	andi    r30, r30, _TIF_SYSCALL_TRACE
@@ -1472,7 +1475,12 @@ handle_ill:
 	lw      r26, r24
 	sw      r28, r26
 
-	/* Clear TIF_SINGLESTEP */
+	/*
+	 * Clear TIF_SINGLESTEP to prevent recursion if we execute an ill.
+	 * The normal non-arch flow redundantly clears TIF_SINGLESTEP, but we
+	 * need to clear it here and can't really impose on all other arches.
+	 * So what's another write between friends?
+	 */
 	GET_THREAD_INFO(r0)
 
 	addi    r1, r0, THREAD_INFO_FLAGS_OFFSET
@@ -1509,7 +1517,7 @@ handle_ill:
 /* Various stub interrupt handlers and syscall handlers */
 
 STD_ENTRY_LOCAL(_kernel_double_fault)
-	mfspr   r1, EX_CONTEXT_1_0
+	mfspr   r1, SPR_EX_CONTEXT_K_0
 	move    r2, lr
 	move    r3, sp
 	move    r4, r52
@@ -1518,34 +1526,44 @@ STD_ENTRY_LOCAL(_kernel_double_fault)
 	STD_ENDPROC(_kernel_double_fault)
 
 STD_ENTRY_LOCAL(bad_intr)
-	mfspr   r2, EX_CONTEXT_1_0
+	mfspr   r2, SPR_EX_CONTEXT_K_0
 	panic   "Unhandled interrupt %#x: PC %#lx"
 	STD_ENDPROC(bad_intr)
 
 /* Put address of pt_regs in reg and jump. */
 #define PTREGS_SYSCALL(x, reg)                          \
-	STD_ENTRY(x);                                   \
+	STD_ENTRY(_##x);                                \
 	{                                               \
 	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
-	 j      _##x                                    \
+	 j      x                                       \
 	};                                              \
-	STD_ENDPROC(x)
+	STD_ENDPROC(_##x)
+
+/*
+ * Special-case sigreturn to not write r0 to the stack on return.
+ * This is technically more efficient, but it also avoids difficulties
+ * in the 64-bit OS when handling 32-bit compat code, since we must not
+ * sign-extend r0 for the sigreturn return-value case.
+ */
+#define PTREGS_SYSCALL_SIGRETURN(x, reg)                \
+	STD_ENTRY(_##x);                                \
+	addli   lr, lr, .Lsyscall_sigreturn_skip - .Lhandle_syscall_link; \
+	{                                               \
+	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
+	 j      x                                       \
+	};                                              \
+	STD_ENDPROC(_##x)
 
 PTREGS_SYSCALL(sys_execve, r3)
 PTREGS_SYSCALL(sys_sigaltstack, r2)
-PTREGS_SYSCALL(sys_rt_sigreturn, r0)
+PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0)
+PTREGS_SYSCALL(sys_cmpxchg_badaddr, r1)
 
-/* Save additional callee-saves to pt_regs, put address in reg and jump. */
-#define PTREGS_SYSCALL_ALL_REGS(x, reg)                 \
-	STD_ENTRY(x);                                   \
-	push_extra_callee_saves reg;                    \
-	j       _##x;                                   \
-	STD_ENDPROC(x)
-
-PTREGS_SYSCALL_ALL_REGS(sys_fork, r0)
-PTREGS_SYSCALL_ALL_REGS(sys_vfork, r0)
-PTREGS_SYSCALL_ALL_REGS(sys_clone, r4)
-PTREGS_SYSCALL_ALL_REGS(sys_cmpxchg_badaddr, r1)
+/* Save additional callee-saves to pt_regs, put address in r4 and jump. */
+STD_ENTRY(_sys_clone)
+	push_extra_callee_saves r4
+	j       sys_clone
+	STD_ENDPROC(_sys_clone)
 
 /*
  * This entrypoint is taken for the cmpxchg and atomic_update fast
@@ -1558,12 +1576,14 @@ PTREGS_SYSCALL_ALL_REGS(sys_cmpxchg_badaddr, r1)
  * to be available to it on entry.  It does not modify any callee-save
  * registers (including "lr").  It does not check what PL it is being
  * called at, so you'd better not call it other than at PL0.
+ * The <atomic.h> wrapper assumes it only clobbers r20-r29, so if
+ * it ever is necessary to use more registers, be aware.
  *
  * It does not use the stack, but since it might be re-interrupted by
  * a page fault which would assume the stack was valid, it does
  * save/restore the stack pointer and zero it out to make sure it gets reset.
  * Since we always keep interrupts disabled, the hypervisor won't
- * clobber our EX_CONTEXT_1_x registers, so we don't save/restore them
+ * clobber our EX_CONTEXT_K_x registers, so we don't save/restore them
  * (other than to advance the PC on return).
  *
  * We have to manually validate the user vs kernel address range
@@ -1769,7 +1789,7 @@ ENTRY(sys_cmpxchg)
 	/* Do slow mtspr here so the following "mf" waits less. */
 	{
 	 move   sp, r27
-	 mtspr  EX_CONTEXT_1_0, r28
+	 mtspr  SPR_EX_CONTEXT_K_0, r28
 	}
 	mf
 
@@ -1788,7 +1808,7 @@ ENTRY(sys_cmpxchg)
 	}
 	{
 	 move   sp, r27
-	 mtspr  EX_CONTEXT_1_0, r28
+	 mtspr  SPR_EX_CONTEXT_K_0, r28
 	}
 	iret
 
@@ -1816,7 +1836,7 @@ ENTRY(sys_cmpxchg)
 #endif
 
 	/* Issue the slow SPR here while the tns result is in flight. */
-	mfspr   r28, EX_CONTEXT_1_0
+	mfspr   r28, SPR_EX_CONTEXT_K_0
 
 	{
 	 addi   r28, r28, 8    /* return to the instruction after the swint1 */
@@ -1904,7 +1924,7 @@ ENTRY(sys_cmpxchg)
 .Lcmpxchg64_mismatch:
 	{
 	 move   sp, r27
-	 mtspr  EX_CONTEXT_1_0, r28
+	 mtspr  SPR_EX_CONTEXT_K_0, r28
 	}
 	mf
 	{
@@ -1985,8 +2005,13 @@ int_unalign:
 	int_hand     INT_PERF_COUNT, PERF_COUNT, \
 		     op_handle_perf_interrupt, handle_nmi
 	int_hand     INT_INTCTRL_3, INTCTRL_3, bad_intr
+#if CONFIG_KERNEL_PL == 2
+	dc_dispatch  INT_INTCTRL_2, INTCTRL_2
+	int_hand     INT_INTCTRL_1, INTCTRL_1, bad_intr
+#else
 	int_hand     INT_INTCTRL_2, INTCTRL_2, bad_intr
 	dc_dispatch  INT_INTCTRL_1, INTCTRL_1
+#endif
 	int_hand     INT_INTCTRL_0, INTCTRL_0, bad_intr
 	int_hand     INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
 		     hv_message_intr, handle_interrupt_downcall
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 9a27d56..128805e 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -26,7 +26,7 @@
 #define IS_HW_CLEARED 1
 
 /*
- * The set of interrupts we enable for raw_local_irq_enable().
+ * The set of interrupts we enable for arch_local_irq_enable().
  * This is initialized to have just a single interrupt that the kernel
  * doesn't actually use as a sentinel.  During kernel init,
  * interrupts are added as the kernel gets prepared to support them.
@@ -61,9 +61,9 @@ static DEFINE_SPINLOCK(available_irqs_lock);
 
 #if CHIP_HAS_IPI()
 /* Use SPRs to manipulate device interrupts. */
-#define mask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_SET_1, irq_mask)
-#define unmask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_RESET_1, irq_mask)
-#define clear_irqs(irq_mask) __insn_mtspr(SPR_IPI_EVENT_RESET_1, irq_mask)
+#define mask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_SET_K, irq_mask)
+#define unmask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_RESET_K, irq_mask)
+#define clear_irqs(irq_mask) __insn_mtspr(SPR_IPI_EVENT_RESET_K, irq_mask)
 #else
 /* Use HV to manipulate device interrupts. */
 #define mask_irqs(irq_mask) hv_disable_intr(irq_mask)
@@ -89,16 +89,16 @@ void tile_dev_intr(struct pt_regs *regs, int intnum)
 	 * masked by a previous interrupt.  Then, mask out the ones
 	 * we're going to handle.
 	 */
-	unsigned long masked = __insn_mfspr(SPR_IPI_MASK_1);
-	original_irqs = __insn_mfspr(SPR_IPI_EVENT_1) & ~masked;
-	__insn_mtspr(SPR_IPI_MASK_SET_1, original_irqs);
+	unsigned long masked = __insn_mfspr(SPR_IPI_MASK_K);
+	original_irqs = __insn_mfspr(SPR_IPI_EVENT_K) & ~masked;
+	__insn_mtspr(SPR_IPI_MASK_SET_K, original_irqs);
 #else
 	/*
 	 * Hypervisor performs the equivalent of the Gx code above and
 	 * then puts the pending interrupt mask into a system save reg
 	 * for us to find.
 	 */
-	original_irqs = __insn_mfspr(SPR_SYSTEM_SAVE_1_3);
+	original_irqs = __insn_mfspr(SPR_SYSTEM_SAVE_K_3);
 #endif
 	remaining_irqs = original_irqs;
 
@@ -225,7 +225,7 @@ void __cpuinit setup_irq_regs(void)
 	/* Enable interrupt delivery. */
 	unmask_irqs(~0UL);
 #if CHIP_HAS_IPI()
-	raw_local_irq_unmask(INT_IPI_1);
+	arch_local_irq_unmask(INT_IPI_K);
 #endif
 }
 
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index ba7a265..0d8b9e9 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -182,13 +182,13 @@ static void kexec_find_and_set_command_line(struct kimage *image)
 
 		if ((entry & IND_SOURCE)) {
 			void *va =
-				kmap_atomic_pfn(entry >> PAGE_SHIFT, KM_USER0);
+				kmap_atomic_pfn(entry >> PAGE_SHIFT);
 			r = kexec_bn2cl(va);
 			if (r) {
 				command_line = r;
 				break;
 			}
-			kunmap_atomic(va, KM_USER0);
+			kunmap_atomic(va);
 		}
 	}
 
@@ -198,7 +198,7 @@ static void kexec_find_and_set_command_line(struct kimage *image)
 
 		hverr = hv_set_command_line(
 			(HV_VirtAddr) command_line, strlen(command_line));
-		kunmap_atomic(command_line, KM_USER0);
+		kunmap_atomic(command_line);
 	} else {
 		pr_info("%s: no command line found; making empty\n",
 		       __func__);
diff --git a/arch/tile/kernel/messaging.c b/arch/tile/kernel/messaging.c
index 6d23ed2..0858ee6 100644
--- a/arch/tile/kernel/messaging.c
+++ b/arch/tile/kernel/messaging.c
@@ -34,7 +34,7 @@ void __cpuinit init_messaging(void)
 		panic("hv_register_message_state: error %d", rc);
 
 	/* Make sure downcall interrupts will be enabled. */
-	raw_local_irq_unmask(INT_INTCTRL_1);
+	arch_local_irq_unmask(INT_INTCTRL_K);
 }
 
 void hv_message_intr(struct pt_regs *regs, int intnum)
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
new file mode 100644
index 0000000..a1ee25b
--- /dev/null
+++ b/arch/tile/kernel/pci.c
@@ -0,0 +1,621 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/byteorder.h>
+#include <asm/hv_driver.h>
+#include <hv/drv_pcie_rc_intf.h>
+
+
+/*
+ * Initialization flow and process
+ * -------------------------------
+ *
+ * This files containes the routines to search for PCI buses,
+ * enumerate the buses, and configure any attached devices.
+ *
+ * There are two entry points here:
+ * 1) tile_pci_init
+ *    This sets up the pci_controller structs, and opens the
+ *    FDs to the hypervisor.  This is called from setup_arch() early
+ *    in the boot process.
+ * 2) pcibios_init
+ *    This probes the PCI bus(es) for any attached hardware.  It's
+ *    called by subsys_initcall.  All of the real work is done by the
+ *    generic Linux PCI layer.
+ *
+ */
+
+/*
+ * This flag tells if the platform is TILEmpower that needs
+ * special configuration for the PLX switch chip.
+ */
+int __write_once tile_plx_gen1;
+
+static struct pci_controller controllers[TILE_NUM_PCIE];
+static int num_controllers;
+
+static struct pci_ops tile_cfg_ops;
+
+
+/*
+ * We don't need to worry about the alignment of resources.
+ */
+resource_size_t pcibios_align_resource(void *data, const struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	return res->start;
+}
+EXPORT_SYMBOL(pcibios_align_resource);
+
+/*
+ * Open a FD to the hypervisor PCI device.
+ *
+ * controller_id is the controller number, config type is 0 or 1 for
+ * config0 or config1 operations.
+ */
+static int __init tile_pcie_open(int controller_id, int config_type)
+{
+	char filename[32];
+	int fd;
+
+	sprintf(filename, "pcie/%d/config%d", controller_id, config_type);
+
+	fd = hv_dev_open((HV_VirtAddr)filename, 0);
+
+	return fd;
+}
+
+
+/*
+ * Get the IRQ numbers from the HV and set up the handlers for them.
+ */
+static int __init tile_init_irqs(int controller_id,
+				 struct pci_controller *controller)
+{
+	char filename[32];
+	int fd;
+	int ret;
+	int x;
+	struct pcie_rc_config rc_config;
+
+	sprintf(filename, "pcie/%d/ctl", controller_id);
+	fd = hv_dev_open((HV_VirtAddr)filename, 0);
+	if (fd < 0) {
+		pr_err("PCI: hv_dev_open(%s) failed\n", filename);
+		return -1;
+	}
+	ret = hv_dev_pread(fd, 0, (HV_VirtAddr)(&rc_config),
+			   sizeof(rc_config), PCIE_RC_CONFIG_MASK_OFF);
+	hv_dev_close(fd);
+	if (ret != sizeof(rc_config)) {
+		pr_err("PCI: wanted %zd bytes, got %d\n",
+		       sizeof(rc_config), ret);
+		return -1;
+	}
+	/* Record irq_base so that we can map INTx to IRQ # later. */
+	controller->irq_base = rc_config.intr;
+
+	for (x = 0; x < 4; x++)
+		tile_irq_activate(rc_config.intr + x,
+				  TILE_IRQ_HW_CLEAR);
+
+	if (rc_config.plx_gen1)
+		controller->plx_gen1 = 1;
+
+	return 0;
+}
+
+/*
+ * First initialization entry point, called from setup_arch().
+ *
+ * Find valid controllers and fill in pci_controller structs for each
+ * of them.
+ *
+ * Returns the number of controllers discovered.
+ */
+int __init tile_pci_init(void)
+{
+	int i;
+
+	pr_info("PCI: Searching for controllers...\n");
+
+	/* Do any configuration we need before using the PCIe */
+
+	for (i = 0; i < TILE_NUM_PCIE; i++) {
+		int hv_cfg_fd0 = -1;
+		int hv_cfg_fd1 = -1;
+		int hv_mem_fd = -1;
+		char name[32];
+		struct pci_controller *controller;
+
+		/*
+		 * Open the fd to the HV.  If it fails then this
+		 * device doesn't exist.
+		 */
+		hv_cfg_fd0 = tile_pcie_open(i, 0);
+		if (hv_cfg_fd0 < 0)
+			continue;
+		hv_cfg_fd1 = tile_pcie_open(i, 1);
+		if (hv_cfg_fd1 < 0) {
+			pr_err("PCI: Couldn't open config fd to HV "
+			    "for controller %d\n", i);
+			goto err_cont;
+		}
+
+		sprintf(name, "pcie/%d/mem", i);
+		hv_mem_fd = hv_dev_open((HV_VirtAddr)name, 0);
+		if (hv_mem_fd < 0) {
+			pr_err("PCI: Could not open mem fd to HV!\n");
+			goto err_cont;
+		}
+
+		pr_info("PCI: Found PCI controller #%d\n", i);
+
+		controller = &controllers[num_controllers];
+
+		if (tile_init_irqs(i, controller)) {
+			pr_err("PCI: Could not initialize "
+			       "IRQs, aborting.\n");
+			goto err_cont;
+		}
+
+		controller->index = num_controllers;
+		controller->hv_cfg_fd[0] = hv_cfg_fd0;
+		controller->hv_cfg_fd[1] = hv_cfg_fd1;
+		controller->hv_mem_fd = hv_mem_fd;
+		controller->first_busno = 0;
+		controller->last_busno = 0xff;
+		controller->ops = &tile_cfg_ops;
+
+		num_controllers++;
+		continue;
+
+err_cont:
+		if (hv_cfg_fd0 >= 0)
+			hv_dev_close(hv_cfg_fd0);
+		if (hv_cfg_fd1 >= 0)
+			hv_dev_close(hv_cfg_fd1);
+		if (hv_mem_fd >= 0)
+			hv_dev_close(hv_mem_fd);
+		continue;
+	}
+
+	/*
+	 * Before using the PCIe, see if we need to do any platform-specific
+	 * configuration, such as the PLX switch Gen 1 issue on TILEmpower.
+	 */
+	for (i = 0; i < num_controllers; i++) {
+		struct pci_controller *controller = &controllers[i];
+
+		if (controller->plx_gen1)
+			tile_plx_gen1 = 1;
+	}
+
+	return num_controllers;
+}
+
+/*
+ * (pin - 1) converts from the PCI standard's [1:4] convention to
+ * a normal [0:3] range.
+ */
+static int tile_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
+{
+	struct pci_controller *controller =
+		(struct pci_controller *)dev->sysdata;
+	return (pin - 1) + controller->irq_base;
+}
+
+
+static void __init fixup_read_and_payload_sizes(void)
+{
+	struct pci_dev *dev = NULL;
+	int smallest_max_payload = 0x1; /* Tile maxes out at 256 bytes. */
+	int max_read_size = 0x2; /* Limit to 512 byte reads. */
+	u16 new_values;
+
+	/* Scan for the smallest maximum payload size. */
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		int pcie_caps_offset;
+		u32 devcap;
+		int max_payload;
+
+		pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP);
+		if (pcie_caps_offset == 0)
+			continue;
+
+		pci_read_config_dword(dev, pcie_caps_offset + PCI_EXP_DEVCAP,
+				      &devcap);
+		max_payload = devcap & PCI_EXP_DEVCAP_PAYLOAD;
+		if (max_payload < smallest_max_payload)
+			smallest_max_payload = max_payload;
+	}
+
+	/* Now, set the max_payload_size for all devices to that value. */
+	new_values = (max_read_size << 12) | (smallest_max_payload << 5);
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		int pcie_caps_offset;
+		u16 devctl;
+
+		pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP);
+		if (pcie_caps_offset == 0)
+			continue;
+
+		pci_read_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL,
+				     &devctl);
+		devctl &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ);
+		devctl |= new_values;
+		pci_write_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL,
+				      devctl);
+	}
+}
+
+
+/*
+ * Second PCI initialization entry point, called by subsys_initcall.
+ *
+ * The controllers have been set up by the time we get here, by a call to
+ * tile_pci_init.
+ */
+static int __init pcibios_init(void)
+{
+	int i;
+
+	pr_info("PCI: Probing PCI hardware\n");
+
+	/*
+	 * Delay a bit in case devices aren't ready.  Some devices are
+	 * known to require at least 20ms here, but we use a more
+	 * conservative value.
+	 */
+	mdelay(250);
+
+	/* Scan all of the recorded PCI controllers.  */
+	for (i = 0; i < num_controllers; i++) {
+		struct pci_controller *controller = &controllers[i];
+		struct pci_bus *bus;
+
+		pr_info("PCI: initializing controller #%d\n", i);
+
+		/*
+		 * This comes from the generic Linux PCI driver.
+		 *
+		 * It reads the PCI tree for this bus into the Linux
+		 * data structures.
+		 *
+		 * This is inlined in linux/pci.h and calls into
+		 * pci_scan_bus_parented() in probe.c.
+		 */
+		bus = pci_scan_bus(0, controller->ops, controller);
+		controller->root_bus = bus;
+		controller->last_busno = bus->subordinate;
+
+	}
+
+	/* Do machine dependent PCI interrupt routing */
+	pci_fixup_irqs(pci_common_swizzle, tile_map_irq);
+
+	/*
+	 * This comes from the generic Linux PCI driver.
+	 *
+	 * It allocates all of the resources (I/O memory, etc)
+	 * associated with the devices read in above.
+	 */
+
+	pci_assign_unassigned_resources();
+
+	/* Configure the max_read_size and max_payload_size values. */
+	fixup_read_and_payload_sizes();
+
+	/* Record the I/O resources in the PCI controller structure. */
+	for (i = 0; i < num_controllers; i++) {
+		struct pci_bus *root_bus = controllers[i].root_bus;
+		struct pci_bus *next_bus;
+		struct pci_dev *dev;
+
+		list_for_each_entry(dev, &root_bus->devices, bus_list) {
+			/* Find the PCI host controller, ie. the 1st bridge. */
+			if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
+				(PCI_SLOT(dev->devfn) == 0)) {
+				next_bus = dev->subordinate;
+				controllers[i].mem_resources[0] =
+					*next_bus->resource[0];
+				controllers[i].mem_resources[1] =
+					 *next_bus->resource[1];
+				controllers[i].mem_resources[2] =
+					 *next_bus->resource[2];
+
+				break;
+			}
+		}
+
+	}
+
+	return 0;
+}
+subsys_initcall(pcibios_init);
+
+/*
+ * No bus fixups needed.
+ */
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+{
+	/* Nothing needs to be done. */
+}
+
+/*
+ * This can be called from the generic PCI layer, but doesn't need to
+ * do anything.
+ */
+char __devinit *pcibios_setup(char *str)
+{
+	/* Nothing needs to be done. */
+	return str;
+}
+
+/*
+ * This is called from the generic Linux layer.
+ */
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+/*
+ * Enable memory and/or address decoding, as appropriate, for the
+ * device described by the 'dev' struct.
+ *
+ * This is called from the generic PCI layer, and can be called
+ * for bridges or endpoints.
+ */
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	u8 header_type;
+	int i;
+	struct resource *r;
+
+	pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	if ((header_type & 0x7F) == PCI_HEADER_TYPE_BRIDGE) {
+		/*
+		 * For bridges, we enable both memory and I/O decoding
+		 * in call cases.
+		 */
+		cmd |= PCI_COMMAND_IO;
+		cmd |= PCI_COMMAND_MEMORY;
+	} else {
+		/*
+		 * For endpoints, we enable memory and/or I/O decoding
+		 * only if they have a memory resource of that type.
+		 */
+		for (i = 0; i < 6; i++) {
+			r = &dev->resource[i];
+			if (r->flags & IORESOURCE_UNSET) {
+				pr_err("PCI: Device %s not available "
+				       "because of resource collisions\n",
+				       pci_name(dev));
+				return -EINVAL;
+			}
+			if (r->flags & IORESOURCE_IO)
+				cmd |= PCI_COMMAND_IO;
+			if (r->flags & IORESOURCE_MEM)
+				cmd |= PCI_COMMAND_MEMORY;
+		}
+	}
+
+	/*
+	 * We only write the command if it changed.
+	 */
+	if (cmd != old_cmd)
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	return 0;
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
+{
+	unsigned long start = pci_resource_start(dev, bar);
+	unsigned long len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (!len)
+		return NULL;
+	if (max && len > max)
+		len = max;
+
+	if (!(flags & IORESOURCE_MEM)) {
+		pr_info("PCI: Trying to map invalid resource %#lx\n", flags);
+		start = 0;
+	}
+
+	return (void __iomem *)start;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+
+/****************************************************************
+ *
+ * Tile PCI config space read/write routines
+ *
+ ****************************************************************/
+
+/*
+ * These are the normal read and write ops
+ * These are expanded with macros from  pci_bus_read_config_byte() etc.
+ *
+ * devfn is the combined PCI slot & function.
+ *
+ * offset is in bytes, from the start of config space for the
+ * specified bus & slot.
+ */
+
+static int __devinit tile_cfg_read(struct pci_bus *bus,
+				   unsigned int devfn,
+				   int offset,
+				   int size,
+				   u32 *val)
+{
+	struct pci_controller *controller = bus->sysdata;
+	int busnum = bus->number & 0xff;
+	int slot = (devfn >> 3) & 0x1f;
+	int function = devfn & 0x7;
+	u32 addr;
+	int config_mode = 1;
+
+	/*
+	 * There is no bridge between the Tile and bus 0, so we
+	 * use config0 to talk to bus 0.
+	 *
+	 * If we're talking to a bus other than zero then we
+	 * must have found a bridge.
+	 */
+	if (busnum == 0) {
+		/*
+		 * We fake an empty slot for (busnum == 0) && (slot > 0),
+		 * since there is only one slot on bus 0.
+		 */
+		if (slot) {
+			*val = 0xFFFFFFFF;
+			return 0;
+		}
+		config_mode = 0;
+	}
+
+	addr = busnum << 20;		/* Bus in 27:20 */
+	addr |= slot << 15;		/* Slot (device) in 19:15 */
+	addr |= function << 12;		/* Function is in 14:12 */
+	addr |= (offset & 0xFFF);	/* byte address in 0:11 */
+
+	return hv_dev_pread(controller->hv_cfg_fd[config_mode], 0,
+			    (HV_VirtAddr)(val), size, addr);
+}
+
+
+/*
+ * See tile_cfg_read() for relevent comments.
+ * Note that "val" is the value to write, not a pointer to that value.
+ */
+static int __devinit tile_cfg_write(struct pci_bus *bus,
+				    unsigned int devfn,
+				    int offset,
+				    int size,
+				    u32 val)
+{
+	struct pci_controller *controller = bus->sysdata;
+	int busnum = bus->number & 0xff;
+	int slot = (devfn >> 3) & 0x1f;
+	int function = devfn & 0x7;
+	u32 addr;
+	int config_mode = 1;
+	HV_VirtAddr valp = (HV_VirtAddr)&val;
+
+	/*
+	 * For bus 0 slot 0 we use config 0 accesses.
+	 */
+	if (busnum == 0) {
+		/*
+		 * We fake an empty slot for (busnum == 0) && (slot > 0),
+		 * since there is only one slot on bus 0.
+		 */
+		if (slot)
+			return 0;
+		config_mode = 0;
+	}
+
+	addr = busnum << 20;		/* Bus in 27:20 */
+	addr |= slot << 15;		/* Slot (device) in 19:15 */
+	addr |= function << 12;		/* Function is in 14:12 */
+	addr |= (offset & 0xFFF);	/* byte address in 0:11 */
+
+#ifdef __BIG_ENDIAN
+	/* Point to the correct part of the 32-bit "val". */
+	valp += 4 - size;
+#endif
+
+	return hv_dev_pwrite(controller->hv_cfg_fd[config_mode], 0,
+			     valp, size, addr);
+}
+
+
+static struct pci_ops tile_cfg_ops = {
+	.read =         tile_cfg_read,
+	.write =        tile_cfg_write,
+};
+
+
+/*
+ * In the following, each PCI controller's mem_resources[1]
+ * represents its (non-prefetchable) PCI memory resource.
+ * mem_resources[0] and mem_resources[2] refer to its PCI I/O and
+ * prefetchable PCI memory resources, respectively.
+ * For more details, see pci_setup_bridge() in setup-bus.c.
+ * By comparing the target PCI memory address against the
+ * end address of controller 0, we can determine the controller
+ * that should accept the PCI memory access.
+ */
+#define TILE_READ(size, type)						\
+type _tile_read##size(unsigned long addr)				\
+{									\
+	type val;							\
+	int idx = 0;							\
+	if (addr > controllers[0].mem_resources[1].end &&		\
+	    addr > controllers[0].mem_resources[2].end)			\
+		idx = 1;                                                \
+	if (hv_dev_pread(controllers[idx].hv_mem_fd, 0,			\
+			 (HV_VirtAddr)(&val), sizeof(type), addr))	\
+		pr_err("PCI: read %zd bytes at 0x%lX failed\n",		\
+		       sizeof(type), addr);				\
+	return val;							\
+}									\
+EXPORT_SYMBOL(_tile_read##size)
+
+TILE_READ(b, u8);
+TILE_READ(w, u16);
+TILE_READ(l, u32);
+TILE_READ(q, u64);
+
+#define TILE_WRITE(size, type)						\
+void _tile_write##size(type val, unsigned long addr)			\
+{									\
+	int idx = 0;							\
+	if (addr > controllers[0].mem_resources[1].end &&		\
+	    addr > controllers[0].mem_resources[2].end)			\
+		idx = 1;                                                \
+	if (hv_dev_pwrite(controllers[idx].hv_mem_fd, 0,		\
+			  (HV_VirtAddr)(&val), sizeof(type), addr))	\
+		pr_err("PCI: write %zd bytes at 0x%lX failed\n",	\
+		       sizeof(type), addr);				\
+}									\
+EXPORT_SYMBOL(_tile_write##size)
+
+TILE_WRITE(b, u8);
+TILE_WRITE(w, u16);
+TILE_WRITE(l, u32);
+TILE_WRITE(q, u64);
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 84c2911..e90eb53 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -212,11 +212,19 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	childregs->sp = sp;  /* override with new user stack pointer */
 
 	/*
+	 * If CLONE_SETTLS is set, set "tp" in the new task to "r4",
+	 * which is passed in as arg #5 to sys_clone().
+	 */
+	if (clone_flags & CLONE_SETTLS)
+		childregs->tp = regs->regs[4];
+
+	/*
 	 * Copy the callee-saved registers from the passed pt_regs struct
 	 * into the context-switch callee-saved registers area.
-	 * We have to restore the callee-saved registers since we may
-	 * be cloning a userspace task with userspace register state,
-	 * and we won't be unwinding the same kernel frames to restore them.
+	 * This way when we start the interrupt-return sequence, the
+	 * callee-save registers will be correctly in registers, which
+	 * is how we assume the compiler leaves them as we start doing
+	 * the normal return-from-interrupt path after calling C code.
 	 * Zero out the C ABI save area to mark the top of the stack.
 	 */
 	ksp = (unsigned long) childregs;
@@ -304,15 +312,25 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
 /* Allow user processes to access the DMA SPRs */
 void grant_dma_mpls(void)
 {
+#if CONFIG_KERNEL_PL == 2
+	__insn_mtspr(SPR_MPL_DMA_CPL_SET_1, 1);
+	__insn_mtspr(SPR_MPL_DMA_NOTIFY_SET_1, 1);
+#else
 	__insn_mtspr(SPR_MPL_DMA_CPL_SET_0, 1);
 	__insn_mtspr(SPR_MPL_DMA_NOTIFY_SET_0, 1);
+#endif
 }
 
 /* Forbid user processes from accessing the DMA SPRs */
 void restrict_dma_mpls(void)
 {
+#if CONFIG_KERNEL_PL == 2
+	__insn_mtspr(SPR_MPL_DMA_CPL_SET_2, 1);
+	__insn_mtspr(SPR_MPL_DMA_NOTIFY_SET_2, 1);
+#else
 	__insn_mtspr(SPR_MPL_DMA_CPL_SET_1, 1);
 	__insn_mtspr(SPR_MPL_DMA_NOTIFY_SET_1, 1);
+#endif
 }
 
 /* Pause the DMA engine, then save off its state registers. */
@@ -523,19 +541,15 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
 	 * Switch kernel SP, PC, and callee-saved registers.
 	 * In the context of the new task, return the old task pointer
 	 * (i.e. the task that actually called __switch_to).
-	 * Pass the value to use for SYSTEM_SAVE_1_0 when we reset our sp.
+	 * Pass the value to use for SYSTEM_SAVE_K_0 when we reset our sp.
 	 */
 	return __switch_to(prev, next, next_current_ksp0(next));
 }
 
-long _sys_fork(struct pt_regs *regs)
-{
-	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
-long _sys_clone(unsigned long clone_flags, unsigned long newsp,
-		void __user *parent_tidptr, void __user *child_tidptr,
-		struct pt_regs *regs)
+/* Note there is an implicit fifth argument if (clone_flags & CLONE_SETTLS). */
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+		void __user *, parent_tidptr, void __user *, child_tidptr,
+		struct pt_regs *, regs)
 {
 	if (!newsp)
 		newsp = regs->sp;
@@ -543,18 +557,13 @@ long _sys_clone(unsigned long clone_flags, unsigned long newsp,
 		       parent_tidptr, child_tidptr);
 }
 
-long _sys_vfork(struct pt_regs *regs)
-{
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp,
-		       regs, 0, NULL, NULL);
-}
-
 /*
  * sys_execve() executes a new program.
  */
-long _sys_execve(const char __user *path,
-		 const char __user *const __user *argv,
-		 const char __user *const __user *envp, struct pt_regs *regs)
+SYSCALL_DEFINE4(execve, const char __user *, path,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp,
+		struct pt_regs *, regs)
 {
 	long error;
 	char *filename;
@@ -570,9 +579,10 @@ out:
 }
 
 #ifdef CONFIG_COMPAT
-long _compat_sys_execve(const char __user *path,
-			const compat_uptr_t __user *argv,
-			const compat_uptr_t __user *envp, struct pt_regs *regs)
+long compat_sys_execve(const char __user *path,
+		       const compat_uptr_t __user *argv,
+		       const compat_uptr_t __user *envp,
+		       struct pt_regs *regs)
 {
 	long error;
 	char *filename;
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index 7161bd0..e92e405 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -32,25 +32,6 @@ void user_disable_single_step(struct task_struct *child)
 }
 
 /*
- * This routine will put a word on the process's privileged stack.
- */
-static void putreg(struct task_struct *task,
-		   unsigned long addr, unsigned long value)
-{
-	unsigned int regno = addr / sizeof(unsigned long);
-	struct pt_regs *childregs = task_pt_regs(task);
-	childregs->regs[regno] = value;
-	childregs->flags |= PT_FLAGS_RESTORE_REGS;
-}
-
-static unsigned long getreg(struct task_struct *task, unsigned long addr)
-{
-	unsigned int regno = addr / sizeof(unsigned long);
-	struct pt_regs *childregs = task_pt_regs(task);
-	return childregs->regs[regno];
-}
-
-/*
  * Called by kernel/ptrace.c when detaching..
  */
 void ptrace_disable(struct task_struct *child)
@@ -64,61 +45,80 @@ void ptrace_disable(struct task_struct *child)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 }
 
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request,
+		 unsigned long addr, unsigned long data)
 {
-	unsigned long __user *datap;
+	unsigned long __user *datap = (long __user __force *)data;
 	unsigned long tmp;
-	int i;
 	long ret = -EIO;
-
-#ifdef CONFIG_COMPAT
-	if (task_thread_info(current)->status & TS_COMPAT)
-		data = (u32)data;
-	if (task_thread_info(child)->status & TS_COMPAT)
-		addr = (u32)addr;
-#endif
-	datap = (unsigned long __user __force *)data;
+	char *childreg;
+	struct pt_regs copyregs;
+	int ex1_offset;
 
 	switch (request) {
 
 	case PTRACE_PEEKUSR:  /* Read register from pt_regs. */
-		if (addr & (sizeof(data)-1))
-			break;
-		if (addr < 0 || addr >= PTREGS_SIZE)
+		if (addr >= PTREGS_SIZE)
 			break;
-		tmp = getreg(child, addr);   /* Read register */
-		ret = put_user(tmp, datap);
+		childreg = (char *)task_pt_regs(child) + addr;
+#ifdef CONFIG_COMPAT
+		if (is_compat_task()) {
+			if (addr & (sizeof(compat_long_t)-1))
+				break;
+			ret = put_user(*(compat_long_t *)childreg,
+				       (compat_long_t __user *)datap);
+		} else
+#endif
+		{
+			if (addr & (sizeof(long)-1))
+				break;
+			ret = put_user(*(long *)childreg, datap);
+		}
 		break;
 
 	case PTRACE_POKEUSR:  /* Write register in pt_regs. */
-		if (addr & (sizeof(data)-1))
+		if (addr >= PTREGS_SIZE)
 			break;
-		if (addr < 0 || addr >= PTREGS_SIZE)
-			break;
-		putreg(child, addr, data);   /* Write register */
+		childreg = (char *)task_pt_regs(child) + addr;
+
+		/* Guard against overwrites of the privilege level. */
+		ex1_offset = PTREGS_OFFSET_EX1;
+#if defined(CONFIG_COMPAT) && defined(__BIG_ENDIAN)
+		if (is_compat_task())   /* point at low word */
+			ex1_offset += sizeof(compat_long_t);
+#endif
+		if (addr == ex1_offset)
+			data = PL_ICS_EX1(USER_PL, EX1_ICS(data));
+
+#ifdef CONFIG_COMPAT
+		if (is_compat_task()) {
+			if (addr & (sizeof(compat_long_t)-1))
+				break;
+			*(compat_long_t *)childreg = data;
+		} else
+#endif
+		{
+			if (addr & (sizeof(long)-1))
+				break;
+			*(long *)childreg = data;
+		}
 		ret = 0;
 		break;
 
 	case PTRACE_GETREGS:  /* Get all registers from the child. */
-		if (!access_ok(VERIFY_WRITE, datap, PTREGS_SIZE))
-			break;
-		for (i = 0; i < PTREGS_SIZE; i += sizeof(long)) {
-			ret = __put_user(getreg(child, i), datap);
-			if (ret != 0)
-				break;
-			datap++;
+		if (copy_to_user(datap, task_pt_regs(child),
+				 sizeof(struct pt_regs)) == 0) {
+			ret = 0;
 		}
 		break;
 
 	case PTRACE_SETREGS:  /* Set all registers in the child. */
-		if (!access_ok(VERIFY_READ, datap, PTREGS_SIZE))
-			break;
-		for (i = 0; i < PTREGS_SIZE; i += sizeof(long)) {
-			ret = __get_user(tmp, datap);
-			if (ret != 0)
-				break;
-			putreg(child, i, tmp);
-			datap++;
+		if (copy_from_user(&copyregs, datap,
+				   sizeof(struct pt_regs)) == 0) {
+			copyregs.ex1 =
+				PL_ICS_EX1(USER_PL, EX1_ICS(copyregs.ex1));
+			*task_pt_regs(child) = copyregs;
+			ret = 0;
 		}
 		break;
 
diff --git a/arch/tile/kernel/reboot.c b/arch/tile/kernel/reboot.c
index acd86d2..baa3d90 100644
--- a/arch/tile/kernel/reboot.c
+++ b/arch/tile/kernel/reboot.c
@@ -27,7 +27,7 @@
 void machine_halt(void)
 {
 	warn_early_printk();
-	raw_local_irq_disable_all();
+	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_halt();
 }
@@ -35,14 +35,14 @@ void machine_halt(void)
 void machine_power_off(void)
 {
 	warn_early_printk();
-	raw_local_irq_disable_all();
+	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_power_off();
 }
 
 void machine_restart(char *cmd)
 {
-	raw_local_irq_disable_all();
+	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_restart((HV_VirtAddr) "vmlinux", (HV_VirtAddr) cmd);
 }
diff --git a/arch/tile/kernel/regs_32.S b/arch/tile/kernel/regs_32.S
index e88d6e12..caa1310 100644
--- a/arch/tile/kernel/regs_32.S
+++ b/arch/tile/kernel/regs_32.S
@@ -85,7 +85,7 @@ STD_ENTRY_SECTION(__switch_to, .sched.text)
 	{
 	  /* Update sp and ksp0 simultaneously to avoid backtracer warnings. */
 	  move sp, r13
-	  mtspr SYSTEM_SAVE_1_0, r2
+	  mtspr SPR_SYSTEM_SAVE_K_0, r2
 	}
 	FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
 .L__switch_to_pc:
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index e7d54c7..f185736 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -30,8 +30,6 @@
 #include <linux/timex.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
-#include <asm/sections.h>
-#include <asm/cacheflush.h>
 #include <asm/cacheflush.h>
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
@@ -187,11 +185,11 @@ early_param("vmalloc", parse_vmalloc);
 
 #ifdef CONFIG_HIGHMEM
 /*
- * Determine for each controller where its lowmem is mapped and how
- * much of it is mapped there.  On controller zero, the first few
- * megabytes are mapped at 0xfd000000 as code, so in principle we
- * could start our data mappings higher up, but for now we don't
- * bother, to avoid additional confusion.
+ * Determine for each controller where its lowmem is mapped and how much of
+ * it is mapped there.  On controller zero, the first few megabytes are
+ * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * start our data mappings higher up, but for now we don't bother, to avoid
+ * additional confusion.
  *
  * One question is whether, on systems with more than 768 Mb and
  * controllers of different sizes, to map in a proportionate amount of
@@ -311,7 +309,7 @@ static void __init setup_memory(void)
 #endif
 
 	/* We are using a char to hold the cpu_2_node[] mapping */
-	BUG_ON(MAX_NUMNODES > 127);
+	BUILD_BUG_ON(MAX_NUMNODES > 127);
 
 	/* Discover the ranges of memory available to us */
 	for (i = 0; ; ++i) {
@@ -842,7 +840,7 @@ static int __init topology_init(void)
 	for_each_online_node(i)
 		register_one_node(i);
 
-	for_each_present_cpu(i)
+	for (i = 0; i < smp_height * smp_width; ++i)
 		register_cpu(&cpu_devices[i], i);
 
 	return 0;
@@ -870,11 +868,14 @@ void __cpuinit setup_cpu(int boot)
 
 	/* Allow asynchronous TLB interrupts. */
 #if CHIP_HAS_TILE_DMA()
-	raw_local_irq_unmask(INT_DMATLB_MISS);
-	raw_local_irq_unmask(INT_DMATLB_ACCESS);
+	arch_local_irq_unmask(INT_DMATLB_MISS);
+	arch_local_irq_unmask(INT_DMATLB_ACCESS);
 #endif
 #if CHIP_HAS_SN_PROC()
-	raw_local_irq_unmask(INT_SNITLB_MISS);
+	arch_local_irq_unmask(INT_SNITLB_MISS);
+#endif
+#ifdef __tilegx__
+	arch_local_irq_unmask(INT_SINGLE_STEP_K);
 #endif
 
 	/*
@@ -893,11 +894,12 @@ void __cpuinit setup_cpu(int boot)
 #endif
 
 	/*
-	 * Set the MPL for interrupt control 0 to user level.
-	 * This includes access to the SYSTEM_SAVE and EX_CONTEXT SPRs,
-	 * as well as the PL 0 interrupt mask.
+	 * Set the MPL for interrupt control 0 & 1 to the corresponding
+	 * values.  This includes access to the SYSTEM_SAVE and EX_CONTEXT
+	 * SPRs, as well as the interrupt mask.
 	 */
 	__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+	__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
 
 	/* Initialize IRQ support for this cpu. */
 	setup_irq_regs();
@@ -1033,7 +1035,7 @@ static void __init validate_va(void)
 	 * In addition, make sure we CAN'T use the end of memory, since
 	 * we use the last chunk of each pgd for the pgd_list.
 	 */
-	int i, fc_fd_ok = 0;
+	int i, user_kernel_ok = 0;
 	unsigned long max_va = 0;
 	unsigned long list_va =
 		((PGD_LIST_OFFSET / sizeof(pgd_t)) << PGDIR_SHIFT);
@@ -1044,13 +1046,13 @@ static void __init validate_va(void)
 			break;
 		if (range.start <= MEM_USER_INTRPT &&
 		    range.start + range.size >= MEM_HV_INTRPT)
-			fc_fd_ok = 1;
+			user_kernel_ok = 1;
 		if (range.start == 0)
 			max_va = range.size;
 		BUG_ON(range.start + range.size > list_va);
 	}
-	if (!fc_fd_ok)
-		early_panic("Hypervisor not configured for VAs 0xfc/0xfd\n");
+	if (!user_kernel_ok)
+		early_panic("Hypervisor not configured for user/kernel VAs\n");
 	if (max_va == 0)
 		early_panic("Hypervisor not configured for low VAs\n");
 	if (max_va < KERNEL_HIGH_VADDR)
@@ -1334,6 +1336,10 @@ static void __init pcpu_fc_populate_pte(unsigned long addr)
 	pte_t *pte;
 
 	BUG_ON(pgd_addr_invalid(addr));
+	if (addr < VMALLOC_START || addr >= VMALLOC_END)
+		panic("PCPU addr %#lx outside vmalloc range %#lx..%#lx;"
+		      " try increasing CONFIG_VMALLOC_RESERVE\n",
+		      addr, VMALLOC_START, VMALLOC_END);
 
 	pgd = swapper_pg_dir + pgd_index(addr);
 	pud = pud_offset(pgd, addr);
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index ce183aa..1260321 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/errno.h>
@@ -41,8 +40,8 @@
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
 
-long _sys_sigaltstack(const stack_t __user *uss,
-		      stack_t __user *uoss, struct pt_regs *regs)
+SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss,
+		stack_t __user *, uoss, struct pt_regs *, regs)
 {
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
@@ -53,7 +52,7 @@ long _sys_sigaltstack(const stack_t __user *uss,
  */
 
 int restore_sigcontext(struct pt_regs *regs,
-		       struct sigcontext __user *sc, long *pr0)
+		       struct sigcontext __user *sc)
 {
 	int err = 0;
 	int i;
@@ -71,19 +70,20 @@ int restore_sigcontext(struct pt_regs *regs,
 	for (i = 0; i < sizeof(struct pt_regs)/sizeof(long); ++i)
 		err |= __get_user(regs->regs[i], &sc->gregs[i]);
 
+	/* Ensure that the PL is always set to USER_PL. */
+	regs->ex1 = PL_ICS_EX1(USER_PL, EX1_ICS(regs->ex1));
+
 	regs->faultnum = INT_SWINT_1_SIGRETURN;
 
-	err |= __get_user(*pr0, &sc->gregs[0]);
 	return err;
 }
 
-/* sigreturn() returns long since it restores r0 in the interrupted code. */
-long _sys_rt_sigreturn(struct pt_regs *regs)
+/* The assembly shim for this function arranges to ignore the return value. */
+SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs)
 {
 	struct rt_sigframe __user *frame =
 		(struct rt_sigframe __user *)(regs->sp);
 	sigset_t set;
-	long r0;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -96,13 +96,13 @@ long _sys_rt_sigreturn(struct pt_regs *regs)
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0))
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
 	if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
 		goto badframe;
 
-	return r0;
+	return 0;
 
 badframe:
 	force_sig(SIGSEGV, current);
@@ -330,7 +330,7 @@ void do_signal(struct pt_regs *regs)
 			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 		}
 
-		return;
+		goto done;
 	}
 
 	/* Did we come from a system call? */
@@ -358,4 +358,8 @@ void do_signal(struct pt_regs *regs)
 		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
 	}
+
+done:
+	/* Avoid double syscall restart if there are nested signals. */
+	regs->faultnum = INT_SWINT_1_SIGRETURN;
 }
diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c
index 5ec4b9c..1eb3b39 100644
--- a/arch/tile/kernel/single_step.c
+++ b/arch/tile/kernel/single_step.c
@@ -15,7 +15,7 @@
  * Derived from iLib's single-stepping code.
  */
 
-#ifndef __tilegx__   /* No support for single-step yet. */
+#ifndef __tilegx__   /* Hardware support for single step unavailable. */
 
 /* These functions are only used on the TILE platform */
 #include <linux/slab.h>
@@ -660,4 +660,75 @@ void single_step_once(struct pt_regs *regs)
 		regs->pc += 8;
 }
 
+#else
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <arch/spr_def.h>
+
+static DEFINE_PER_CPU(unsigned long, ss_saved_pc);
+
+
+/*
+ * Called directly on the occasion of an interrupt.
+ *
+ * If the process doesn't have single step set, then we use this as an
+ * opportunity to turn single step off.
+ *
+ * It has been mentioned that we could conditionally turn off single stepping
+ * on each entry into the kernel and rely on single_step_once to turn it
+ * on for the processes that matter (as we already do), but this
+ * implementation is somewhat more efficient in that we muck with registers
+ * once on a bum interrupt rather than on every entry into the kernel.
+ *
+ * If SINGLE_STEP_CONTROL_K has CANCELED set, then an interrupt occurred,
+ * so we have to run through this process again before we can say that an
+ * instruction has executed.
+ *
+ * swint will set CANCELED, but it's a legitimate instruction.  Fortunately
+ * it changes the PC.  If it hasn't changed, then we know that the interrupt
+ * wasn't generated by swint and we'll need to run this process again before
+ * we can say an instruction has executed.
+ *
+ * If either CANCELED == 0 or the PC's changed, we send out SIGTRAPs and get
+ * on with our lives.
+ */
+
+void gx_singlestep_handle(struct pt_regs *regs, int fault_num)
+{
+	unsigned long *ss_pc = &__get_cpu_var(ss_saved_pc);
+	struct thread_info *info = (void *)current_thread_info();
+	int is_single_step = test_ti_thread_flag(info, TIF_SINGLESTEP);
+	unsigned long control = __insn_mfspr(SPR_SINGLE_STEP_CONTROL_K);
+
+	if (is_single_step == 0) {
+		__insn_mtspr(SPR_SINGLE_STEP_EN_K_K, 0);
+
+	} else if ((*ss_pc != regs->pc) ||
+		   (!(control & SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK))) {
+
+		ptrace_notify(SIGTRAP);
+		control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK;
+		control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK;
+		__insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control);
+	}
+}
+
+
+/*
+ * Called from need_singlestep.  Set up the control registers and the enable
+ * register, then return back.
+ */
+
+void single_step_once(struct pt_regs *regs)
+{
+	unsigned long *ss_pc = &__get_cpu_var(ss_saved_pc);
+	unsigned long control = __insn_mfspr(SPR_SINGLE_STEP_CONTROL_K);
+
+	*ss_pc = regs->pc;
+	control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK;
+	control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK;
+	__insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control);
+	__insn_mtspr(SPR_SINGLE_STEP_EN_K_K, 1 << USER_PL);
+}
+
 #endif /* !__tilegx__ */
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 1cb5ec7..9575b37 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -115,7 +115,7 @@ static void smp_start_cpu_interrupt(void)
 static void smp_stop_cpu_interrupt(void)
 {
 	set_cpu_online(smp_processor_id(), 0);
-	raw_local_irq_disable_all();
+	arch_local_irq_disable_all();
 	for (;;)
 		asm("nap");
 }
@@ -212,7 +212,7 @@ void __init ipi_init(void)
 
 		tile.x = cpu_x(cpu);
 		tile.y = cpu_y(cpu);
-		if (hv_get_ipi_pte(tile, 1, &pte) != 0)
+		if (hv_get_ipi_pte(tile, KERNEL_PL, &pte) != 0)
 			panic("Failed to initialize IPI for cpu %d\n", cpu);
 
 		offset = hv_pte_get_pfn(pte) << PAGE_SHIFT;
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 74d62d0..b949edc 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -18,7 +18,6 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
-#include <linux/smp_lock.h>
 #include <linux/bootmem.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index ea2e0ce..0d54106b 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -30,6 +30,10 @@
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 
+#define KBT_ONGOING	0  /* Backtrace still ongoing */
+#define KBT_DONE	1  /* Backtrace cleanly completed */
+#define KBT_RUNNING	2  /* Can't run backtrace on a running task */
+#define KBT_LOOP	3  /* Backtrace entered a loop */
 
 /* Is address on the specified kernel stack? */
 static int in_kernel_stack(struct KBacktraceIterator *kbt, VirtualAddress sp)
@@ -207,11 +211,11 @@ static int KBacktraceIterator_next_item_inclusive(
 	for (;;) {
 		do {
 			if (!KBacktraceIterator_is_sigreturn(kbt))
-				return 1;
+				return KBT_ONGOING;
 		} while (backtrace_next(&kbt->it));
 
 		if (!KBacktraceIterator_restart(kbt))
-			return 0;
+			return KBT_DONE;
 	}
 }
 
@@ -264,7 +268,7 @@ void KBacktraceIterator_init(struct KBacktraceIterator *kbt,
 	kbt->pgtable = NULL;
 	kbt->verbose = 0;   /* override in caller if desired */
 	kbt->profile = 0;   /* override in caller if desired */
-	kbt->end = 0;
+	kbt->end = KBT_ONGOING;
 	kbt->new_context = 0;
 	if (is_current) {
 		HV_PhysAddr pgdir_pa = hv_inquire_context().page_table;
@@ -290,7 +294,7 @@ void KBacktraceIterator_init(struct KBacktraceIterator *kbt,
 	if (regs == NULL) {
 		if (is_current || t->state == TASK_RUNNING) {
 			/* Can't do this; we need registers */
-			kbt->end = 1;
+			kbt->end = KBT_RUNNING;
 			return;
 		}
 		pc = get_switch_to_pc();
@@ -305,26 +309,29 @@ void KBacktraceIterator_init(struct KBacktraceIterator *kbt,
 	}
 
 	backtrace_init(&kbt->it, read_memory_func, kbt, pc, lr, sp, r52);
-	kbt->end = !KBacktraceIterator_next_item_inclusive(kbt);
+	kbt->end = KBacktraceIterator_next_item_inclusive(kbt);
 }
 EXPORT_SYMBOL(KBacktraceIterator_init);
 
 int KBacktraceIterator_end(struct KBacktraceIterator *kbt)
 {
-	return kbt->end;
+	return kbt->end != KBT_ONGOING;
 }
 EXPORT_SYMBOL(KBacktraceIterator_end);
 
 void KBacktraceIterator_next(struct KBacktraceIterator *kbt)
 {
+	VirtualAddress old_pc = kbt->it.pc, old_sp = kbt->it.sp;
 	kbt->new_context = 0;
-	if (!backtrace_next(&kbt->it) &&
-	    !KBacktraceIterator_restart(kbt)) {
-			kbt->end = 1;
-			return;
-		}
-
-	kbt->end = !KBacktraceIterator_next_item_inclusive(kbt);
+	if (!backtrace_next(&kbt->it) && !KBacktraceIterator_restart(kbt)) {
+		kbt->end = KBT_DONE;
+		return;
+	}
+	kbt->end = KBacktraceIterator_next_item_inclusive(kbt);
+	if (old_pc == kbt->it.pc && old_sp == kbt->it.sp) {
+		/* Trapped in a loop; give up. */
+		kbt->end = KBT_LOOP;
+	}
 }
 EXPORT_SYMBOL(KBacktraceIterator_next);
 
@@ -387,6 +394,8 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 			break;
 		}
 	}
+	if (kbt->end == KBT_LOOP)
+		pr_err("Stack dump stopped; next frame identical to this one\n");
 	if (headers)
 		pr_err("Stack dump complete\n");
 }
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index f0f87ea..e2187d2 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -20,7 +20,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/file.h>
@@ -110,6 +109,15 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 #define sys_sync_file_range sys_sync_file_range2
 #endif
 
+/* Call the trampolines to manage pt_regs where necessary. */
+#define sys_execve _sys_execve
+#define sys_sigaltstack _sys_sigaltstack
+#define sys_rt_sigreturn _sys_rt_sigreturn
+#define sys_clone _sys_clone
+#ifndef __tilegx__
+#define sys_cmpxchg_badaddr _sys_cmpxchg_badaddr
+#endif
+
 /*
  * Note that we can't include <linux/unistd.h> here since the header
  * guard will defeat us; <asm/unistd.h> checks for __SYSCALL as well.
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 6bed820..f2e156e 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -132,7 +132,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
 {
 	BUG_ON(ticks > MAX_TICK);
 	__insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
-	raw_local_irq_unmask_now(INT_TILE_TIMER);
+	arch_local_irq_unmask_now(INT_TILE_TIMER);
 	return 0;
 }
 
@@ -143,7 +143,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
 static void tile_timer_set_mode(enum clock_event_mode mode,
 				struct clock_event_device *evt)
 {
-	raw_local_irq_mask_now(INT_TILE_TIMER);
+	arch_local_irq_mask_now(INT_TILE_TIMER);
 }
 
 /*
@@ -172,7 +172,7 @@ void __cpuinit setup_tile_timer(void)
 	evt->cpumask = cpumask_of(smp_processor_id());
 
 	/* Start out with timer not firing. */
-	raw_local_irq_mask_now(INT_TILE_TIMER);
+	arch_local_irq_mask_now(INT_TILE_TIMER);
 
 	/* Register tile timer. */
 	clockevents_register_device(evt);
@@ -188,7 +188,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
 	 * Mask the timer interrupt here, since we are a oneshot timer
 	 * and there are now by definition no events pending.
 	 */
-	raw_local_irq_mask(INT_TILE_TIMER);
+	arch_local_irq_mask(INT_TILE_TIMER);
 
 	/* Track time spent here in an interrupt context */
 	irq_enter();
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 0f362dc..5474fc2 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -260,7 +260,7 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		address = regs->pc;
 		break;
 	case INT_UNALIGN_DATA:
-#ifndef __tilegx__  /* FIXME: GX: no single-step yet */
+#ifndef __tilegx__  /* Emulated support for single step debugging */
 		if (unaligned_fixup >= 0) {
 			struct single_step_state *state =
 				current_thread_info()->step_state;
@@ -278,7 +278,7 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 	case INT_DOUBLE_FAULT:
 		/*
 		 * For double fault, "reason" is actually passed as
-		 * SYSTEM_SAVE_1_2, the hypervisor's double-fault info, so
+		 * SYSTEM_SAVE_K_2, the hypervisor's double-fault info, so
 		 * we can provide the original fault number rather than
 		 * the uninteresting "INT_DOUBLE_FAULT" so the user can
 		 * learn what actually struck while PL0 ICS was set.
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
new file mode 100644
index 0000000..b88f9c0
--- /dev/null
+++ b/arch/tile/kvm/Kconfig
@@ -0,0 +1,38 @@
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+	bool "Virtualization"
+	---help---
+	  Say Y here to get to see options for using your Linux host to run
+	  other operating systems inside virtual machines (guests).
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled.
+
+if VIRTUALIZATION
+
+config KVM
+	tristate "Kernel-based Virtual Machine (KVM) support"
+	depends on HAVE_KVM && MODULES && EXPERIMENTAL
+	select PREEMPT_NOTIFIERS
+	select ANON_INODES
+	---help---
+	  Support hosting paravirtualized guest machines.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  To compile this as a module, choose M here: the module
+	  will be called kvm.
+
+	  If unsure, say N.
+
+source drivers/vhost/Kconfig
+source drivers/virtio/Kconfig
+
+endif # VIRTUALIZATION
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 746dc81..93122d5 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -3,8 +3,8 @@
 #
 
 lib-y = cacheflush.o checksum.o cpumask.o delay.o \
-	mb_incoherent.o uaccess.o \
-	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
+	mb_incoherent.o uaccess.o memmove.o \
+	memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
 	strchr_$(BITS).o strlen_$(BITS).o
 
 ifeq ($(CONFIG_TILEGX),y)
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 8040b42..7a5cc70 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -300,7 +300,7 @@ void __init __init_atomic_per_cpu(void)
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* Validate power-of-two and "bigger than cpus" assumption */
-	BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
 
 	/*
@@ -314,17 +314,17 @@ void __init __init_atomic_per_cpu(void)
 	BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0);
 
 	/* The locks must all fit on one page. */
-	BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
 
 	/*
 	 * We use the page offset of the atomic value's address as
 	 * an index into atomic_locks, excluding the low 3 bits.
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
-	BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
+	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* The futex code makes this assumption, so we validate it here. */
-	BUG_ON(sizeof(atomic_t) != sizeof(int));
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 }
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index ce5dbf5..1509c55 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -45,6 +45,9 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);
 EXPORT_SYMBOL(__copy_in_user_inatomic);
 #endif
 
+/* arch/tile/lib/mb_incoherent.S */
+EXPORT_SYMBOL(__mb_incoherent);
+
 /* hypervisor glue */
 #include <hv/hypervisor.h>
 EXPORT_SYMBOL(hv_dev_open);
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
index 6235283..cc3d9ba 100644
--- a/arch/tile/lib/memchr_32.c
+++ b/arch/tile/lib/memchr_32.c
@@ -18,12 +18,24 @@
 
 void *memchr(const void *s, int c, size_t n)
 {
+	const uint32_t *last_word_ptr;
+	const uint32_t *p;
+	const char *last_byte_ptr;
+	uintptr_t s_int;
+	uint32_t goal, before_mask, v, bits;
+	char *ret;
+
+	if (__builtin_expect(n == 0, 0)) {
+		/* Don't dereference any memory if the array is empty. */
+		return NULL;
+	}
+
 	/* Get an aligned pointer. */
-	const uintptr_t s_int = (uintptr_t) s;
-	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	s_int = (uintptr_t) s;
+	p = (const uint32_t *)(s_int & -4);
 
 	/* Create four copies of the byte for which we are looking. */
-	const uint32_t goal = 0x01010101 * (uint8_t) c;
+	goal = 0x01010101 * (uint8_t) c;
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
@@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)
 	 * Note that this shift count expression works because we know
 	 * shift counts are taken mod 32.
 	 */
-	const uint32_t before_mask = (1 << (s_int << 3)) - 1;
-	uint32_t v = (*p | before_mask) ^ (goal & before_mask);
+	before_mask = (1 << (s_int << 3)) - 1;
+	v = (*p | before_mask) ^ (goal & before_mask);
 
 	/* Compute the address of the last byte. */
-	const char *const last_byte_ptr = (const char *)s + n - 1;
+	last_byte_ptr = (const char *)s + n - 1;
 
 	/* Compute the address of the word containing the last byte. */
-	const uint32_t *const last_word_ptr =
-	    (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
-
-	uint32_t bits;
-	char *ret;
-
-	if (__builtin_expect(n == 0, 0)) {
-		/* Don't dereference any memory if the array is empty. */
-		return NULL;
-	}
+	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
 
 	while ((bits = __insn_seqb(v, goal)) == 0) {
 		if (__builtin_expect(p == last_word_ptr, 0)) {
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 30c3b7e..2a419a6 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -10,14 +10,16 @@
  *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
- *
- * This file shares the implementation of the userspace memcpy and
- * the kernel's memcpy, copy_to_user and copy_from_user.
  */
 
 #include <arch/chip.h>
 
 
+/*
+ * This file shares the implementation of the userspace memcpy and
+ * the kernel's memcpy, copy_to_user and copy_from_user.
+ */
+
 #include <linux/linkage.h>
 
 /* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
@@ -53,9 +55,9 @@
  */
 ENTRY(__copy_from_user_inatomic)
 .type __copy_from_user_inatomic, @function
-        FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
+	FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
 	  .text.memcpy_common, \
-          .Lend_memcpy_common - __copy_from_user_inatomic)
+	  .Lend_memcpy_common - __copy_from_user_inatomic)
 	{ movei r29, IS_COPY_FROM_USER; j memcpy_common }
 	.size __copy_from_user_inatomic, . - __copy_from_user_inatomic
 
@@ -64,7 +66,7 @@ ENTRY(__copy_from_user_inatomic)
  */
 ENTRY(__copy_from_user_zeroing)
 .type __copy_from_user_zeroing, @function
-        FEEDBACK_REENTER(__copy_from_user_inatomic)
+	FEEDBACK_REENTER(__copy_from_user_inatomic)
 	{ movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
 	.size __copy_from_user_zeroing, . - __copy_from_user_zeroing
 
@@ -74,13 +76,13 @@ ENTRY(__copy_from_user_zeroing)
  */
 ENTRY(__copy_to_user_inatomic)
 .type __copy_to_user_inatomic, @function
-        FEEDBACK_REENTER(__copy_from_user_inatomic)
+	FEEDBACK_REENTER(__copy_from_user_inatomic)
 	{ movei r29, IS_COPY_TO_USER; j memcpy_common }
 	.size __copy_to_user_inatomic, . - __copy_to_user_inatomic
 
 ENTRY(memcpy)
 .type memcpy, @function
-        FEEDBACK_REENTER(__copy_from_user_inatomic)
+	FEEDBACK_REENTER(__copy_from_user_inatomic)
 	{ movei r29, IS_MEMCPY }
 	.size memcpy, . - memcpy
 	/* Fall through */
@@ -157,35 +159,35 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
 #if CHIP_HAS_WH64()
-        /* No need to prefetch dst, we'll just do the wh64
-         * right before we copy a line.
+	/* No need to prefetch dst, we'll just do the wh64
+	 * right before we copy a line.
 	 */
 #endif
 
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bnzt zero, .; move r27, lr }
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bnzt zero, .; move r27, lr }
 EX:	{ lw r6, r3; addi r3, r3, 64 }
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bnzt zero, . }
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
 #if !CHIP_HAS_WH64()
-        /* Prefetch the dest */
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bnzt zero, . }
-        /* Use a real load to cause a TLB miss if necessary.  We aren't using
-         * r28, so this should be fine.
-         */
+	/* Prefetch the dest */
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bnzt zero, . }
+	/* Use a real load to cause a TLB miss if necessary.  We aren't using
+	 * r28, so this should be fine.
+	 */
 EX:	{ lw r28, r9; addi r9, r9, 64 }
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bnzt zero, . }
-        { prefetch r9; addi r9, r9, 64 }
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bnzt zero, . }
-        { prefetch r9; addi r9, r9, 64 }
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bnzt zero, . }
+	{ prefetch r9; addi r9, r9, 64 }
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bnzt zero, . }
+	{ prefetch r9; addi r9, r9, 64 }
 #endif
-        /* Intentionally stall for a few cycles to leave L2 cache alone. */
-        { bz zero, .Lbig_loop2 }
+	/* Intentionally stall for a few cycles to leave L2 cache alone. */
+	{ bz zero, .Lbig_loop2 }
 
 	/* On entry to this loop:
 	 * - r0 points to the start of dst line 0
@@ -197,7 +199,7 @@ EX:	{ lw r28, r9; addi r9, r9, 64 }
 	 *   to some "safe" recently loaded address.
 	 * - r5 contains *(r1 + 60)       [i.e. last word of source line 0]
 	 * - r6 contains *(r1 + 64 + 60)  [i.e. last word of source line 1]
-         * - r9 contains ((r0 + 63) & -64)
+	 * - r9 contains ((r0 + 63) & -64)
 	 *     [start of next dst cache line.]
 	 */
 
@@ -208,137 +210,137 @@ EX:	{ lw r28, r9; addi r9, r9, 64 }
 	/* Copy line 0, first stalling until r5 is ready. */
 EX:	{ move r12, r5; lw r16, r1 }
 	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
-        /* Prefetch several lines ahead. */
+	/* Prefetch several lines ahead. */
 EX:	{ lw r5, r3; addi r3, r3, 64 }
-        { jal .Lcopy_line }
+	{ jal .Lcopy_line }
 
 	/* Copy line 1, first stalling until r6 is ready. */
 EX:	{ move r12, r6; lw r16, r1 }
 	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
-        /* Prefetch several lines ahead. */
+	/* Prefetch several lines ahead. */
 EX:	{ lw r6, r3; addi r3, r3, 64 }
 	{ jal .Lcopy_line }
 
 	/* Copy line 2, first stalling until r7 is ready. */
 EX:	{ move r12, r7; lw r16, r1 }
 	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
-        /* Prefetch several lines ahead. */
+	/* Prefetch several lines ahead. */
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-        /* Use up a caches-busy cycle by jumping back to the top of the
-         * loop. Might as well get it out of the way now.
-         */
-        { j .Lbig_loop }
+	/* Use up a caches-busy cycle by jumping back to the top of the
+	 * loop. Might as well get it out of the way now.
+	 */
+	{ j .Lbig_loop }
 
 
 	/* On entry:
 	 * - r0 points to the destination line.
 	 * - r1 points to the source line.
-         * - r3 is the next prefetch address.
+	 * - r3 is the next prefetch address.
 	 * - r9 holds the last address used for wh64.
 	 * - r12 = WORD_15
-         * - r16 = WORD_0.
-         * - r17 == r1 + 16.
-         * - r27 holds saved lr to restore.
+	 * - r16 = WORD_0.
+	 * - r17 == r1 + 16.
+	 * - r27 holds saved lr to restore.
 	 *
 	 * On exit:
 	 * - r0 is incremented by 64.
 	 * - r1 is incremented by 64, unless that would point to a word
-         *   beyond the end of the source array, in which case it is redirected
-         *   to point to an arbitrary word already in the cache.
+	 *   beyond the end of the source array, in which case it is redirected
+	 *   to point to an arbitrary word already in the cache.
 	 * - r2 is decremented by 64.
-         * - r3 is unchanged, unless it points to a word beyond the
-         *   end of the source array, in which case it is redirected
-         *   to point to an arbitrary word already in the cache.
-         *   Redirecting is OK since if we are that close to the end
-         *   of the array we will not come back to this subroutine
-         *   and use the contents of the prefetched address.
+	 * - r3 is unchanged, unless it points to a word beyond the
+	 *   end of the source array, in which case it is redirected
+	 *   to point to an arbitrary word already in the cache.
+	 *   Redirecting is OK since if we are that close to the end
+	 *   of the array we will not come back to this subroutine
+	 *   and use the contents of the prefetched address.
 	 * - r4 is nonzero iff r2 >= 64.
-         * - r9 is incremented by 64, unless it points beyond the
-         *   end of the last full destination cache line, in which
-         *   case it is redirected to a "safe address" that can be
-         *   clobbered (sp - 64)
+	 * - r9 is incremented by 64, unless it points beyond the
+	 *   end of the last full destination cache line, in which
+	 *   case it is redirected to a "safe address" that can be
+	 *   clobbered (sp - 64)
 	 * - lr contains the value in r27.
 	 */
 
 /* r26 unused */
 
 .Lcopy_line:
-        /* TODO: when r3 goes past the end, we would like to redirect it
-         * to prefetch the last partial cache line (if any) just once, for the
-         * benefit of the final cleanup loop. But we don't want to
-         * prefetch that line more than once, or subsequent prefetches
-         * will go into the RTF. But then .Lbig_loop should unconditionally
-         * branch to top of loop to execute final prefetch, and its
-         * nop should become a conditional branch.
-         */
-
-        /* We need two non-memory cycles here to cover the resources
-         * used by the loads initiated by the caller.
-         */
-        { add r15, r1, r2 }
+	/* TODO: when r3 goes past the end, we would like to redirect it
+	 * to prefetch the last partial cache line (if any) just once, for the
+	 * benefit of the final cleanup loop. But we don't want to
+	 * prefetch that line more than once, or subsequent prefetches
+	 * will go into the RTF. But then .Lbig_loop should unconditionally
+	 * branch to top of loop to execute final prefetch, and its
+	 * nop should become a conditional branch.
+	 */
+
+	/* We need two non-memory cycles here to cover the resources
+	 * used by the loads initiated by the caller.
+	 */
+	{ add r15, r1, r2 }
 .Lcopy_line2:
-        { slt_u r13, r3, r15; addi r17, r1, 16 }
+	{ slt_u r13, r3, r15; addi r17, r1, 16 }
 
-        /* NOTE: this will stall for one cycle as L1 is busy. */
+	/* NOTE: this will stall for one cycle as L1 is busy. */
 
-        /* Fill second L1D line. */
+	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
 #if CHIP_HAS_WH64()
-        /* Prepare destination line for writing. */
+	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
 #else
-        /* Prefetch dest line */
+	/* Prefetch dest line */
 	{ prefetch r9; addi r9, r9, 64 }
 #endif
-        /* Load seven words that are L1D hits to cover wh64 L2 usage. */
+	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
-        /* Load the three remaining words from the last L1D line, which
-         * we know has already filled the L1D.
-         */
+	/* Load the three remaining words from the last L1D line, which
+	 * we know has already filled the L1D.
+	 */
 EX:	{ lw r4, r1;  addi r1, r1, 4;   addi r20, r1, 16 }   /* r4 = WORD_12 */
 EX:	{ lw r8, r1;  addi r1, r1, 4;   slt_u r13, r20, r15 }/* r8 = WORD_13 */
 EX:	{ lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 }  /* r11 = WORD_14 */
 
-        /* Load the three remaining words from the first L1D line, first
-         * stalling until it has filled by "looking at" r16.
-         */
+	/* Load the three remaining words from the first L1D line, first
+	 * stalling until it has filled by "looking at" r16.
+	 */
 EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r16 }   /* r13 = WORD_1 */
 EX:	{ lw r14, r1; addi r1, r1, 4 }                   /* r14 = WORD_2 */
 EX:	{ lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
 
-        /* Load second word from the second L1D line, first
-         * stalling until it has filled by "looking at" r17.
-         */
+	/* Load second word from the second L1D line, first
+	 * stalling until it has filled by "looking at" r17.
+	 */
 EX:	{ lw r19, r1; addi r1, r1, 4; move zero, r17 }  /* r19 = WORD_5 */
 
-        /* Store last word to the destination line, potentially dirtying it
-         * for the first time, which keeps the L2 busy for two cycles.
-         */
+	/* Store last word to the destination line, potentially dirtying it
+	 * for the first time, which keeps the L2 busy for two cycles.
+	 */
 EX:	{ sw r10, r12 }                                 /* store(WORD_15) */
 
-        /* Use two L1D hits to cover the sw L2 access above. */
+	/* Use two L1D hits to cover the sw L2 access above. */
 EX:	{ lw r10, r1; addi r1, r1, 4 }                  /* r10 = WORD_6 */
 EX:	{ lw r12, r1; addi r1, r1, 4 }                  /* r12 = WORD_7 */
 
-        /* Fill third L1D line. */
+	/* Fill third L1D line. */
 EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 
-        /* Store first L1D line. */
+	/* Store first L1D line. */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
 #if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
 #else
-        /* Back up the r9 to a cache line we are already storing to
+	/* Back up the r9 to a cache line we are already storing to
 	 * if it gets past the end of the dest vector.  Strictly speaking,
 	 * we don't need to back up to the start of a cache line, but it's free
 	 * and tidy, so why not?
-         */
+	 */
 EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
 #endif
-        /* Store second L1D line. */
+	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
 EX:	{ sw r0, r10; addi r0, r0, 4 }                  /* store(WORD_6) */
@@ -348,30 +350,30 @@ EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r18 }  /* r13 = WORD_9 */
 EX:	{ lw r14, r1; addi r1, r1, 4 }                  /* r14 = WORD_10 */
 EX:	{ lw r15, r1; move r1, r20   }                  /* r15 = WORD_11 */
 
-        /* Store third L1D line. */
+	/* Store third L1D line. */
 EX:	{ sw r0, r18; addi r0, r0, 4 }                  /* store(WORD_8) */
 EX:	{ sw r0, r13; addi r0, r0, 4 }                  /* store(WORD_9) */
 EX:	{ sw r0, r14; addi r0, r0, 4 }                  /* store(WORD_10) */
 EX:	{ sw r0, r15; addi r0, r0, 4 }                  /* store(WORD_11) */
 
-        /* Store rest of fourth L1D line. */
+	/* Store rest of fourth L1D line. */
 EX:	{ sw r0, r4;  addi r0, r0, 4 }                  /* store(WORD_12) */
-        {
+	{
 EX:	sw r0, r8                                       /* store(WORD_13) */
-        addi r0, r0, 4
+	addi r0, r0, 4
 	/* Will r2 be > 64 after we subtract 64 below? */
-        shri r4, r2, 7
-        }
-        {
+	shri r4, r2, 7
+	}
+	{
 EX:	sw r0, r11                                      /* store(WORD_14) */
-        addi r0, r0, 8
-        /* Record 64 bytes successfully copied. */
-        addi r2, r2, -64
-        }
+	addi r0, r0, 8
+	/* Record 64 bytes successfully copied. */
+	addi r2, r2, -64
+	}
 
 	{ jrp lr; move lr, r27 }
 
-        /* Convey to the backtrace library that the stack frame is size
+	/* Convey to the backtrace library that the stack frame is size
 	 * zero, and the real return address is on the stack rather than
 	 * in 'lr'.
 	 */
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index dfedea7..f7d4a6a 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -54,7 +54,7 @@ typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
  * we must run with interrupts disabled to avoid the risk of some
  * other code seeing the incoherent data in our cache.  (Recall that
  * our cache is indexed by PA, so even if the other code doesn't use
- * our KM_MEMCPY virtual addresses, they'll still hit in cache using
+ * our kmap_atomic virtual addresses, they'll still hit in cache using
  * the normal VAs that aren't supposed to hit in cache.)
  */
 static void memcpy_multicache(void *dest, const void *source,
@@ -64,6 +64,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	unsigned long flags, newsrc, newdst;
 	pmd_t *pmdp;
 	pte_t *ptep;
+	int type0, type1;
 	int cpu = get_cpu();
 
 	/*
@@ -77,7 +78,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	sim_allow_multiple_caching(1);
 
 	/* Set up the new dest mapping */
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0;
+	type0 = kmap_atomic_idx_push();
+	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
 	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
 	ptep = pte_offset_kernel(pmdp, newdst);
@@ -87,7 +89,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	}
 
 	/* Set up the new source mapping */
-	idx += (KM_MEMCPY0 - KM_MEMCPY1);
+	type1 = kmap_atomic_idx_push();
+	idx += (type0 - type1);
 	src_pte = hv_pte_set_nc(src_pte);
 	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
 	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
@@ -119,6 +122,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	 * We're done: notify the simulator that all is back to normal,
 	 * and re-enable interrupts and pre-emption.
 	 */
+	kmap_atomic_idx_pop();
+	kmap_atomic_idx_pop();
 	sim_allow_multiple_caching(0);
 	local_irq_restore(flags);
 	put_cpu();
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove.c
index fd615ae..fd615ae 100644
--- a/arch/tile/lib/memmove_32.c
+++ b/arch/tile/lib/memmove.c
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index d014c1f..57dbb3a 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+#undef memset
 
 void *memset(void *s, int c, size_t n)
 {
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 485e24d..5cd1c40 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -167,23 +167,30 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
 	 * when we compare them.
 	 */
 	u32 my_ticket_;
+	u32 iterations = 0;
 
-	/* Take out the next ticket; this will also stop would-be readers. */
-	if (val & 1)
-		val = get_rwlock(rwlock);
-	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
+	/*
+	 * Wait until there are no readers, then bump up the next
+	 * field and capture the ticket value.
+	 */
+	for (;;) {
+		if (!(val & 1)) {
+			if ((val >> RD_COUNT_SHIFT) == 0)
+				break;
+			rwlock->lock = val;
+		}
+		delay_backoff(iterations++);
+		val = __insn_tns((int *)&rwlock->lock);
+	}
 
-	/* Extract my ticket value from the original word. */
+	/* Take out the next ticket and extract my ticket value. */
+	rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
 	my_ticket_ = val >> WR_NEXT_SHIFT;
 
-	/*
-	 * Wait until the "current" field matches our ticket, and
-	 * there are no remaining readers.
-	 */
+	/* Wait until the "current" field matches our ticket. */
 	for (;;) {
 		u32 curr_ = val >> WR_CURR_SHIFT;
-		u32 readers = val >> RD_COUNT_SHIFT;
-		u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers;
+		u32 delta = ((my_ticket_ - curr_) & WR_MASK);
 		if (likely(delta == 0))
 			break;
 
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index f26f88e..4974292 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,6 +16,8 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+#undef strlen
+
 size_t strlen(const char *s)
 {
 	/* Get an aligned pointer. */
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 704f3e8..dcebfc8 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -24,7 +24,6 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/tty.h>
@@ -66,10 +65,10 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
 #ifndef __tilegx__
 /*
  * Synthesize the fault a PL0 process would get by doing a word-load of
- * an unaligned address or a high kernel address.  Called indirectly
- * from sys_cmpxchg() in kernel/intvec.S.
+ * an unaligned address or a high kernel address.
  */
-int _sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *regs)
+SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address,
+		struct pt_regs *, regs)
 {
 	if (address >= PAGE_OFFSET)
 		force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address,
@@ -563,10 +562,10 @@ do_sigbus:
 /*
  * When we take an ITLB or DTLB fault or access violation in the
  * supervisor while the critical section bit is set, the hypervisor is
- * reluctant to write new values into the EX_CONTEXT_1_x registers,
+ * reluctant to write new values into the EX_CONTEXT_K_x registers,
  * since that might indicate we have not yet squirreled the SPR
  * contents away and can thus safely take a recursive interrupt.
- * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_1_2.
+ * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_K_2.
  *
  * Note that this routine is called before homecache_tlb_defer_enter(),
  * which means that we can properly unlock any atomics that might
@@ -610,7 +609,7 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
 	 * fault.  We didn't set up a kernel stack on initial entry to
 	 * sys_cmpxchg, but instead had one set up by the fault, which
 	 * (because sys_cmpxchg never releases ICS) came to us via the
-	 * SYSTEM_SAVE_1_2 mechanism, and thus EX_CONTEXT_1_[01] are
+	 * SYSTEM_SAVE_K_2 mechanism, and thus EX_CONTEXT_K_[01] are
 	 * still referencing the original user code.  We release the
 	 * atomic lock and rewrite pt_regs so that it appears that we
 	 * came from user-space directly, and after we finish the
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 12ab137..31dbbd9 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -56,50 +56,6 @@ void kunmap(struct page *page)
 }
 EXPORT_SYMBOL(kunmap);
 
-static void debug_kmap_atomic_prot(enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
-	static unsigned warn_count = 10;
-
-	if (unlikely(warn_count == 0))
-		return;
-
-	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_BIO_SRC_IRQ &&
-			    /* type != KM_BIO_DST_IRQ && */
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		} else if (!irqs_disabled()) {	/* softirq */
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
-			    type != KM_SKB_SUNRPC_DATA &&
-			    type != KM_SKB_DATA_SOFTIRQ &&
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		}
-	}
-
-	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-	    type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) {
-		if (!irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
-		if (irq_count() == 0 && !irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	}
-#endif
-}
-
 /*
  * Describe a single atomic mapping of a page on a given cpu at a
  * given address, and allow it to be linked into a list.
@@ -240,10 +196,10 @@ void kmap_atomic_fix_kpte(struct page *page, int finished)
  * When holding an atomic kmap is is not legal to sleep, so atomic
  * kmaps are appropriate for short, tight code paths only.
  */
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
-	enum fixed_addresses idx;
 	unsigned long vaddr;
+	int idx, type;
 	pte_t *pte;
 
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
@@ -255,8 +211,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 	if (!PageHighMem(page))
 		return page_address(page);
 
-	debug_kmap_atomic_prot(type);
-
+	type = kmap_atomic_idx_push();
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	pte = kmap_get_pte(vaddr);
@@ -269,28 +224,35 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 }
 EXPORT_SYMBOL(kmap_atomic_prot);
 
-void *kmap_atomic(struct page *page, enum km_type type)
+void *__kmap_atomic(struct page *page)
 {
 	/* PAGE_NONE is a magic value that tells us to check immutability. */
-	return kmap_atomic_prot(page, type, PAGE_NONE);
+	return kmap_atomic_prot(page, PAGE_NONE);
 }
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(__kmap_atomic);
 
-void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
+void __kunmap_atomic(void *kvaddr)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-	/*
-	 * Force other mappings to Oops if they try to access this pte without
-	 * first remapping it.  Keeping stale mappings around is a bad idea.
-	 */
-	if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) {
+	if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+	    vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
 		pte_t *pte = kmap_get_pte(vaddr);
 		pte_t pteval = *pte;
+		int idx, type;
+
+		type = kmap_atomic_idx();
+		idx = type + KM_TYPE_NR*smp_processor_id();
+
+		/*
+		 * Force other mappings to Oops if they try to access this pte
+		 * without first remapping it.  Keeping stale mappings around
+		 * is a bad idea.
+		 */
 		BUG_ON(!pte_present(pteval) && !pte_migrating(pteval));
 		kmap_atomic_unregister(pte_page(pteval), vaddr);
 		kpte_clear_flush(pte, vaddr);
+		kmap_atomic_idx_pop();
 	} else {
 		/* Must be a lowmem page */
 		BUG_ON(vaddr < PAGE_OFFSET);
@@ -300,19 +262,19 @@ void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
-EXPORT_SYMBOL(kunmap_atomic_notypecheck);
+EXPORT_SYMBOL(__kunmap_atomic);
 
 /*
  * This API is supposed to allow us to map memory without a "struct page".
  * Currently we don't support this, though this may change in the future.
  */
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+void *kmap_atomic_pfn(unsigned long pfn)
 {
-	return kmap_atomic(pfn_to_page(pfn), type);
+	return kmap_atomic(pfn_to_page(pfn));
 }
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 {
-	return kmap_atomic_prot(pfn_to_page(pfn), type, prot);
+	return kmap_atomic_prot(pfn_to_page(pfn), prot);
 }
 
 struct page *kmap_atomic_to_page(void *ptr)
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index fb3b4a5..d78df3a 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -37,6 +37,8 @@
 #include <asm/pgalloc.h>
 #include <asm/homecache.h>
 
+#include <arch/sim.h>
+
 #include "migrate.h"
 
 
@@ -217,13 +219,6 @@ static unsigned long cache_flush_length(unsigned long length)
 	return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length;
 }
 
-/* On the simulator, confirm lines have been evicted everywhere. */
-static void validate_lines_evicted(unsigned long pfn, size_t length)
-{
-	sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED,
-		    (HV_PhysAddr)pfn << PAGE_SHIFT, length);
-}
-
 /* Flush a page out of whatever cache(s) it is in. */
 void homecache_flush_cache(struct page *page, int order)
 {
@@ -234,7 +229,7 @@ void homecache_flush_cache(struct page *page, int order)
 
 	homecache_mask(page, pages, &home_mask);
 	flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0);
-	validate_lines_evicted(pfn, pages * PAGE_SIZE);
+	sim_validate_lines_evicted(PFN_PHYS(pfn), pages * PAGE_SIZE);
 }
 
 
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 24688b6..201a582 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -21,7 +21,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/sysctl.h>
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index d89c9ea..0b9ce69 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -988,8 +988,12 @@ static long __write_once initfree = 1;
 /* Select whether to free (1) or mark unusable (0) the __init pages. */
 static int __init set_initfree(char *str)
 {
-	strict_strtol(str, 0, &initfree);
-	pr_info("initfree: %s free init pages\n", initfree ? "will" : "won't");
+	long val;
+	if (strict_strtol(str, 0, &val)) {
+		initfree = val;
+		pr_info("initfree: %s free init pages\n",
+			initfree ? "will" : "won't");
+	}
 	return 1;
 }
 __setup("initfree=", set_initfree);
@@ -1060,7 +1064,7 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from 0xfd000000 that we won't use again after init.
+	 * pages from MEM_SV_INTRPT that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 335c246..1f5430c 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -134,9 +134,9 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 }
 
 #if defined(CONFIG_HIGHPTE)
-pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
+pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
 {
-	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
+	pte_t *pte = kmap_atomic(pmd_page(*dir)) +
 		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
 	return &pte[pte_index(address)];
 }