author     gonzo <gonzo@FreeBSD.org>  2012-08-15 03:03:03 +0000
committer  gonzo <gonzo@FreeBSD.org>  2012-08-15 03:03:03 +0000
commit     032427f3e9854fccfdddaea8fb15ae4603391a11 (patch)
tree       68d86df1ea7d9bfea335c91632747716f5a0df4a
parent     eca813ad76756aea4f70787cf7827d4b319cfe94 (diff)
Merging projects/armv6, part 1
Cumulative patch of changes that are not vendor-specific:
- ARMv6 and ARMv7 architecture support
- ARM SMP support
- VFP/NEON support
- ARM Generic Interrupt Controller driver
- Simplification of startup code for all platforms
-rw-r--r--  sys/arm/arm/bcopyinout.S | 77
-rw-r--r--  sys/arm/arm/bcopyinout_xscale.S | 33
-rw-r--r--  sys/arm/arm/bus_space_asm_generic.S | 16
-rw-r--r--  sys/arm/arm/busdma_machdep-v6.c | 1559
-rw-r--r--  sys/arm/arm/copystr.S | 35
-rw-r--r--  sys/arm/arm/cpufunc.c | 526
-rw-r--r--  sys/arm/arm/cpufunc_asm.S | 4
-rw-r--r--  sys/arm/arm/cpufunc_asm_arm11.S | 5
-rw-r--r--  sys/arm/arm/cpufunc_asm_armv7.S | 277
-rw-r--r--  sys/arm/arm/cpufunc_asm_pj4b.S | 202
-rw-r--r--  sys/arm/arm/elf_trampoline.c | 155
-rw-r--r--  sys/arm/arm/fusu.S | 114
-rw-r--r--  sys/arm/arm/genassym.c | 15
-rw-r--r--  sys/arm/arm/gic.c | 307
-rw-r--r--  sys/arm/arm/identcpu.c | 265
-rw-r--r--  sys/arm/arm/locore.S | 160
-rw-r--r--  sys/arm/arm/machdep.c | 35
-rw-r--r--  sys/arm/arm/mp_machdep.c | 393
-rw-r--r--  sys/arm/arm/mpcore_timer.c | 431
-rw-r--r--  sys/arm/arm/pl310.c | 321
-rw-r--r--  sys/arm/arm/pmap-v6.c | 3780
-rw-r--r--  sys/arm/arm/pmap.c | 12
-rw-r--r--  sys/arm/arm/swtch.S | 161
-rw-r--r--  sys/arm/arm/sys_machdep.c | 18
-rw-r--r--  sys/arm/arm/undefined.c | 10
-rw-r--r--  sys/arm/arm/vfp.c | 260
-rw-r--r--  sys/arm/arm/vm_machdep.c | 17
-rw-r--r--  sys/arm/at91/at91_machdep.c | 6
-rw-r--r--  sys/arm/at91/std.at91 | 1
-rw-r--r--  sys/arm/econa/econa_machdep.c | 6
-rw-r--r--  sys/arm/econa/std.econa | 2
-rw-r--r--  sys/arm/include/armreg.h | 86
-rw-r--r--  sys/arm/include/asm.h | 45
-rw-r--r--  sys/arm/include/asmacros.h | 86
-rw-r--r--  sys/arm/include/atomic.h | 514
-rw-r--r--  sys/arm/include/cpuconf.h | 40
-rw-r--r--  sys/arm/include/cpufunc.h | 103
-rw-r--r--  sys/arm/include/fp.h | 7
-rw-r--r--  sys/arm/include/intr.h | 5
-rw-r--r--  sys/arm/include/md_var.h | 1
-rw-r--r--  sys/arm/include/param.h | 10
-rw-r--r--  sys/arm/include/pcb.h | 5
-rw-r--r--  sys/arm/include/pcpu.h | 63
-rw-r--r--  sys/arm/include/pl310.h | 38
-rw-r--r--  sys/arm/include/pmap.h | 144
-rw-r--r--  sys/arm/include/pte.h | 42
-rw-r--r--  sys/arm/include/smp.h | 29
-rw-r--r--  sys/arm/include/sysarch.h | 9
-rw-r--r--  sys/arm/include/vfp.h | 128
-rw-r--r--  sys/arm/include/vmparam.h | 2
-rw-r--r--  sys/arm/s3c2xx0/s3c24x0_machdep.c | 6
-rw-r--r--  sys/arm/s3c2xx0/std.ln2410sbc | 1
-rw-r--r--  sys/arm/s3c2xx0/std.s3c2410 | 1
-rw-r--r--  sys/arm/sa11x0/assabet_machdep.c | 8
-rw-r--r--  sys/arm/sa11x0/std.sa11x0 | 1
-rw-r--r--  sys/arm/xscale/i80321/ep80219_machdep.c | 3
-rw-r--r--  sys/arm/xscale/i80321/iq31244_machdep.c | 3
-rw-r--r--  sys/arm/xscale/i8134x/crb_machdep.c | 3
-rw-r--r--  sys/arm/xscale/ixp425/avila_machdep.c | 3
-rw-r--r--  sys/arm/xscale/pxa/pxa_machdep.c | 3
-rw-r--r--  sys/arm/xscale/std.xscale | 1
-rw-r--r--  sys/conf/Makefile.arm | 4
-rw-r--r--  sys/conf/files.arm | 9
-rw-r--r--  sys/conf/options.arm | 26
64 files changed, 9940 insertions, 692 deletions
diff --git a/sys/arm/arm/bcopyinout.S b/sys/arm/arm/bcopyinout.S
index 0d26ffa..992d0d7 100644
--- a/sys/arm/arm/bcopyinout.S
+++ b/sys/arm/arm/bcopyinout.S
@@ -54,14 +54,19 @@ __FBSDID("$FreeBSD$");
.text
.align 0
-#ifdef MULTIPROCESSOR
-.Lcpu_info:
- .word _C_LABEL(cpu_info)
+#ifdef _ARM_ARCH_6
+#define GET_PCB(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4; \
+ add tmp, tmp, #(PC_CURPCB)
#else
.Lcurpcb:
- .word _C_LABEL(__pcpu) + PC_CURPCB
+ .word _C_LABEL(__pcpu) + PC_CURPCB
+
+#define GET_PCB(tmp) \
+ ldr tmp, .Lcurpcb
#endif
+
#define SAVE_REGS stmfd sp!, {r4-r11}
#define RESTORE_REGS ldmfd sp!, {r4-r11}
@@ -111,18 +116,9 @@ ENTRY(copyin)
.Lnormal:
SAVE_REGS
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r2, r14}
- bl _C_LABEL(cpu_number)
- ldr r4, .Lcpu_info
- ldr r4, [r4, r0, lsl #2]
- ldr r4, [r4, #CI_CURPCB]
- ldmfd sp!, {r0-r2, r14}
-#else
- ldr r4, .Lcurpcb
+ GET_PCB(r4)
ldr r4, [r4]
-#endif
+
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
@@ -357,18 +353,8 @@ ENTRY(copyout)
.Lnormale:
SAVE_REGS
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r2, r14}
- bl _C_LABEL(cpu_number)
- ldr r4, .Lcpu_info
- ldr r4, [r4, r0, lsl #2]
- ldr r4, [r4, #CI_CURPCB]
- ldmfd sp!, {r0-r2, r14}
-#else
- ldr r4, .Lcurpcb
+ GET_PCB(r4)
ldr r4, [r4]
-#endif
ldr r5, [r4, #PCB_ONFAULT]
adr r3, .Lcopyfault
@@ -561,18 +547,9 @@ ENTRY(copyout)
* else EFAULT if a page fault occurred.
*/
ENTRY(badaddr_read_1)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0-r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
+
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
@@ -595,18 +572,9 @@ ENTRY(badaddr_read_1)
* else EFAULT if a page fault occurred.
*/
ENTRY(badaddr_read_2)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0-r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
+
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
@@ -629,18 +597,9 @@ ENTRY(badaddr_read_2)
* else EFAULT if a page fault occurred.
*/
ENTRY(badaddr_read_4)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0-r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
+
ldr ip, [r2, #PCB_ONFAULT]
adr r3, 1f
str r3, [r2, #PCB_ONFAULT]
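The repeated MULTIPROCESSOR/.Lcurpcb blocks in this file (and in bcopyinout_xscale.S and copystr.S below) are collapsed into a single GET_PCB() macro. On ARMv6 and later the macro reads the per-CPU data pointer from the TPIDRPRW register (CP15 c13, c0, 4) and adds the PC_CURPCB offset, so the current PCB is located without the old cpu_number()/cpu_info table walk. A rough C sketch of the same lookup, not part of this commit; it assumes the PC_CURPCB offset from genassym and the pcb/pcpu layout from <machine/pcb.h> and <machine/pcpu.h>:

/*
 * Sketch only: mirrors what GET_PCB(rN) followed by "ldr rN, [rN]" does in
 * the assembly above on an ARMv6/ARMv7 kernel.
 */
static __inline struct pcb *
sketch_get_curpcb(void)
{
	char *pcpu;

	/* The ARMv6 pcpu code stashes the per-CPU data pointer in TPIDRPRW. */
	__asm __volatile("mrc p15, 0, %0, c13, c0, 4" : "=r" (pcpu));
	return (*(struct pcb **)(pcpu + PC_CURPCB));
}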
diff --git a/sys/arm/arm/bcopyinout_xscale.S b/sys/arm/arm/bcopyinout_xscale.S
index 64d9f42..a2853cc 100644
--- a/sys/arm/arm/bcopyinout_xscale.S
+++ b/sys/arm/arm/bcopyinout_xscale.S
@@ -41,12 +41,15 @@ __FBSDID("$FreeBSD$");
.text
.align 0
-#ifdef MULTIPROCESSOR
-.Lcpu_info:
- .word _C_LABEL(cpu_info)
+#ifdef _ARM_ARCH_6
+#define GET_PCB(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4; \
+ add tmp, tmp, #(PC_CURPCB)
#else
.Lcurpcb:
.word _C_LABEL(__pcpu) + PC_CURPCB
+#define GET_PCB(tmp) \
+ ldr tmp, .Lcurpcb
#endif
/*
@@ -85,18 +88,8 @@ ENTRY(copyin)
.Lnormal:
stmfd sp!, {r10-r11, lr}
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r2}
- bl _C_LABEL(cpu_number)
- ldr r10, .Lcpu_info
- ldmfd sp!, {r0-r2}
- ldr r10, [r10, r0, lsl #2]
- ldr r10, [r10, #CI_CURPCB]
-#else
- ldr r10, .Lcurpcb
+ GET_PCB(r10)
ldr r10, [r10]
-#endif
mov r3, #0x00
adr ip, .Lcopyin_fault
@@ -537,18 +530,8 @@ ENTRY(copyout)
.Lnormale:
stmfd sp!, {r10-r11, lr}
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r2}
- bl _C_LABEL(cpu_number)
- ldr r10, .Lcpu_info
- ldmfd sp!, {r0-r2}
- ldr r10, [r10, r0, lsl #2]
- ldr r10, [r10, #CI_CURPCB]
-#else
- ldr r10, .Lcurpcb
+ GET_PCB(r10)
ldr r10, [r10]
-#endif
mov r3, #0x00
adr ip, .Lcopyout_fault
diff --git a/sys/arm/arm/bus_space_asm_generic.S b/sys/arm/arm/bus_space_asm_generic.S
index 37c1808..2492474 100644
--- a/sys/arm/arm/bus_space_asm_generic.S
+++ b/sys/arm/arm/bus_space_asm_generic.S
@@ -51,11 +51,9 @@ ENTRY(generic_bs_r_1)
ldrb r0, [r1, r2]
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_r_2)
ldrh r0, [r1, r2]
RET
-#endif
ENTRY(generic_bs_r_4)
ldr r0, [r1, r2]
@@ -69,11 +67,9 @@ ENTRY(generic_bs_w_1)
strb r3, [r1, r2]
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_w_2)
strh r3, [r1, r2]
RET
-#endif
ENTRY(generic_bs_w_4)
str r3, [r1, r2]
@@ -97,7 +93,6 @@ ENTRY(generic_bs_rm_1)
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_rm_2)
add r0, r1, r2
mov r1, r3
@@ -111,7 +106,6 @@ ENTRY(generic_armv4_bs_rm_2)
bne 1b
RET
-#endif
ENTRY(generic_bs_rm_4)
add r0, r1, r2
@@ -145,7 +139,6 @@ ENTRY(generic_bs_wm_1)
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_wm_2)
add r0, r1, r2
mov r1, r3
@@ -159,7 +152,6 @@ ENTRY(generic_armv4_bs_wm_2)
bne 1b
RET
-#endif
ENTRY(generic_bs_wm_4)
add r0, r1, r2
@@ -193,7 +185,6 @@ ENTRY(generic_bs_rr_1)
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_rr_2)
add r0, r1, r2
mov r1, r3
@@ -207,7 +198,6 @@ ENTRY(generic_armv4_bs_rr_2)
bne 1b
RET
-#endif
ENTRY(generic_bs_rr_4)
add r0, r1, r2
@@ -241,7 +231,6 @@ ENTRY(generic_bs_wr_1)
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_wr_2)
add r0, r1, r2
mov r1, r3
@@ -255,7 +244,6 @@ ENTRY(generic_armv4_bs_wr_2)
bne 1b
RET
-#endif
ENTRY(generic_bs_wr_4)
add r0, r1, r2
@@ -288,7 +276,6 @@ ENTRY(generic_bs_sr_1)
RET
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_sr_2)
add r0, r1, r2
mov r1, r3
@@ -301,7 +288,6 @@ ENTRY(generic_armv4_bs_sr_2)
bne 1b
RET
-#endif
ENTRY(generic_bs_sr_4)
add r0, r1, r2
@@ -320,7 +306,6 @@ ENTRY(generic_bs_sr_4)
* copy region
*/
-#if (ARM_ARCH_4 + ARM_ARCH_5) > 0
ENTRY(generic_armv4_bs_c_2)
add r0, r1, r2
ldr r2, [sp, #0]
@@ -350,4 +335,3 @@ ENTRY(generic_armv4_bs_c_2)
bne 3b
RET
-#endif
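Dropping the #if (ARM_ARCH_4 + ARM_ARCH_5) > 0 guards means the generic_armv4 halfword accessors are now built unconditionally, which keeps them available to the ARMv6/ARMv7 kernels added by this merge. Drivers still reach them through the ordinary bus_space(9) interface; a minimal sketch follows, in which the softc layout and register offsets are hypothetical:

/* Needs <sys/param.h>, <sys/bus.h> and <machine/bus.h>. */
struct sketch_softc {
	bus_space_tag_t		sc_bst;
	bus_space_handle_t	sc_bsh;
};

static uint16_t
sketch_read_status(struct sketch_softc *sc)
{
	/* Resolves to generic_armv4_bs_r_2() on tags using the generic ops. */
	return (bus_space_read_2(sc->sc_bst, sc->sc_bsh, 0x10 /* STATUS */));
}

static void
sketch_write_ctrl(struct sketch_softc *sc, uint16_t val)
{
	bus_space_write_2(sc->sc_bst, sc->sc_bsh, 0x14 /* CTRL */, val);
}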
diff --git a/sys/arm/arm/busdma_machdep-v6.c b/sys/arm/arm/busdma_machdep-v6.c
new file mode 100644
index 0000000..17287f73
--- /dev/null
+++ b/sys/arm/arm/busdma_machdep-v6.c
@@ -0,0 +1,1559 @@
+/*-
+ * Copyright (c) 2010 Mark Tinguely
+ * Copyright (c) 2004 Olivier Houchard
+ * Copyright (c) 2002 Peter Grehan
+ * Copyright (c) 1997, 1998 Justin T. Gibbs.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From i386/busdma_machdep.c 191438 2009-04-23 20:24:19Z jhb
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define _ARM32_BUS_DMA_PRIVATE
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#include <ddb/db_output.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/uio.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+
+#define MAX_BPAGES 64
+#define BUS_DMA_COULD_BOUNCE BUS_DMA_BUS3
+#define BUS_DMA_MIN_ALLOC_COMP BUS_DMA_BUS4
+
+#define FIX_DMAP_BUS_DMASYNC_POSTREAD
+
+struct bounce_zone;
+
+struct bus_dma_tag {
+ bus_dma_tag_t parent;
+ bus_size_t alignment;
+ bus_size_t boundary;
+ bus_addr_t lowaddr;
+ bus_addr_t highaddr;
+ bus_dma_filter_t *filter;
+ void *filterarg;
+ bus_size_t maxsize;
+ u_int nsegments;
+ bus_size_t maxsegsz;
+ int flags;
+ int ref_count;
+ int map_count;
+ bus_dma_lock_t *lockfunc;
+ void *lockfuncarg;
+ bus_dma_segment_t *segments;
+ struct bounce_zone *bounce_zone;
+ /*
+ * DMA range for this tag. If the page doesn't fall within
+ * one of these ranges, an error is returned. The caller
+ * may then decide what to do with the transfer. If the
+ * range pointer is NULL, it is ignored.
+ */
+ struct arm32_dma_range *ranges;
+ int _nranges;
+
+};
+
+struct bounce_page {
+ vm_offset_t vaddr; /* kva of bounce buffer */
+ bus_addr_t busaddr; /* Physical address */
+ vm_offset_t datavaddr; /* kva of client data */
+ bus_size_t datacount; /* client data count */
+ STAILQ_ENTRY(bounce_page) links;
+};
+
+struct sync_list {
+ vm_offset_t vaddr; /* kva of bounce buffer */
+ bus_addr_t busaddr; /* Physical address */
+ bus_size_t datacount; /* client data count */
+ STAILQ_ENTRY(sync_list) slinks;
+};
+
+int busdma_swi_pending;
+
+struct bounce_zone {
+ STAILQ_ENTRY(bounce_zone) links;
+ STAILQ_HEAD(bp_list, bounce_page) bounce_page_list;
+ int total_bpages;
+ int free_bpages;
+ int reserved_bpages;
+ int active_bpages;
+ int total_bounced;
+ int total_deferred;
+ int map_count;
+ bus_size_t alignment;
+ bus_addr_t lowaddr;
+ char zoneid[8];
+ char lowaddrid[20];
+ struct sysctl_ctx_list sysctl_tree;
+ struct sysctl_oid *sysctl_tree_top;
+};
+
+static struct mtx bounce_lock;
+static int total_bpages;
+static int busdma_zonecount;
+static STAILQ_HEAD(, bounce_zone) bounce_zone_list;
+
+SYSCTL_NODE(_hw, OID_AUTO, busdma, CTLFLAG_RD, 0, "Busdma parameters");
+SYSCTL_INT(_hw_busdma, OID_AUTO, total_bpages, CTLFLAG_RD, &total_bpages, 0,
+ "Total bounce pages");
+
+struct bus_dmamap {
+ struct bp_list bpages;
+ int pagesneeded;
+ int pagesreserved;
+ bus_dma_tag_t dmat;
+ void *buf; /* unmapped buffer pointer */
+ bus_size_t buflen; /* unmapped buffer length */
+ pmap_t pmap;
+ bus_dmamap_callback_t *callback;
+ void *callback_arg;
+ STAILQ_ENTRY(bus_dmamap) links;
+ STAILQ_HEAD(,sync_list) slist;
+};
+
+static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
+static STAILQ_HEAD(, bus_dmamap) bounce_map_callbacklist;
+
+static void init_bounce_pages(void *dummy);
+static int alloc_bounce_zone(bus_dma_tag_t dmat);
+static int alloc_bounce_pages(bus_dma_tag_t dmat, u_int numpages);
+static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
+ int commit);
+static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
+ vm_offset_t vaddr, bus_size_t size);
+static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
+int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
+static int _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
+ void *buf, bus_size_t buflen, int flags);
+
+static __inline int
+_bus_dma_can_bounce(vm_offset_t lowaddr, vm_offset_t highaddr)
+{
+ int i;
+ for (i = 0; phys_avail[i] && phys_avail[i + 1]; i += 2) {
+ if ((lowaddr >= phys_avail[i] && lowaddr <= phys_avail[i + 1])
+ || (lowaddr < phys_avail[i] &&
+ highaddr > phys_avail[i]))
+ return (1);
+ }
+ return (0);
+}
+
+static __inline struct arm32_dma_range *
+_bus_dma_inrange(struct arm32_dma_range *ranges, int nranges,
+ bus_addr_t curaddr)
+{
+ struct arm32_dma_range *dr;
+ int i;
+
+ for (i = 0, dr = ranges; i < nranges; i++, dr++) {
+ if (curaddr >= dr->dr_sysbase &&
+ round_page(curaddr) <= (dr->dr_sysbase + dr->dr_len))
+ return (dr);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Return true if a match is made.
+ *
+ * To find a match, walk the chain of bus_dma_tag_t's looking for 'paddr'.
+ *
+ * If paddr is within the bounds of the dma tag then call the filter callback
+ * to check for a match; if there is no filter callback, assume a match.
+ */
+int
+run_filter(bus_dma_tag_t dmat, bus_addr_t paddr)
+{
+ int retval;
+
+ retval = 0;
+
+ do {
+ if (((paddr > dmat->lowaddr && paddr <= dmat->highaddr)
+ || ((paddr & (dmat->alignment - 1)) != 0))
+ && (dmat->filter == NULL
+ || (*dmat->filter)(dmat->filterarg, paddr) != 0))
+ retval = 1;
+
+ dmat = dmat->parent;
+ } while (retval == 0 && dmat != NULL);
+ return (retval);
+}
+
+/*
+ * Convenience function for manipulating driver locks from busdma (during
+ * busdma_swi, for example). Drivers that don't provide their own locks
+ * should specify &Giant to dmat->lockfuncarg. Drivers that use their own
+ * non-mutex locking scheme don't have to use this at all.
+ */
+void
+busdma_lock_mutex(void *arg, bus_dma_lock_op_t op)
+{
+ struct mtx *dmtx;
+
+ dmtx = (struct mtx *)arg;
+ switch (op) {
+ case BUS_DMA_LOCK:
+ mtx_lock(dmtx);
+ break;
+ case BUS_DMA_UNLOCK:
+ mtx_unlock(dmtx);
+ break;
+ default:
+ panic("Unknown operation 0x%x for busdma_lock_mutex!", op);
+ }
+}
+
+/*
+ * dflt_lock should never get called. It gets put into the dma tag when
+ * lockfunc == NULL, which is only valid if the maps that are associated
+ * with the tag are meant to never be deferred.
+ * XXX Should have a way to identify which driver is responsible here.
+ */
+static void
+dflt_lock(void *arg, bus_dma_lock_op_t op)
+{
+ panic("driver error: busdma dflt_lock called");
+}
+
+/*
+ * Allocate a device specific dma_tag.
+ */
+int
+bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
+ bus_size_t boundary, bus_addr_t lowaddr,
+ bus_addr_t highaddr, bus_dma_filter_t *filter,
+ void *filterarg, bus_size_t maxsize, int nsegments,
+ bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
+ void *lockfuncarg, bus_dma_tag_t *dmat)
+{
+ bus_dma_tag_t newtag;
+ int error = 0;
+
+#if 0
+ if (!parent)
+ parent = arm_root_dma_tag;
+#endif
+
+ /* Basic sanity checking */
+ if (boundary != 0 && boundary < maxsegsz)
+ maxsegsz = boundary;
+
+ /* Return a NULL tag on failure */
+ *dmat = NULL;
+
+ if (maxsegsz == 0) {
+ return (EINVAL);
+ }
+
+ newtag = (bus_dma_tag_t)malloc(sizeof(*newtag), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (newtag == NULL) {
+ CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
+ __func__, newtag, 0, error);
+ return (ENOMEM);
+ }
+
+ newtag->parent = parent;
+ newtag->alignment = alignment;
+ newtag->boundary = boundary;
+ newtag->lowaddr = trunc_page((vm_paddr_t)lowaddr) + (PAGE_SIZE - 1);
+ newtag->highaddr = trunc_page((vm_paddr_t)highaddr) +
+ (PAGE_SIZE - 1);
+ newtag->filter = filter;
+ newtag->filterarg = filterarg;
+ newtag->maxsize = maxsize;
+ newtag->nsegments = nsegments;
+ newtag->maxsegsz = maxsegsz;
+ newtag->flags = flags;
+ newtag->ref_count = 1; /* Count ourself */
+ newtag->map_count = 0;
+ newtag->ranges = bus_dma_get_range();
+ newtag->_nranges = bus_dma_get_range_nb();
+ if (lockfunc != NULL) {
+ newtag->lockfunc = lockfunc;
+ newtag->lockfuncarg = lockfuncarg;
+ } else {
+ newtag->lockfunc = dflt_lock;
+ newtag->lockfuncarg = NULL;
+ }
+ newtag->segments = NULL;
+
+ /* Take into account any restrictions imposed by our parent tag */
+ if (parent != NULL) {
+ newtag->lowaddr = MIN(parent->lowaddr, newtag->lowaddr);
+ newtag->highaddr = MAX(parent->highaddr, newtag->highaddr);
+ if (newtag->boundary == 0)
+ newtag->boundary = parent->boundary;
+ else if (parent->boundary != 0)
+ newtag->boundary = MIN(parent->boundary,
+ newtag->boundary);
+ if ((newtag->filter != NULL) ||
+ ((parent->flags & BUS_DMA_COULD_BOUNCE) != 0))
+ newtag->flags |= BUS_DMA_COULD_BOUNCE;
+ if (newtag->filter == NULL) {
+ /*
+ * Short circuit looking at our parent directly
+ * since we have encapsulated all of its information
+ */
+ newtag->filter = parent->filter;
+ newtag->filterarg = parent->filterarg;
+ newtag->parent = parent->parent;
+ }
+ if (newtag->parent != NULL)
+ atomic_add_int(&parent->ref_count, 1);
+ }
+
+ if (_bus_dma_can_bounce(newtag->lowaddr, newtag->highaddr)
+ || newtag->alignment > 1)
+ newtag->flags |= BUS_DMA_COULD_BOUNCE;
+
+ if (((newtag->flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+ (flags & BUS_DMA_ALLOCNOW) != 0) {
+ struct bounce_zone *bz;
+
+ /* Must bounce */
+
+ if ((error = alloc_bounce_zone(newtag)) != 0) {
+ free(newtag, M_DEVBUF);
+ return (error);
+ }
+ bz = newtag->bounce_zone;
+
+ if (ptoa(bz->total_bpages) < maxsize) {
+ int pages;
+
+ pages = atop(maxsize) - bz->total_bpages;
+
+ /* Add pages to our bounce pool */
+ if (alloc_bounce_pages(newtag, pages) < pages)
+ error = ENOMEM;
+ }
+ /* Performed initial allocation */
+ newtag->flags |= BUS_DMA_MIN_ALLOC_COMP;
+ } else
+ newtag->bounce_zone = NULL;
+
+ if (error != 0) {
+ free(newtag, M_DEVBUF);
+ } else {
+ *dmat = newtag;
+ }
+ CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
+ __func__, newtag, (newtag != NULL ? newtag->flags : 0), error);
+ return (error);
+}
+
+int
+bus_dma_tag_destroy(bus_dma_tag_t dmat)
+{
+ bus_dma_tag_t dmat_copy;
+ int error;
+
+ error = 0;
+ dmat_copy = dmat;
+
+ if (dmat != NULL) {
+
+ if (dmat->map_count != 0) {
+ error = EBUSY;
+ goto out;
+ }
+
+ while (dmat != NULL) {
+ bus_dma_tag_t parent;
+
+ parent = dmat->parent;
+ atomic_subtract_int(&dmat->ref_count, 1);
+ if (dmat->ref_count == 0) {
+ if (dmat->segments != NULL)
+ free(dmat->segments, M_DEVBUF);
+ free(dmat, M_DEVBUF);
+ /*
+ * Last reference count, so
+ * release our reference
+ * count on our parent.
+ */
+ dmat = parent;
+ } else
+ dmat = NULL;
+ }
+ }
+out:
+ CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
+ return (error);
+}
+
+/*
+ * Allocate a handle for mapping from kva/uva/physical
+ * address space into bus device space.
+ */
+int
+bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
+{
+ int error;
+
+ error = 0;
+
+ *mapp = (bus_dmamap_t)malloc(sizeof(**mapp), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (*mapp == NULL) {
+ CTR3(KTR_BUSDMA, "%s: tag %p error %d", __func__, dmat, ENOMEM);
+ return (ENOMEM);
+ }
+ STAILQ_INIT(&((*mapp)->slist));
+
+ if (dmat->segments == NULL) {
+ dmat->segments = (bus_dma_segment_t *)malloc(
+ sizeof(bus_dma_segment_t) * dmat->nsegments, M_DEVBUF,
+ M_NOWAIT);
+ if (dmat->segments == NULL) {
+ CTR3(KTR_BUSDMA, "%s: tag %p error %d",
+ __func__, dmat, ENOMEM);
+ free(*mapp, M_DEVBUF);
+ *mapp = NULL;
+ return (ENOMEM);
+ }
+ }
+ /*
+ * Bouncing might be required if the driver asks for an active
+ * exclusion region, a data alignment that is stricter than 1, and/or
+ * an active address boundary.
+ */
+ if (dmat->flags & BUS_DMA_COULD_BOUNCE) {
+
+ /* Must bounce */
+ struct bounce_zone *bz;
+ int maxpages;
+
+ if (dmat->bounce_zone == NULL) {
+ if ((error = alloc_bounce_zone(dmat)) != 0) {
+ free(*mapp, M_DEVBUF);
+ *mapp = NULL;
+ return (error);
+ }
+ }
+ bz = dmat->bounce_zone;
+
+ /* Initialize the new map */
+ STAILQ_INIT(&((*mapp)->bpages));
+
+ /*
+ * Attempt to add pages to our pool on a per-instance
+ * basis up to a sane limit.
+ */
+ maxpages = MAX_BPAGES;
+ if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0
+ || (bz->map_count > 0 && bz->total_bpages < maxpages)) {
+ int pages;
+
+ pages = MAX(atop(dmat->maxsize), 1);
+ pages = MIN(maxpages - bz->total_bpages, pages);
+ pages = MAX(pages, 1);
+ if (alloc_bounce_pages(dmat, pages) < pages)
+ error = ENOMEM;
+
+ if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0) {
+ if (error == 0)
+ dmat->flags |= BUS_DMA_MIN_ALLOC_COMP;
+ } else {
+ error = 0;
+ }
+ }
+ bz->map_count++;
+ }
+ if (error == 0)
+ dmat->map_count++;
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
+ __func__, dmat, dmat->flags, error);
+ return (error);
+}
+
+/*
+ * Destroy a handle for mapping from kva/uva/physical
+ * address space into bus device space.
+ */
+int
+bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map)
+{
+ if (STAILQ_FIRST(&map->bpages) != NULL ||
+ STAILQ_FIRST(&map->slist) != NULL) {
+ CTR3(KTR_BUSDMA, "%s: tag %p error %d",
+ __func__, dmat, EBUSY);
+ return (EBUSY);
+ }
+ if (dmat->bounce_zone)
+ dmat->bounce_zone->map_count--;
+ free(map, M_DEVBUF);
+ dmat->map_count--;
+ CTR2(KTR_BUSDMA, "%s: tag %p error 0", __func__, dmat);
+ return (0);
+}
+
+
+/*
+ * Allocate a piece of memory that can be efficiently mapped into
+ * bus device space based on the constraints listed in the dma tag.
+ * A dmamap for use with dmamap_load is also allocated.
+ */
+int
+bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
+ bus_dmamap_t *mapp)
+{
+ int mflags, len;
+
+ if (flags & BUS_DMA_NOWAIT)
+ mflags = M_NOWAIT;
+ else
+ mflags = M_WAITOK;
+
+ /* ARM non-snooping caches need a map for the VA cache sync structure */
+
+ *mapp = (bus_dmamap_t)malloc(sizeof(**mapp), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (*mapp == NULL) {
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
+ __func__, dmat, dmat->flags, ENOMEM);
+ return (ENOMEM);
+ }
+
+ STAILQ_INIT(&((*mapp)->slist));
+
+ if (dmat->segments == NULL) {
+ dmat->segments = (bus_dma_segment_t *)malloc(
+ sizeof(bus_dma_segment_t) * dmat->nsegments, M_DEVBUF,
+ mflags);
+ if (dmat->segments == NULL) {
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
+ __func__, dmat, dmat->flags, ENOMEM);
+ free(*mapp, M_DEVBUF);
+ *mapp = NULL;
+ return (ENOMEM);
+ }
+ }
+
+ if (flags & BUS_DMA_ZERO)
+ mflags |= M_ZERO;
+
+ /*
+ * XXX:
+ * (dmat->alignment < dmat->maxsize) is just a quick hack; the exact
+ * alignment guarantees of malloc need to be nailed down, and the
+ * code below should be rewritten to take that into account.
+ *
+ * In the meantime, we'll warn the user if malloc gets it wrong.
+ *
+ * Allocate at least a cache line. This should help avoid cache
+ * corruption.
+ */
+ len = max(dmat->maxsize, arm_dcache_align);
+ if (len <= PAGE_SIZE &&
+ (dmat->alignment < len) &&
+ !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr)) {
+ *vaddr = malloc(len, M_DEVBUF, mflags);
+ } else {
+ /*
+ * XXX Use Contigmalloc until it is merged into this facility
+ * and handles multi-seg allocations. Nobody is doing
+ * multi-seg allocations yet though.
+ * XXX Certain AGP hardware does.
+ */
+ *vaddr = contigmalloc(len, M_DEVBUF, mflags,
+ 0ul, dmat->lowaddr, dmat->alignment? dmat->alignment : 1ul,
+ dmat->boundary);
+ }
+ if (*vaddr == NULL) {
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
+ __func__, dmat, dmat->flags, ENOMEM);
+ free(*mapp, M_DEVBUF);
+ *mapp = NULL;
+ return (ENOMEM);
+ } else if ((uintptr_t)*vaddr & (dmat->alignment - 1)) {
+ printf("bus_dmamem_alloc failed to align memory properly.\n");
+ }
+ dmat->map_count++;
+
+ if (flags & BUS_DMA_COHERENT)
+ pmap_change_attr((vm_offset_t)*vaddr, len,
+ BUS_DMA_NOCACHE);
+
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
+ __func__, dmat, dmat->flags, 0);
+ return (0);
+}
+
+/*
+ * Free a piece of memory and its associated dmamap that were allocated
+ * via bus_dmamem_alloc. Make the same choice for free/contigfree.
+ */
+void
+bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map)
+{
+ int len;
+
+#ifdef mftnotyet
+ pmap_change_attr((vm_offset_t)vaddr, dmat->maxsize, ARM_WRITE_BACK);
+#endif
+ len = max(dmat->maxsize, arm_dcache_align);
+ if (len <= PAGE_SIZE &&
+ (dmat->alignment < len) &&
+ !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr))
+ free(vaddr, M_DEVBUF);
+ else {
+ contigfree(vaddr, len, M_DEVBUF);
+ }
+ dmat->map_count--;
+ free(map, M_DEVBUF);
+ CTR3(KTR_BUSDMA, "%s: tag %p flags 0x%x", __func__, dmat, dmat->flags);
+}
+
+static int
+_bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
+ void *buf, bus_size_t buflen, int flags)
+{
+ vm_offset_t vaddr;
+ vm_offset_t vendaddr;
+ bus_addr_t paddr;
+
+ if (map->pagesneeded == 0) {
+ CTR5(KTR_BUSDMA, "lowaddr= %d, boundary= %d, alignment= %d"
+ " map= %p, pagesneeded= %d",
+ dmat->lowaddr, dmat->boundary, dmat->alignment,
+ map, map->pagesneeded);
+ /*
+ * Count the number of bounce pages
+ * needed in order to complete this transfer
+ */
+ vaddr = (vm_offset_t)buf;
+ vendaddr = (vm_offset_t)buf + buflen;
+
+ while (vaddr < vendaddr) {
+ if (__predict_true(map->pmap == pmap_kernel()))
+ paddr = pmap_kextract(vaddr);
+ else
+ paddr = pmap_extract(map->pmap, vaddr);
+ if (((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+ run_filter(dmat, paddr) != 0) {
+ map->pagesneeded++;
+ }
+ vaddr += (PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK));
+
+ }
+ CTR1(KTR_BUSDMA, "pagesneeded= %d", map->pagesneeded);
+ }
+
+ /* Reserve Necessary Bounce Pages */
+ if (map->pagesneeded != 0) {
+ mtx_lock(&bounce_lock);
+ if (flags & BUS_DMA_NOWAIT) {
+ if (reserve_bounce_pages(dmat, map, 0) != 0) {
+ map->pagesneeded = 0;
+ mtx_unlock(&bounce_lock);
+ return (ENOMEM);
+ }
+ } else {
+ if (reserve_bounce_pages(dmat, map, 1) != 0) {
+ /* Queue us for resources */
+ map->dmat = dmat;
+ map->buf = buf;
+ map->buflen = buflen;
+ STAILQ_INSERT_TAIL(&bounce_map_waitinglist,
+ map, links);
+ mtx_unlock(&bounce_lock);
+ return (EINPROGRESS);
+ }
+ }
+ mtx_unlock(&bounce_lock);
+ }
+
+ return (0);
+}
+
+/*
+ * Utility function to load a linear buffer. lastaddrp holds state
+ * between invocations (for multiple-buffer loads). segp contains
+ * the starting segment on entry, and the ending segment on exit.
+ * first indicates if this is the first invocation of this function.
+ */
+static __inline int
+_bus_dmamap_load_buffer(bus_dma_tag_t dmat,
+ bus_dmamap_t map,
+ void *buf, bus_size_t buflen,
+ int flags,
+ bus_addr_t *lastaddrp,
+ bus_dma_segment_t *segs,
+ int *segp,
+ int first)
+{
+ bus_size_t sgsize;
+ bus_addr_t curaddr, lastaddr, baddr, bmask;
+ vm_offset_t vaddr;
+ struct sync_list *sl;
+ int seg, error;
+
+ if ((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) {
+ error = _bus_dmamap_count_pages(dmat, map, buf, buflen, flags);
+ if (error)
+ return (error);
+ }
+
+ sl = NULL;
+ vaddr = (vm_offset_t)buf;
+ lastaddr = *lastaddrp;
+ bmask = ~(dmat->boundary - 1);
+
+ for (seg = *segp; buflen > 0 ; ) {
+ /*
+ * Get the physical address for this segment.
+ */
+ if (__predict_true(map->pmap == pmap_kernel()))
+ curaddr = pmap_kextract(vaddr);
+ else
+ curaddr = pmap_extract(map->pmap, vaddr);
+
+ /*
+ * Compute the segment size, and adjust counts.
+ */
+ sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK);
+ if (sgsize > dmat->maxsegsz)
+ sgsize = dmat->maxsegsz;
+ if (buflen < sgsize)
+ sgsize = buflen;
+
+ /*
+ * Make sure we don't cross any boundaries.
+ */
+ if (dmat->boundary > 0) {
+ baddr = (curaddr + dmat->boundary) & bmask;
+ if (sgsize > (baddr - curaddr))
+ sgsize = (baddr - curaddr);
+ }
+
+ if (((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+ map->pagesneeded != 0 && run_filter(dmat, curaddr)) {
+ curaddr = add_bounce_page(dmat, map, vaddr, sgsize);
+ } else {
+ /* add_sync_list(dmat, map, vaddr, sgsize, cflag); */
+ sl = (struct sync_list *)malloc(sizeof(struct sync_list),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sl == NULL)
+ goto cleanup;
+ STAILQ_INSERT_TAIL(&(map->slist), sl, slinks);
+ sl->vaddr = vaddr;
+ sl->datacount = sgsize;
+ sl->busaddr = curaddr;
+ }
+
+
+ if (dmat->ranges) {
+ struct arm32_dma_range *dr;
+
+ dr = _bus_dma_inrange(dmat->ranges, dmat->_nranges,
+ curaddr);
+ if (dr == NULL) {
+ _bus_dmamap_unload(dmat, map);
+ return (EINVAL);
+ }
+ /*
+ * In a valid DMA range. Translate the physical
+ * memory address to an address in the DMA window.
+ */
+ curaddr = (curaddr - dr->dr_sysbase) + dr->dr_busbase;
+ }
+
+ /*
+ * Insert chunk into a segment, coalescing with
+ * previous segment if possible.
+ */
+ if (first) {
+ segs[seg].ds_addr = curaddr;
+ segs[seg].ds_len = sgsize;
+ first = 0;
+ } else {
+ if (curaddr == lastaddr &&
+ (segs[seg].ds_len + sgsize) <= dmat->maxsegsz &&
+ (dmat->boundary == 0 ||
+ (segs[seg].ds_addr & bmask) == (curaddr & bmask)))
+ segs[seg].ds_len += sgsize;
+ else {
+ if (++seg >= dmat->nsegments)
+ break;
+ segs[seg].ds_addr = curaddr;
+ segs[seg].ds_len = sgsize;
+ }
+ }
+
+ lastaddr = curaddr + sgsize;
+ vaddr += sgsize;
+ buflen -= sgsize;
+ }
+
+ *segp = seg;
+ *lastaddrp = lastaddr;
+cleanup:
+ /*
+ * Did we fit?
+ */
+ if (buflen != 0) {
+ _bus_dmamap_unload(dmat, map);
+ return(EFBIG); /* XXX better return value here? */
+ }
+ return (0);
+}
+
+/*
+ * Map the buffer buf into bus space using the dmamap map.
+ */
+int
+bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+ bus_size_t buflen, bus_dmamap_callback_t *callback,
+ void *callback_arg, int flags)
+{
+ bus_addr_t lastaddr = 0;
+ int error, nsegs = 0;
+
+ flags |= BUS_DMA_WAITOK;
+ map->callback = callback;
+ map->callback_arg = callback_arg;
+ map->pmap = kernel_pmap;
+
+ error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, flags,
+ &lastaddr, dmat->segments, &nsegs, 1);
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, dmat->flags, error, nsegs + 1);
+
+ if (error == EINPROGRESS) {
+ return (error);
+ }
+
+ if (error)
+ (*callback)(callback_arg, dmat->segments, 0, error);
+ else
+ (*callback)(callback_arg, dmat->segments, nsegs + 1, 0);
+
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
+
+
+/*
+ * Like _bus_dmamap_load(), but for mbufs.
+ */
+static __inline int
+_bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs,
+ int flags)
+{
+ int error;
+
+ M_ASSERTPKTHDR(m0);
+ map->pmap = kernel_pmap;
+
+ flags |= BUS_DMA_NOWAIT;
+ *nsegs = 0;
+ error = 0;
+ if (m0->m_pkthdr.len <= dmat->maxsize) {
+ int first = 1;
+ bus_addr_t lastaddr = 0;
+ struct mbuf *m;
+
+ for (m = m0; m != NULL && error == 0; m = m->m_next) {
+ if (m->m_len > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map,
+ m->m_data, m->m_len,
+ flags, &lastaddr,
+ segs, nsegs, first);
+ first = 0;
+ }
+ }
+ } else {
+ error = EINVAL;
+ }
+
+ /* XXX FIXME: Having to increment nsegs is really annoying */
+ ++*nsegs;
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, dmat->flags, error, *nsegs);
+ return (error);
+}
+
+int
+bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m0,
+ bus_dmamap_callback2_t *callback, void *callback_arg,
+ int flags)
+{
+ int nsegs, error;
+
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, dmat->segments, &nsegs,
+ flags);
+
+ if (error) {
+ /* force "no valid mappings" in callback */
+ (*callback)(callback_arg, dmat->segments, 0, 0, error);
+ } else {
+ (*callback)(callback_arg, dmat->segments,
+ nsegs, m0->m_pkthdr.len, error);
+ }
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, dmat->flags, error, nsegs);
+
+ return (error);
+}
+
+int
+bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs,
+ int flags)
+{
+ return (_bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags));
+}
+
+/*
+ * Like _bus_dmamap_load(), but for uios.
+ */
+int
+bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct uio *uio,
+ bus_dmamap_callback2_t *callback, void *callback_arg,
+ int flags)
+{
+ bus_addr_t lastaddr;
+ int nsegs, error, first, i;
+ bus_size_t resid;
+ struct iovec *iov;
+
+ flags |= BUS_DMA_NOWAIT;
+ resid = uio->uio_resid;
+ iov = uio->uio_iov;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("bus_dmamap_load_uio: USERSPACE but no proc"));
+ map->pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ map->pmap = kernel_pmap;
+
+ nsegs = 0;
+ error = 0;
+ first = 1;
+ lastaddr = (bus_addr_t) 0;
+ for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) {
+ /*
+ * Now at the first iovec to load. Load each iovec
+ * until we have exhausted the residual count.
+ */
+ bus_size_t minlen =
+ resid < iov[i].iov_len ? resid : iov[i].iov_len;
+ caddr_t addr = (caddr_t) iov[i].iov_base;
+
+ if (minlen > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map,
+ addr, minlen, flags, &lastaddr,
+ dmat->segments, &nsegs, first);
+ first = 0;
+ resid -= minlen;
+ }
+ }
+
+ if (error) {
+ /* force "no valid mappings" in callback */
+ (*callback)(callback_arg, dmat->segments, 0, 0, error);
+ } else {
+ (*callback)(callback_arg, dmat->segments,
+ nsegs+1, uio->uio_resid, error);
+ }
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, dmat->flags, error, nsegs + 1);
+ return (error);
+}
+
+/*
+ * Release the mapping held by map.
+ */
+void
+_bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map)
+{
+ struct bounce_page *bpage;
+ struct bounce_zone *bz;
+ struct sync_list *sl;
+
+ while ((sl = STAILQ_FIRST(&map->slist)) != NULL) {
+ STAILQ_REMOVE_HEAD(&map->slist, slinks);
+ free(sl, M_DEVBUF);
+ }
+
+ if ((bz = dmat->bounce_zone) != NULL) {
+ while ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
+ STAILQ_REMOVE_HEAD(&map->bpages, links);
+ free_bounce_page(dmat, bpage);
+ }
+
+ bz = dmat->bounce_zone;
+ bz->free_bpages += map->pagesreserved;
+ bz->reserved_bpages -= map->pagesreserved;
+ map->pagesreserved = 0;
+ map->pagesneeded = 0;
+ }
+}
+
+#ifdef notyetbounceuser
+ /* If busdma uses user pages, then the interrupt handler could
+ * be using the kernel vm mapping. Neither bounce pages nor sync list
+ * entries cross page boundaries.
+ * Below is a rough sequence one would follow to fix the
+ * user page reference in the kernel vmspace. This would be
+ * done in the dma post routine.
+ */
+void
+_bus_dmamap_fix_user(vm_offset_t buf, bus_size_t len,
+ pmap_t pmap, int op)
+{
+ bus_size_t sgsize;
+ bus_addr_t curaddr;
+ vm_offset_t va;
+
+ /* each synclist entry is contained within a single page.
+ *
+ * this would be needed if BUS_DMASYNC_POSTxxxx was implemented
+ */
+ curaddr = pmap_extract(pmap, buf);
+ va = pmap_dma_map(curaddr);
+ switch (op) {
+ case SYNC_USER_INV:
+ cpu_dcache_wb_range(va, sgsize);
+ break;
+
+ case SYNC_USER_COPYTO:
+ bcopy((void *)va, (void *)bounce, sgsize);
+ break;
+
+ case SYNC_USER_COPYFROM:
+ bcopy((void *) bounce, (void *)va, sgsize);
+ break;
+
+ default:
+ break;
+ }
+
+ pmap_dma_unmap(va);
+}
+#endif
+
+#ifdef ARM_L2_PIPT
+#define l2cache_wb_range(va, pa, size) cpu_l2cache_wb_range(pa, size)
+#define l2cache_wbinv_range(va, pa, size) cpu_l2cache_wbinv_range(pa, size)
+#define l2cache_inv_range(va, pa, size) cpu_l2cache_inv_range(pa, size)
+#else
+#define l2cache_wb_range(va, pa, size) cpu_l2cache_wb_range(va, size)
+#define l2cache_wbinv_range(va, pa, size) cpu_l2cache_wbinv_range(va, size)
+#define l2cache_inv_range(va, pa, size) cpu_l2cache_wbinv_range(va, size)
+#endif
+
+void
+_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dmasync_op_t op)
+{
+ struct bounce_page *bpage;
+ struct sync_list *sl;
+ bus_size_t len, unalign;
+ vm_offset_t buf, ebuf;
+#ifdef FIX_DMAP_BUS_DMASYNC_POSTREAD
+ vm_offset_t bbuf;
+ char _tmp_cl[arm_dcache_align], _tmp_clend[arm_dcache_align];
+#endif
+ int listcount = 0;
+
+ /* if the buffer was from user space, it is possible that this
+ * is not the same vm map. The fix is to map each page in
+ * the buffer into the current address space (KVM) and then
+ * do the bounce copy or sync list cache operation.
+ *
+ * The sync list entries are already broken into
+ * their respective physical pages.
+ */
+ if (!pmap_dmap_iscurrent(map->pmap))
+ printf("_bus_dmamap_sync: wrong user map: %p %x\n", map->pmap, op);
+
+ if ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
+
+ /* Handle data bouncing. */
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
+ "performing bounce", __func__, dmat, dmat->flags, op);
+
+ if (op & BUS_DMASYNC_PREWRITE) {
+ while (bpage != NULL) {
+ bcopy((void *)bpage->datavaddr,
+ (void *)bpage->vaddr,
+ bpage->datacount);
+ cpu_dcache_wb_range((vm_offset_t)bpage->vaddr,
+ bpage->datacount);
+ l2cache_wb_range((vm_offset_t)bpage->vaddr,
+ (vm_offset_t)bpage->busaddr,
+ bpage->datacount);
+ bpage = STAILQ_NEXT(bpage, links);
+ }
+ dmat->bounce_zone->total_bounced++;
+ }
+
+ if (op & BUS_DMASYNC_POSTREAD) {
+ if (!pmap_dmap_iscurrent(map->pmap))
+ panic("_bus_dmamap_sync: wrong user map. apply fix");
+
+ cpu_dcache_inv_range((vm_offset_t)bpage->vaddr,
+ bpage->datacount);
+ l2cache_inv_range((vm_offset_t)bpage->vaddr,
+ (vm_offset_t)bpage->busaddr,
+ bpage->datacount);
+ while (bpage != NULL) {
+ vm_offset_t startv;
+ vm_paddr_t startp;
+ int len;
+
+ startv = bpage->vaddr &~ arm_dcache_align_mask;
+ startp = bpage->busaddr &~ arm_dcache_align_mask;
+ len = bpage->datacount;
+
+ if (startv != bpage->vaddr)
+ len += bpage->vaddr & arm_dcache_align_mask;
+ if (len & arm_dcache_align_mask)
+ len = (len -
+ (len & arm_dcache_align_mask)) +
+ arm_dcache_align;
+ cpu_dcache_inv_range(startv, len);
+ l2cache_inv_range(startv, startp, len);
+ bcopy((void *)bpage->vaddr,
+ (void *)bpage->datavaddr,
+ bpage->datacount);
+ bpage = STAILQ_NEXT(bpage, links);
+ }
+ dmat->bounce_zone->total_bounced++;
+ }
+ }
+
+ sl = STAILQ_FIRST(&map->slist);
+ while (sl) {
+ listcount++;
+ sl = STAILQ_NEXT(sl, slinks);
+ }
+ if ((sl = STAILQ_FIRST(&map->slist)) != NULL) {
+ /* ARM caches are not self-snooping for dma */
+
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
+ "performing sync", __func__, dmat, dmat->flags, op);
+
+ switch (op) {
+ case BUS_DMASYNC_PREWRITE:
+ while (sl != NULL) {
+ cpu_dcache_wb_range(sl->vaddr, sl->datacount);
+ l2cache_wb_range(sl->vaddr, sl->busaddr,
+ sl->datacount);
+ sl = STAILQ_NEXT(sl, slinks);
+ }
+ break;
+
+ case BUS_DMASYNC_PREREAD:
+ while (sl != NULL) {
+ /* write back the unaligned portions */
+ vm_paddr_t physaddr = sl->busaddr, ephysaddr;
+ buf = sl->vaddr;
+ len = sl->datacount;
+ ebuf = buf + len; /* end of buffer */
+ ephysaddr = physaddr + len;
+ unalign = buf & arm_dcache_align_mask;
+ if (unalign) {
+ /* wbinv leading fragment */
+ buf &= ~arm_dcache_align_mask;
+ physaddr &= ~arm_dcache_align_mask;
+ cpu_dcache_wbinv_range(buf,
+ arm_dcache_align);
+ l2cache_wbinv_range(buf, physaddr,
+ arm_dcache_align);
+ buf += arm_dcache_align;
+ physaddr += arm_dcache_align;
+ /* number of buffer bytes covered by the wbinv */
+ unalign = arm_dcache_align - unalign;
+ if (len > unalign)
+ len -= unalign;
+ else
+ len = 0;
+ }
+ unalign = ebuf & arm_dcache_align_mask;
+ if (ebuf > buf && unalign) {
+ /* wbinv trailing fragment */
+ len -= unalign;
+ ebuf -= unalign;
+ ephysaddr -= unalign;
+ cpu_dcache_wbinv_range(ebuf,
+ arm_dcache_align);
+ l2cache_wbinv_range(ebuf, ephysaddr,
+ arm_dcache_align);
+ }
+ if (ebuf > buf) {
+ cpu_dcache_inv_range(buf, len);
+ l2cache_inv_range(buf, physaddr, len);
+ }
+ sl = STAILQ_NEXT(sl, slinks);
+ }
+ break;
+
+ case BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD:
+ while (sl != NULL) {
+ cpu_dcache_wbinv_range(sl->vaddr, sl->datacount);
+ l2cache_wbinv_range(sl->vaddr,
+ sl->busaddr, sl->datacount);
+ sl = STAILQ_NEXT(sl, slinks);
+ }
+ break;
+
+#ifdef FIX_DMAP_BUS_DMASYNC_POSTREAD
+ case BUS_DMASYNC_POSTREAD:
+ if (!pmap_dmap_iscurrent(map->pmap))
+ panic("_bus_dmamap_sync: wrong user map. apply fix");
+ while (sl != NULL) {
+ /* write back the unaligned portions */
+ vm_paddr_t physaddr;
+ buf = sl->vaddr;
+ len = sl->datacount;
+ physaddr = sl->busaddr;
+ bbuf = buf & ~arm_dcache_align_mask;
+ ebuf = buf + len;
+ physaddr = physaddr & ~arm_dcache_align_mask;
+ unalign = buf & arm_dcache_align_mask;
+ if (unalign) {
+ memcpy(_tmp_cl, (void *)bbuf, unalign);
+ len += unalign; /* inv entire cache line */
+ }
+ unalign = ebuf & arm_dcache_align_mask;
+ if (unalign) {
+ unalign = arm_dcache_align - unalign;
+ memcpy(_tmp_clend, (void *)ebuf, unalign);
+ len += unalign; /* inv entire cache line */
+ }
+ /* inv are cache length aligned */
+ cpu_dcache_inv_range(bbuf, len);
+ l2cache_inv_range(bbuf, physaddr, len);
+
+ unalign = (vm_offset_t)buf & arm_dcache_align_mask;
+ if (unalign) {
+ memcpy((void *)bbuf, _tmp_cl, unalign);
+ }
+ unalign = ebuf & arm_dcache_align_mask;
+ if (unalign) {
+ unalign = arm_dcache_align - unalign;
+ memcpy((void *)ebuf, _tmp_clend, unalign);
+ }
+ sl = STAILQ_NEXT(sl, slinks);
+ }
+ break;
+#endif /* FIX_DMAP_BUS_DMASYNC_POSTREAD */
+
+ default:
+ break;
+ }
+ }
+}
+
+static void
+init_bounce_pages(void *dummy __unused)
+{
+
+ total_bpages = 0;
+ STAILQ_INIT(&bounce_zone_list);
+ STAILQ_INIT(&bounce_map_waitinglist);
+ STAILQ_INIT(&bounce_map_callbacklist);
+ mtx_init(&bounce_lock, "bounce pages lock", NULL, MTX_DEF);
+}
+SYSINIT(bpages, SI_SUB_LOCK, SI_ORDER_ANY, init_bounce_pages, NULL);
+
+static struct sysctl_ctx_list *
+busdma_sysctl_tree(struct bounce_zone *bz)
+{
+ return (&bz->sysctl_tree);
+}
+
+static struct sysctl_oid *
+busdma_sysctl_tree_top(struct bounce_zone *bz)
+{
+ return (bz->sysctl_tree_top);
+}
+
+static int
+alloc_bounce_zone(bus_dma_tag_t dmat)
+{
+ struct bounce_zone *bz;
+
+ /* Check to see if we already have a suitable zone */
+ STAILQ_FOREACH(bz, &bounce_zone_list, links) {
+ if ((dmat->alignment <= bz->alignment)
+ && (dmat->lowaddr >= bz->lowaddr)) {
+ dmat->bounce_zone = bz;
+ return (0);
+ }
+ }
+
+ if ((bz = (struct bounce_zone *)malloc(sizeof(*bz), M_DEVBUF,
+ M_NOWAIT | M_ZERO)) == NULL)
+ return (ENOMEM);
+
+ STAILQ_INIT(&bz->bounce_page_list);
+ bz->free_bpages = 0;
+ bz->reserved_bpages = 0;
+ bz->active_bpages = 0;
+ bz->lowaddr = dmat->lowaddr;
+ bz->alignment = MAX(dmat->alignment, PAGE_SIZE);
+ bz->map_count = 0;
+ snprintf(bz->zoneid, 8, "zone%d", busdma_zonecount);
+ busdma_zonecount++;
+ snprintf(bz->lowaddrid, 18, "%#jx", (uintmax_t)bz->lowaddr);
+ STAILQ_INSERT_TAIL(&bounce_zone_list, bz, links);
+ dmat->bounce_zone = bz;
+
+ sysctl_ctx_init(&bz->sysctl_tree);
+ bz->sysctl_tree_top = SYSCTL_ADD_NODE(&bz->sysctl_tree,
+ SYSCTL_STATIC_CHILDREN(_hw_busdma), OID_AUTO, bz->zoneid,
+ CTLFLAG_RD, 0, "");
+ if (bz->sysctl_tree_top == NULL) {
+ sysctl_ctx_free(&bz->sysctl_tree);
+ return (0); /* XXX error code? */
+ }
+
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "total_bpages", CTLFLAG_RD, &bz->total_bpages, 0,
+ "Total bounce pages");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "free_bpages", CTLFLAG_RD, &bz->free_bpages, 0,
+ "Free bounce pages");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "reserved_bpages", CTLFLAG_RD, &bz->reserved_bpages, 0,
+ "Reserved bounce pages");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "active_bpages", CTLFLAG_RD, &bz->active_bpages, 0,
+ "Active bounce pages");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "total_bounced", CTLFLAG_RD, &bz->total_bounced, 0,
+ "Total bounce requests");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "total_deferred", CTLFLAG_RD, &bz->total_deferred, 0,
+ "Total bounce requests that were deferred");
+ SYSCTL_ADD_STRING(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "lowaddr", CTLFLAG_RD, bz->lowaddrid, 0, "");
+ SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
+ SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
+ "alignment", CTLFLAG_RD, &bz->alignment, 0, "");
+
+ return (0);
+}
+
+static int
+alloc_bounce_pages(bus_dma_tag_t dmat, u_int numpages)
+{
+ struct bounce_zone *bz;
+ int count;
+
+ bz = dmat->bounce_zone;
+ count = 0;
+ while (numpages > 0) {
+ struct bounce_page *bpage;
+
+ bpage = (struct bounce_page *)malloc(sizeof(*bpage), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+
+ if (bpage == NULL)
+ break;
+ bpage->vaddr = (vm_offset_t)contigmalloc(PAGE_SIZE, M_DEVBUF,
+ M_NOWAIT, 0ul,
+ bz->lowaddr,
+ PAGE_SIZE,
+ 0);
+ if (bpage->vaddr == 0) {
+ free(bpage, M_DEVBUF);
+ break;
+ }
+ bpage->busaddr = pmap_kextract(bpage->vaddr);
+ mtx_lock(&bounce_lock);
+ STAILQ_INSERT_TAIL(&bz->bounce_page_list, bpage, links);
+ total_bpages++;
+ bz->total_bpages++;
+ bz->free_bpages++;
+ mtx_unlock(&bounce_lock);
+ count++;
+ numpages--;
+ }
+ return (count);
+}
+
+static int
+reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int commit)
+{
+ struct bounce_zone *bz;
+ int pages;
+
+ mtx_assert(&bounce_lock, MA_OWNED);
+ bz = dmat->bounce_zone;
+ pages = MIN(bz->free_bpages, map->pagesneeded - map->pagesreserved);
+ if (commit == 0 && map->pagesneeded > (map->pagesreserved + pages))
+ return (map->pagesneeded - (map->pagesreserved + pages));
+ bz->free_bpages -= pages;
+ bz->reserved_bpages += pages;
+ map->pagesreserved += pages;
+ pages = map->pagesneeded - map->pagesreserved;
+
+ return (pages);
+}
+
+static bus_addr_t
+add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
+ bus_size_t size)
+{
+ struct bounce_zone *bz;
+ struct bounce_page *bpage;
+
+ printf("add bounce page\n");
+ KASSERT(dmat->bounce_zone != NULL, ("no bounce zone in dma tag"));
+ KASSERT(map != NULL,
+ ("add_bounce_page: bad map %p", map));
+
+ bz = dmat->bounce_zone;
+ if (map->pagesneeded == 0)
+ panic("add_bounce_page: map doesn't need any pages");
+ map->pagesneeded--;
+
+ if (map->pagesreserved == 0)
+ panic("add_bounce_page: map doesn't need any pages");
+ map->pagesreserved--;
+
+ mtx_lock(&bounce_lock);
+ bpage = STAILQ_FIRST(&bz->bounce_page_list);
+ if (bpage == NULL)
+ panic("add_bounce_page: free page list is empty");
+
+ STAILQ_REMOVE_HEAD(&bz->bounce_page_list, links);
+ bz->reserved_bpages--;
+ bz->active_bpages++;
+ mtx_unlock(&bounce_lock);
+
+ if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) {
+ /* Page offset needs to be preserved. */
+ bpage->vaddr |= vaddr & PAGE_MASK;
+ bpage->busaddr |= vaddr & PAGE_MASK;
+ }
+ bpage->datavaddr = vaddr;
+ bpage->datacount = size;
+ STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
+ return (bpage->busaddr);
+}
+
+static void
+free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage)
+{
+ struct bus_dmamap *map;
+ struct bounce_zone *bz;
+
+ bz = dmat->bounce_zone;
+ bpage->datavaddr = 0;
+ bpage->datacount = 0;
+ if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) {
+ /*
+ * Reset the bounce page to start at offset 0. Other uses
+ * of this bounce page may need to store a full page of
+ * data and/or assume it starts on a page boundary.
+ */
+ bpage->vaddr &= ~PAGE_MASK;
+ bpage->busaddr &= ~PAGE_MASK;
+ }
+
+ mtx_lock(&bounce_lock);
+ STAILQ_INSERT_HEAD(&bz->bounce_page_list, bpage, links);
+ bz->free_bpages++;
+ bz->active_bpages--;
+ if ((map = STAILQ_FIRST(&bounce_map_waitinglist)) != NULL) {
+ if (reserve_bounce_pages(map->dmat, map, 1) == 0) {
+ STAILQ_REMOVE_HEAD(&bounce_map_waitinglist, links);
+ STAILQ_INSERT_TAIL(&bounce_map_callbacklist,
+ map, links);
+ busdma_swi_pending = 1;
+ bz->total_deferred++;
+ swi_sched(vm_ih, 0);
+ }
+ }
+ mtx_unlock(&bounce_lock);
+}
+
+void
+busdma_swi(void)
+{
+ bus_dma_tag_t dmat;
+ struct bus_dmamap *map;
+
+ mtx_lock(&bounce_lock);
+ while ((map = STAILQ_FIRST(&bounce_map_callbacklist)) != NULL) {
+ STAILQ_REMOVE_HEAD(&bounce_map_callbacklist, links);
+ mtx_unlock(&bounce_lock);
+ dmat = map->dmat;
+ (dmat->lockfunc)(dmat->lockfuncarg, BUS_DMA_LOCK);
+ bus_dmamap_load(map->dmat, map, map->buf, map->buflen,
+ map->callback, map->callback_arg, /*flags*/0);
+ (dmat->lockfunc)(dmat->lockfuncarg, BUS_DMA_UNLOCK);
+ mtx_lock(&bounce_lock);
+ }
+ mtx_unlock(&bounce_lock);
+}
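busdma_machdep-v6.c is a new ARMv6/ARMv7 implementation of the machine-dependent half of bus_dma(9): bounce pages handle devices with address or alignment restrictions, and the per-map sync list drives the explicit L1/L2 cache maintenance that non-snooping ARM caches require in bus_dmamap_sync(). For context, a stripped-down sketch of how a driver consumes this interface; the tag parameters and buffer size are made up for illustration and error unwinding is trimmed:

/* Needs <sys/param.h>, <sys/bus.h>, <sys/lock.h>, <sys/mutex.h> and <machine/bus.h>. */
static void
sketch_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	bus_addr_t *busaddrp = arg;

	if (error == 0)
		*busaddrp = segs[0].ds_addr;	/* bus address to program into the device */
}

static int
sketch_dma_setup(device_t dev)
{
	bus_dma_tag_t tag;
	bus_dmamap_t map;
	bus_addr_t busaddr;
	void *buf;
	int error;

	/* 256-byte, 4-byte aligned, single-segment buffer below 4GB. */
	error = bus_dma_tag_create(bus_get_dma_tag(dev), 4, 0,
	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
	    256, 1, 256, 0, busdma_lock_mutex, &Giant, &tag);
	if (error != 0)
		return (error);
	error = bus_dmamem_alloc(tag, &buf, BUS_DMA_NOWAIT | BUS_DMA_ZERO, &map);
	if (error != 0)
		return (error);
	error = bus_dmamap_load(tag, map, buf, 256, sketch_dma_callback,
	    &busaddr, BUS_DMA_NOWAIT);
	if (error != 0)
		return (error);

	/* Before the device touches the buffer: write back / invalidate lines. */
	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	/* ... start the transfer and wait for it to complete ... */
	/* After the device has written the buffer: invalidate stale lines. */
	bus_dmamap_sync(tag, map, BUS_DMASYNC_POSTREAD);

	bus_dmamap_unload(tag, map);
	bus_dmamem_free(tag, buf, map);
	bus_dma_tag_destroy(tag);
	return (0);
}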
diff --git a/sys/arm/arm/copystr.S b/sys/arm/arm/copystr.S
index 13bbf01..9eb8682 100644
--- a/sys/arm/arm/copystr.S
+++ b/sys/arm/arm/copystr.S
@@ -49,12 +49,17 @@ __FBSDID("$FreeBSD$");
.text
.align 0
-#ifdef MULTIPROCESSOR
-.Lcpu_info:
- .word _C_LABEL(cpu_info)
+
+#ifdef _ARM_ARCH_6
+#define GET_PCB(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4; \
+ add tmp, tmp, #(PC_CURPCB)
#else
.Lpcb:
.word _C_LABEL(__pcpu) + PC_CURPCB
+
+#define GET_PCB(tmp) \
+ ldr tmp, .Lpcb
#endif
/*
@@ -108,18 +113,8 @@ ENTRY(copyinstr)
moveq r0, #ENAMETOOLONG
beq 2f
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r3, r14}
- bl _C_LABEL(cpu_number)
- ldr r4, .Lcpu_info
- ldr r4, [r4, r0, lsl #2]
- ldr r4, [r4, #CI_CURPCB]
- ldmfd sp!, {r0-r3, r14}
-#else
- ldr r4, .Lpcb
+ GET_PCB(r4)
ldr r4, [r4]
-#endif
#ifdef DIAGNOSTIC
teq r4, #0x00000000
@@ -165,18 +160,8 @@ ENTRY(copyoutstr)
moveq r0, #ENAMETOOLONG
beq 2f
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0-r3, r14}
- bl _C_LABEL(cpu_number)
- ldr r4, .Lcpu_info
- ldr r4, [r4, r0, lsl #2]
- ldr r4, [r4, #CI_CURPCB]
- ldmfd sp!, {r0-r3, r14}
-#else
- ldr r4, .Lpcb
+ GET_PCB(r4)
ldr r4, [r4]
-#endif
#ifdef DIAGNOSTIC
teq r4, #0x00000000
diff --git a/sys/arm/arm/cpufunc.c b/sys/arm/arm/cpufunc.c
index b29dfd9..decc287 100644
--- a/sys/arm/arm/cpufunc.c
+++ b/sys/arm/arm/cpufunc.c
@@ -98,6 +98,10 @@ int arm_pcache_unified;
int arm_dcache_align;
int arm_dcache_align_mask;
+u_int arm_cache_level;
+u_int arm_cache_type[14];
+u_int arm_cache_loc;
+
/* 1 == use cpu_sleep(), 0 == don't */
int cpu_do_powersave;
int ctrl;
@@ -472,6 +476,126 @@ struct cpu_functions arm10_cpufuncs = {
};
#endif /* CPU_ARM10 */
+#ifdef CPU_MV_PJ4B
+struct cpu_functions pj4bv7_cpufuncs = {
+ /* CPU functions */
+
+ cpufunc_id, /* id */
+ arm11_drain_writebuf, /* cpwait */
+
+ /* MMU functions */
+
+ cpufunc_control, /* control */
+ cpufunc_domains, /* Domain */
+ pj4b_setttb, /* Setttb */
+ cpufunc_faultstatus, /* Faultstatus */
+ cpufunc_faultaddress, /* Faultaddress */
+
+ /* TLB functions */
+
+ armv7_tlb_flushID, /* tlb_flushID */
+ armv7_tlb_flushID_SE, /* tlb_flushID_SE */
+ armv7_tlb_flushID, /* tlb_flushI */
+ armv7_tlb_flushID_SE, /* tlb_flushI_SE */
+ armv7_tlb_flushID, /* tlb_flushD */
+ armv7_tlb_flushID_SE, /* tlb_flushD_SE */
+
+ /* Cache operations */
+ armv7_idcache_wbinv_all, /* icache_sync_all */
+ armv7_icache_sync_range, /* icache_sync_range */
+
+ armv7_dcache_wbinv_all, /* dcache_wbinv_all */
+ armv7_dcache_wbinv_range, /* dcache_wbinv_range */
+ armv7_dcache_inv_range, /* dcache_inv_range */
+ armv7_dcache_wb_range, /* dcache_wb_range */
+
+ armv7_idcache_wbinv_all, /* idcache_wbinv_all */
+ armv7_idcache_wbinv_range, /* idcache_wbinv_range */
+
+ (void *)cpufunc_nullop, /* l2cache_wbinv_all */
+ (void *)cpufunc_nullop, /* l2cache_wbinv_range */
+ (void *)cpufunc_nullop, /* l2cache_inv_range */
+ (void *)cpufunc_nullop, /* l2cache_wb_range */
+
+ /* Other functions */
+
+ pj4b_drain_readbuf, /* flush_prefetchbuf */
+ arm11_drain_writebuf, /* drain_writebuf */
+ pj4b_flush_brnchtgt_all, /* flush_brnchtgt_C */
+ pj4b_flush_brnchtgt_va, /* flush_brnchtgt_E */
+
+ (void *)cpufunc_nullop, /* sleep */
+
+ /* Soft functions */
+
+ cpufunc_null_fixup, /* dataabt_fixup */
+ cpufunc_null_fixup, /* prefetchabt_fixup */
+
+ arm11_context_switch, /* context_switch */
+
+ pj4bv7_setup /* cpu setup */
+};
+
+struct cpu_functions pj4bv6_cpufuncs = {
+ /* CPU functions */
+
+ cpufunc_id, /* id */
+ arm11_drain_writebuf, /* cpwait */
+
+ /* MMU functions */
+
+ cpufunc_control, /* control */
+ cpufunc_domains, /* Domain */
+ pj4b_setttb, /* Setttb */
+ cpufunc_faultstatus, /* Faultstatus */
+ cpufunc_faultaddress, /* Faultaddress */
+
+ /* TLB functions */
+
+ arm11_tlb_flushID, /* tlb_flushID */
+ arm11_tlb_flushID_SE, /* tlb_flushID_SE */
+ arm11_tlb_flushI, /* tlb_flushI */
+ arm11_tlb_flushI_SE, /* tlb_flushI_SE */
+ arm11_tlb_flushD, /* tlb_flushD */
+ arm11_tlb_flushD_SE, /* tlb_flushD_SE */
+
+ /* Cache operations */
+ armv6_icache_sync_all, /* icache_sync_all */
+ pj4b_icache_sync_range, /* icache_sync_range */
+
+ armv6_dcache_wbinv_all, /* dcache_wbinv_all */
+ pj4b_dcache_wbinv_range, /* dcache_wbinv_range */
+ pj4b_dcache_inv_range, /* dcache_inv_range */
+ pj4b_dcache_wb_range, /* dcache_wb_range */
+
+ armv6_idcache_wbinv_all, /* idcache_wbinv_all */
+ pj4b_idcache_wbinv_range, /* idcache_wbinv_range */
+
+ (void *)cpufunc_nullop, /* l2cache_wbinv_all */
+ (void *)cpufunc_nullop, /* l2cache_wbinv_range */
+ (void *)cpufunc_nullop, /* l2cache_inv_range */
+ (void *)cpufunc_nullop, /* l2cache_wb_range */
+
+ /* Other functions */
+
+ pj4b_drain_readbuf, /* flush_prefetchbuf */
+ arm11_drain_writebuf, /* drain_writebuf */
+ pj4b_flush_brnchtgt_all, /* flush_brnchtgt_C */
+ pj4b_flush_brnchtgt_va, /* flush_brnchtgt_E */
+
+ (void *)cpufunc_nullop, /* sleep */
+
+ /* Soft functions */
+
+ cpufunc_null_fixup, /* dataabt_fixup */
+ cpufunc_null_fixup, /* prefetchabt_fixup */
+
+ arm11_context_switch, /* context_switch */
+
+ pj4bv6_setup /* cpu setup */
+};
+#endif /* CPU_MV_PJ4B */
+
#ifdef CPU_SA110
struct cpu_functions sa110_cpufuncs = {
/* CPU functions */
@@ -844,6 +968,70 @@ struct cpu_functions fa526_cpufuncs = {
};
#endif /* CPU_FA526 || CPU_FA626TE */
+#if defined(CPU_CORTEXA)
+struct cpu_functions cortexa_cpufuncs = {
+ /* CPU functions */
+
+ cpufunc_id, /* id */
+ cpufunc_nullop, /* cpwait */
+
+ /* MMU functions */
+
+ cpufunc_control, /* control */
+ cpufunc_domains, /* Domain */
+ armv7_setttb, /* Setttb */
+ cpufunc_faultstatus, /* Faultstatus */
+ cpufunc_faultaddress, /* Faultaddress */
+
+ /* TLB functions */
+
+ arm11_tlb_flushID, /* tlb_flushID */
+ armv7_tlb_flushID_SE, /* tlb_flushID_SE */
+ arm11_tlb_flushI, /* tlb_flushI */
+ arm11_tlb_flushI_SE, /* tlb_flushI_SE */
+ arm11_tlb_flushD, /* tlb_flushD */
+ arm11_tlb_flushD_SE, /* tlb_flushD_SE */
+
+ /* Cache operations */
+
+ armv7_idcache_wbinv_all, /* icache_sync_all */
+ armv7_icache_sync_range, /* icache_sync_range */
+
+ armv7_dcache_wbinv_all, /* dcache_wbinv_all */
+ armv7_dcache_wbinv_range, /* dcache_wbinv_range */
+ armv7_dcache_inv_range, /* dcache_inv_range */
+ armv7_dcache_wb_range, /* dcache_wb_range */
+
+ armv7_idcache_wbinv_all, /* idcache_wbinv_all */
+ armv7_idcache_wbinv_range, /* idcache_wbinv_range */
+
+ /* Note: From OMAP4 the L2 ops are filled in when the
+ * L2 cache controller is actually enabled.
+ */
+ cpufunc_nullop, /* l2cache_wbinv_all */
+ (void *)cpufunc_nullop, /* l2cache_wbinv_range */
+ (void *)cpufunc_nullop, /* l2cache_inv_range */
+ (void *)cpufunc_nullop, /* l2cache_wb_range */
+
+ /* Other functions */
+
+ cpufunc_nullop, /* flush_prefetchbuf */
+ arm11_drain_writebuf, /* drain_writebuf */
+ cpufunc_nullop, /* flush_brnchtgt_C */
+ (void *)cpufunc_nullop, /* flush_brnchtgt_E */
+
+ arm11_sleep, /* sleep */
+
+ /* Soft functions */
+
+ cpufunc_null_fixup, /* dataabt_fixup */
+ cpufunc_null_fixup, /* prefetchabt_fixup */
+
+ arm11_context_switch, /* context_switch */
+
+ cortexa_setup /* cpu setup */
+};
+#endif /* CPU_CORTEXA */
/*
* Global constants also used by locore.s
@@ -854,11 +1042,12 @@ u_int cputype;
u_int cpu_reset_needs_v4_MMU_disable; /* flag used in locore.s */
#if defined(CPU_ARM7TDMI) || defined(CPU_ARM8) || defined(CPU_ARM9) || \
- defined (CPU_ARM9E) || defined (CPU_ARM10) || \
+ defined (CPU_ARM9E) || defined (CPU_ARM10) || defined (CPU_ARM11) || \
defined(CPU_XSCALE_80200) || defined(CPU_XSCALE_80321) || \
defined(CPU_XSCALE_PXA2X0) || defined(CPU_XSCALE_IXP425) || \
- defined(CPU_FA526) || defined(CPU_FA626TE) || \
- defined(CPU_XSCALE_80219) || defined(CPU_XSCALE_81342)
+ defined(CPU_FA526) || defined(CPU_FA626TE) || defined(CPU_MV_PJ4B) || \
+ defined(CPU_XSCALE_80219) || defined(CPU_XSCALE_81342) || \
+ defined(CPU_CORTEXA)
static void get_cachetype_cp15(void);
@@ -871,12 +1060,15 @@ static int arm_dcache_l2_linesize;
static void
get_cachetype_cp15()
{
- u_int ctype, isize, dsize;
+ u_int ctype, isize, dsize, cpuid;
+ u_int clevel, csize, i, sel;
u_int multiplier;
+ u_char type;
__asm __volatile("mrc p15, 0, %0, c0, c0, 1"
: "=r" (ctype));
+ cpuid = cpufunc_id();
/*
* ...and thus spake the ARM ARM:
*
@@ -884,57 +1076,89 @@ get_cachetype_cp15()
* reserved ID register is encountered, the System Control
* processor returns the value of the main ID register.
*/
- if (ctype == cpufunc_id())
+ if (ctype == cpuid)
goto out;
- if ((ctype & CPU_CT_S) == 0)
- arm_pcache_unified = 1;
+ if (CPU_CT_FORMAT(ctype) == CPU_CT_ARMV7) {
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 1"
+ : "=r" (clevel));
+ arm_cache_level = clevel;
+ arm_cache_loc = CPU_CLIDR_LOC(arm_cache_level);
+ i = 0;
+ while ((type = (clevel & 0x7)) && i < 7) {
+ if (type == CACHE_DCACHE || type == CACHE_UNI_CACHE ||
+ type == CACHE_SEP_CACHE) {
+ sel = i << 1;
+ __asm __volatile("mcr p15, 2, %0, c0, c0, 0"
+ : : "r" (sel));
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 0"
+ : "=r" (csize));
+ arm_cache_type[sel] = csize;
+ arm_dcache_align = 1 <<
+ (CPUV7_CT_xSIZE_LEN(csize) + 4);
+ arm_dcache_align_mask = arm_dcache_align - 1;
+ }
+ if (type == CACHE_ICACHE || type == CACHE_SEP_CACHE) {
+ sel = (i << 1) | 1;
+ __asm __volatile("mcr p15, 2, %0, c0, c0, 0"
+ : : "r" (sel));
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 0"
+ : "=r" (csize));
+ arm_cache_type[sel] = csize;
+ }
+ i++;
+ clevel >>= 3;
+ }
+ } else {
+ if ((ctype & CPU_CT_S) == 0)
+ arm_pcache_unified = 1;
- /*
- * If you want to know how this code works, go read the ARM ARM.
- */
+ /*
+ * If you want to know how this code works, go read the ARM ARM.
+ */
- arm_pcache_type = CPU_CT_CTYPE(ctype);
+ arm_pcache_type = CPU_CT_CTYPE(ctype);
+
+ if (arm_pcache_unified == 0) {
+ isize = CPU_CT_ISIZE(ctype);
+ multiplier = (isize & CPU_CT_xSIZE_M) ? 3 : 2;
+ arm_picache_line_size = 1U << (CPU_CT_xSIZE_LEN(isize) + 3);
+ if (CPU_CT_xSIZE_ASSOC(isize) == 0) {
+ if (isize & CPU_CT_xSIZE_M)
+ arm_picache_line_size = 0; /* not present */
+ else
+ arm_picache_ways = 1;
+ } else {
+ arm_picache_ways = multiplier <<
+ (CPU_CT_xSIZE_ASSOC(isize) - 1);
+ }
+ arm_picache_size = multiplier << (CPU_CT_xSIZE_SIZE(isize) + 8);
+ }
- if (arm_pcache_unified == 0) {
- isize = CPU_CT_ISIZE(ctype);
- multiplier = (isize & CPU_CT_xSIZE_M) ? 3 : 2;
- arm_picache_line_size = 1U << (CPU_CT_xSIZE_LEN(isize) + 3);
- if (CPU_CT_xSIZE_ASSOC(isize) == 0) {
- if (isize & CPU_CT_xSIZE_M)
- arm_picache_line_size = 0; /* not present */
+ dsize = CPU_CT_DSIZE(ctype);
+ multiplier = (dsize & CPU_CT_xSIZE_M) ? 3 : 2;
+ arm_pdcache_line_size = 1U << (CPU_CT_xSIZE_LEN(dsize) + 3);
+ if (CPU_CT_xSIZE_ASSOC(dsize) == 0) {
+ if (dsize & CPU_CT_xSIZE_M)
+ arm_pdcache_line_size = 0; /* not present */
else
- arm_picache_ways = 1;
+ arm_pdcache_ways = 1;
} else {
- arm_picache_ways = multiplier <<
- (CPU_CT_xSIZE_ASSOC(isize) - 1);
+ arm_pdcache_ways = multiplier <<
+ (CPU_CT_xSIZE_ASSOC(dsize) - 1);
}
- arm_picache_size = multiplier << (CPU_CT_xSIZE_SIZE(isize) + 8);
- }
+ arm_pdcache_size = multiplier << (CPU_CT_xSIZE_SIZE(dsize) + 8);
- dsize = CPU_CT_DSIZE(ctype);
- multiplier = (dsize & CPU_CT_xSIZE_M) ? 3 : 2;
- arm_pdcache_line_size = 1U << (CPU_CT_xSIZE_LEN(dsize) + 3);
- if (CPU_CT_xSIZE_ASSOC(dsize) == 0) {
- if (dsize & CPU_CT_xSIZE_M)
- arm_pdcache_line_size = 0; /* not present */
- else
- arm_pdcache_ways = 1;
- } else {
- arm_pdcache_ways = multiplier <<
- (CPU_CT_xSIZE_ASSOC(dsize) - 1);
- }
- arm_pdcache_size = multiplier << (CPU_CT_xSIZE_SIZE(dsize) + 8);
-
- arm_dcache_align = arm_pdcache_line_size;
+ arm_dcache_align = arm_pdcache_line_size;
- arm_dcache_l2_assoc = CPU_CT_xSIZE_ASSOC(dsize) + multiplier - 2;
- arm_dcache_l2_linesize = CPU_CT_xSIZE_LEN(dsize) + 3;
- arm_dcache_l2_nsets = 6 + CPU_CT_xSIZE_SIZE(dsize) -
- CPU_CT_xSIZE_ASSOC(dsize) - CPU_CT_xSIZE_LEN(dsize);
+ arm_dcache_l2_assoc = CPU_CT_xSIZE_ASSOC(dsize) + multiplier - 2;
+ arm_dcache_l2_linesize = CPU_CT_xSIZE_LEN(dsize) + 3;
+ arm_dcache_l2_nsets = 6 + CPU_CT_xSIZE_SIZE(dsize) -
+ CPU_CT_xSIZE_ASSOC(dsize) - CPU_CT_xSIZE_LEN(dsize);
- out:
- arm_dcache_align_mask = arm_dcache_align - 1;
+ out:
+ arm_dcache_align_mask = arm_dcache_align - 1;
+ }
}
#endif /* ARM7TDMI || ARM8 || ARM9 || XSCALE */
@@ -1049,40 +1273,32 @@ set_cpufuncs()
}
#endif /* CPU_ARM9 */
#if defined(CPU_ARM9E) || defined(CPU_ARM10)
- if (cputype == CPU_ID_ARM926EJS || cputype == CPU_ID_ARM1026EJS ||
- cputype == CPU_ID_MV88FR131 || cputype == CPU_ID_MV88FR571_VD ||
+ if (cputype == CPU_ID_MV88FR131 || cputype == CPU_ID_MV88FR571_VD ||
cputype == CPU_ID_MV88FR571_41) {
- if (cputype == CPU_ID_MV88FR131 ||
- cputype == CPU_ID_MV88FR571_VD ||
- cputype == CPU_ID_MV88FR571_41) {
-
- cpufuncs = sheeva_cpufuncs;
- /*
- * Workaround for Marvell MV78100 CPU: Cache prefetch
- * mechanism may affect the cache coherency validity,
- * so it needs to be disabled.
- *
- * Refer to errata document MV-S501058-00C.pdf (p. 3.1
- * L2 Prefetching Mechanism) for details.
- */
- if (cputype == CPU_ID_MV88FR571_VD ||
- cputype == CPU_ID_MV88FR571_41) {
- sheeva_control_ext(0xffffffff,
- FC_DCACHE_STREAM_EN | FC_WR_ALLOC_EN |
- FC_BRANCH_TARG_BUF_DIS | FC_L2CACHE_EN |
- FC_L2_PREF_DIS);
- } else {
- sheeva_control_ext(0xffffffff,
- FC_DCACHE_STREAM_EN | FC_WR_ALLOC_EN |
- FC_BRANCH_TARG_BUF_DIS | FC_L2CACHE_EN);
- }
+ uint32_t sheeva_ctrl;
+
+ sheeva_ctrl = (MV_DC_STREAM_ENABLE | MV_BTB_DISABLE |
+ MV_L2_ENABLE);
+ /*
+ * Workaround for Marvell MV78100 CPU: Cache prefetch
+ * mechanism may affect the cache coherency validity,
+ * so it needs to be disabled.
+ *
+ * Refer to errata document MV-S501058-00C.pdf (p. 3.1
+ * L2 Prefetching Mechanism) for details.
+ */
+ if (cputype == CPU_ID_MV88FR571_VD ||
+ cputype == CPU_ID_MV88FR571_41)
+ sheeva_ctrl |= MV_L2_PREFETCH_DISABLE;
- /* Use powersave on this CPU. */
- cpu_do_powersave = 1;
- } else
- cpufuncs = armv5_ec_cpufuncs;
+ sheeva_control_ext(0xffffffff & ~MV_WA_ENABLE, sheeva_ctrl);
- cpu_reset_needs_v4_MMU_disable = 1; /* V4 or higher */
+ cpufuncs = sheeva_cpufuncs;
+ get_cachetype_cp15();
+ pmap_pte_init_generic();
+ goto out;
+ } else if (cputype == CPU_ID_ARM926EJS || cputype == CPU_ID_ARM1026EJS) {
+ cpufuncs = armv5_ec_cpufuncs;
get_cachetype_cp15();
pmap_pte_init_generic();
goto out;
@@ -1108,6 +1324,45 @@ set_cpufuncs()
goto out;
}
#endif /* CPU_ARM10 */
+#ifdef CPU_CORTEXA
+ if (cputype == CPU_ID_CORTEXA8R1 ||
+ cputype == CPU_ID_CORTEXA8R2 ||
+ cputype == CPU_ID_CORTEXA8R3 ||
+ cputype == CPU_ID_CORTEXA9R1 ||
+ cputype == CPU_ID_CORTEXA9R2) {
+ cpufuncs = cortexa_cpufuncs;
+ cpu_reset_needs_v4_MMU_disable = 1; /* V4 or higher */
+ get_cachetype_cp15();
+
+ pmap_pte_init_mmu_v6();
+ /* Use powersave on this CPU. */
+ cpu_do_powersave = 1;
+ goto out;
+ }
+#endif /* CPU_CORTEXA */
+
+#if defined(CPU_MV_PJ4B)
+ if (cputype == CPU_ID_MV88SV581X_V6 ||
+ cputype == CPU_ID_MV88SV581X_V7 ||
+ cputype == CPU_ID_ARM_88SV581X_V6 ||
+ cputype == CPU_ID_ARM_88SV581X_V7) {
+ if (cpu_pfr(0) & ARM_PFR0_THUMBEE_MASK)
+ cpufuncs = pj4bv7_cpufuncs;
+ else
+ cpufuncs = pj4bv6_cpufuncs;
+
+ get_cachetype_cp15();
+ pmap_pte_init_mmu_v6();
+ goto out;
+ } else if (cputype == CPU_ID_ARM_88SV584X ||
+ cputype == CPU_ID_MV88SV584X) {
+ cpufuncs = pj4bv6_cpufuncs;
+ get_cachetype_cp15();
+ pmap_pte_init_mmu_v6();
+ goto out;
+ }
+
+#endif /* CPU_MV_PJ4B */
#ifdef CPU_SA110
if (cputype == CPU_ID_SA110) {
cpufuncs = sa110_cpufuncs;
@@ -1970,7 +2225,6 @@ arm11_setup(args)
__asm __volatile ("mcr\tp15, 0, r0, c7, c7, 0" : : );
/* Set the control register */
- curcpu()->ci_ctrl = cpuctrl;
cpu_control(0xffffffff, cpuctrl);
/* And again. */
@@ -1978,6 +2232,126 @@ arm11_setup(args)
}
#endif /* CPU_ARM11 */
+#ifdef CPU_MV_PJ4B
+void
+pj4bv6_setup(char *args)
+{
+ int cpuctrl;
+
+ pj4b_config();
+
+ cpuctrl = CPU_CONTROL_MMU_ENABLE;
+#ifndef ARM32_DISABLE_ALIGNMENT_FAULTS
+ cpuctrl |= CPU_CONTROL_AFLT_ENABLE;
+#endif
+ cpuctrl |= CPU_CONTROL_DC_ENABLE;
+ cpuctrl |= (0xf << 3);
+#ifdef __ARMEB__
+ cpuctrl |= CPU_CONTROL_BEND_ENABLE;
+#endif
+ cpuctrl |= CPU_CONTROL_SYST_ENABLE;
+ cpuctrl |= CPU_CONTROL_BPRD_ENABLE;
+ cpuctrl |= CPU_CONTROL_IC_ENABLE;
+ if (vector_page == ARM_VECTORS_HIGH)
+ cpuctrl |= CPU_CONTROL_VECRELOC;
+ cpuctrl |= (0x5 << 16);
+ cpuctrl |= CPU_CONTROL_V6_EXTPAGE;
+ /* XXX not yet */
+ /* cpuctrl |= CPU_CONTROL_L2_ENABLE; */
+
+ /* Make sure caches are clean. */
+ cpu_idcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+
+ /* Set the control register */
+ ctrl = cpuctrl;
+ cpu_control(0xffffffff, cpuctrl);
+
+ cpu_idcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+}
+
+void
+pj4bv7_setup(args)
+ char *args;
+{
+ int cpuctrl;
+
+ pj4b_config();
+
+ cpuctrl = CPU_CONTROL_MMU_ENABLE;
+#ifndef ARM32_DISABLE_ALIGNMENT_FAULTS
+ cpuctrl |= CPU_CONTROL_AFLT_ENABLE;
+#endif
+ cpuctrl |= CPU_CONTROL_DC_ENABLE;
+ cpuctrl |= (0xf << 3);
+ cpuctrl |= CPU_CONTROL_BPRD_ENABLE;
+ cpuctrl |= CPU_CONTROL_IC_ENABLE;
+ if (vector_page == ARM_VECTORS_HIGH)
+ cpuctrl |= CPU_CONTROL_VECRELOC;
+ cpuctrl |= (0x5 << 16) | (1 << 22);
+ cpuctrl |= CPU_CONTROL_V6_EXTPAGE;
+
+ /* Clear out the cache */
+ cpu_idcache_wbinv_all();
+
+ /* Set the control register */
+ ctrl = cpuctrl;
+ cpu_control(0xFFFFFFFF, cpuctrl);
+
+ /* And again. */
+ cpu_idcache_wbinv_all();
+}
+#endif /* CPU_MV_PJ4B */
+
+#ifdef CPU_CORTEXA
+
+void
+cortexa_setup(char *args)
+{
+ int cpuctrl, cpuctrlmask;
+
+ cpuctrlmask = CPU_CONTROL_MMU_ENABLE | /* MMU enable [0] */
+ CPU_CONTROL_AFLT_ENABLE | /* Alignment fault [1] */
+ CPU_CONTROL_DC_ENABLE | /* DCache enable [2] */
+ CPU_CONTROL_BPRD_ENABLE | /* Branch prediction [11] */
+ CPU_CONTROL_IC_ENABLE | /* ICache enable [12] */
+ CPU_CONTROL_VECRELOC; /* Vector relocation [13] */
+
+ cpuctrl = CPU_CONTROL_MMU_ENABLE |
+ CPU_CONTROL_IC_ENABLE |
+ CPU_CONTROL_DC_ENABLE |
+ CPU_CONTROL_BPRD_ENABLE;
+
+#ifndef ARM32_DISABLE_ALIGNMENT_FAULTS
+ cpuctrl |= CPU_CONTROL_AFLT_ENABLE;
+#endif
+
+ /* Switch to big endian */
+#ifdef __ARMEB__
+ cpuctrl |= CPU_CONTROL_BEND_ENABLE;
+#endif
+
+ /* Check if the vector page is at the high address (0xffff0000) */
+ if (vector_page == ARM_VECTORS_HIGH)
+ cpuctrl |= CPU_CONTROL_VECRELOC;
+
+ /* Clear out the cache */
+ cpu_idcache_wbinv_all();
+
+ /* Set the control register */
+ ctrl = cpuctrl;
+ cpu_control(cpuctrlmask, cpuctrl);
+
+ /* And again. */
+ cpu_idcache_wbinv_all();
+#ifdef SMP
+ armv7_auxctrl((1 << 6) | (1 << 0), (1 << 6) | (1 << 0)); /* Enable SMP + TLB broadcasting */
+#endif
+}
+#endif /* CPU_CORTEXA */
+
+
#ifdef CPU_SA110
struct cpu_option sa110_options[] = {
#ifdef COMPAT_12
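
The ARMv7 branch added to get_cachetype_cp15() above walks the Cache Level ID Register (CLIDR) three bits at a time and, for every level that reports a cache, selects that cache in CSSELR and reads its geometry from CCSIDR into the arm_cache_type[] array introduced by this patch. A rough C rendition of that walk, illustrative only: read_clidr(), write_csselr() and read_ccsidr() are hypothetical wrappers for the inline mrc/mcr instructions used in the patch, and the Ctype values are the raw CLIDR encodings from the ARM ARM rather than the kernel's CACHE_* macros.

    extern u_int arm_cache_type[14];        /* filled in by this patch */
    extern uint32_t read_clidr(void);       /* mrc p15, 1, <r>, c0, c0, 1 */
    extern void     write_csselr(uint32_t); /* mcr p15, 2, <r>, c0, c0, 0 */
    extern uint32_t read_ccsidr(void);      /* mrc p15, 1, <r>, c0, c0, 0 */

    static void
    walk_cache_levels(void)
    {
            uint32_t clidr = read_clidr();
            u_int level, type;

            for (level = 0; level < 7; level++) {
                    type = (clidr >> (3 * level)) & 0x7;
                    if (type == 0)                  /* no cache at this level */
                            break;
                    if (type >= 2) {                /* data or unified cache */
                            write_csselr(level << 1);
                            arm_cache_type[level << 1] = read_ccsidr();
                    }
                    if (type == 1 || type == 3) {   /* instruction cache */
                            write_csselr((level << 1) | 1);
                            arm_cache_type[(level << 1) | 1] = read_ccsidr();
                    }
            }
    }

The data-cache line size recorded by the patch (arm_dcache_align) comes straight out of the CCSIDR LineSize field read in this loop.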
diff --git a/sys/arm/arm/cpufunc_asm.S b/sys/arm/arm/cpufunc_asm.S
index 99b40f4..1709796 100644
--- a/sys/arm/arm/cpufunc_asm.S
+++ b/sys/arm/arm/cpufunc_asm.S
@@ -65,6 +65,10 @@ ENTRY(cpufunc_id)
mrc p15, 0, r0, c0, c0, 0
RET
+ENTRY(cpufunc_cpuid)
+ mrc p15, 0, r0, c0, c0, 0
+ RET
+
ENTRY(cpu_get_control)
mrc p15, 0, r0, c1, c0, 0
RET
diff --git a/sys/arm/arm/cpufunc_asm_arm11.S b/sys/arm/arm/cpufunc_asm_arm11.S
index 81914db..bac945bc 100644
--- a/sys/arm/arm/cpufunc_asm_arm11.S
+++ b/sys/arm/arm/cpufunc_asm_arm11.S
@@ -122,3 +122,8 @@ ENTRY(arm11_tlb_flushD_SE)
ENTRY(arm11_drain_writebuf)
mcr p15, 0, r0, c7, c10, 4 /* drain write buffer */
mov pc, lr
+
+ENTRY_NP(arm11_sleep)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c0, 4 /* wait for interrupt */
+ RET
diff --git a/sys/arm/arm/cpufunc_asm_armv7.S b/sys/arm/arm/cpufunc_asm_armv7.S
new file mode 100644
index 0000000..963aa68
--- /dev/null
+++ b/sys/arm/arm/cpufunc_asm_armv7.S
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (C) 2011 MARVELL INTERNATIONAL LTD.
+ * All rights reserved.
+ *
+ * Developed by Semihalf.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of MARVELL nor the names of contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+ .cpu cortex-a8
+
+.Lcoherency_level:
+ .word _C_LABEL(arm_cache_loc)
+.Lcache_type:
+ .word _C_LABEL(arm_cache_type)
+.Lway_mask:
+ .word 0x3ff
+.Lmax_index:
+ .word 0x7fff
+.Lpage_mask:
+ .word 0xfff
+
+#define PT_NOS (1 << 5)
+#define PT_S (1 << 1)
+#define PT_INNER_NC 0
+#define PT_INNER_WT (1 << 0)
+#define PT_INNER_WB ((1 << 0) | (1 << 6))
+#define PT_INNER_WBWA (1 << 6)
+#define PT_OUTER_NC 0
+#define PT_OUTER_WT (2 << 3)
+#define PT_OUTER_WB (3 << 3)
+#define PT_OUTER_WBWA (1 << 3)
+
+#ifdef SMP
+#define PT_ATTR (PT_S|PT_INNER_WT|PT_OUTER_WT|PT_NOS)
+#else
+#define PT_ATTR (PT_INNER_WT|PT_OUTER_WT)
+#endif
+
+ENTRY(armv7_setttb)
+ stmdb sp!, {r0, lr}
+ bl _C_LABEL(armv7_idcache_wbinv_all) /* clean the D cache */
+ ldmia sp!, {r0, lr}
+ dsb
+
+ orr r0, r0, #PT_ATTR
+ mcr p15, 0, r0, c2, c0, 0 /* Translation Table Base Register 0 (TTBR0) */
+ mcr p15, 0, r0, c8, c7, 0 /* invalidate I+D TLBs */
+ dsb
+ isb
+ RET
+
+ENTRY(armv7_tlb_flushID)
+ dsb
+#ifdef SMP
+ mcr p15, 0, r0, c8, c3, 0 /* flush I+D tlb, inner shareable */
+#else
+ mcr p15, 0, r0, c8, c7, 0 /* flush I+D tlb */
+#endif
+ mcr p15, 0, r0, c7, c5, 6 /* flush BTB */
+ dsb
+ isb
+ mov pc, lr
+
+ENTRY(armv7_tlb_flushID_SE)
+ ldr r1, .Lpage_mask
+ bic r0, r0, r1
+#ifdef SMP
+ mcr p15, 0, r0, c8, c3, 1 /* flush I+D tlb single entry, inner shareable */
+#else
+ mcr p15, 0, r0, c8, c7, 1 /* flush I+D tlb single entry */
+#endif
+ mcr p15, 0, r0, c7, c5, 6 /* flush BTB */
+ dsb
+ isb
+ mov pc, lr
+
+/* Based on algorithm from ARM Architecture Reference Manual */
+ENTRY(armv7_dcache_wbinv_all)
+ stmdb sp!, {r4, r5, r6, r7, r8, r9}
+
+ /* Get cache level */
+ ldr r0, .Lcoherency_level
+ ldr r3, [r0]
+ cmp r3, #0
+ beq Finished
+ /* For each cache level */
+ mov r8, #0
+Loop1:
+ /* Get cache type for given level */
+ mov r2, r8, lsl #2
+ add r2, r2, r2
+ ldr r0, .Lcache_type
+ ldr r1, [r0, r2]
+
+ /* Get line size */
+ and r2, r1, #7
+ add r2, r2, #4
+
+ /* Get number of ways */
+ ldr r4, .Lway_mask
+ ands r4, r4, r1, lsr #3
+ clz r5, r4
+
+ /* Get max index */
+ ldr r7, .Lmax_index
+ ands r7, r7, r1, lsr #13
+Loop2:
+ mov r9, r4
+Loop3:
+ mov r6, r8, lsl #1
+ orr r6, r6, r9, lsl r5
+ orr r6, r6, r7, lsl r2
+
+ /* Clean and invalidate data cache by way/index */
+ mcr p15, 0, r6, c7, c14, 2
+ subs r9, r9, #1
+ bge Loop3
+ subs r7, r7, #1
+ bge Loop2
+Skip:
+ add r8, r8, #1
+ cmp r3, r8
+ bne Loop1
+Finished:
+ dsb
+ ldmia sp!, {r4, r5, r6, r7, r8, r9}
+ RET
+
+ENTRY(armv7_idcache_wbinv_all)
+ stmdb sp!, {lr}
+ bl armv7_dcache_wbinv_all
+ mcr p15, 0, r0, c7, c5, 0 /* Invalidate all I caches to PoU (ICIALLU) */
+ dsb
+ isb
+ ldmia sp!, {lr}
+ RET
+
+/* XXX Temporarily set to 32 for MV cores; this value should really be
+ * read from the Cache Type register.
+ */
+.Larmv7_line_size:
+ .word 32
+
+ENTRY(armv7_dcache_wb_range)
+ ldr ip, .Larmv7_line_size
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+.Larmv7_wb_next:
+ mcr p15, 0, r0, c7, c10, 1 /* Clean D cache SE with VA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bhi .Larmv7_wb_next
+ dsb /* data synchronization barrier */
+ RET
+
+ENTRY(armv7_dcache_wbinv_range)
+ ldr ip, .Larmv7_line_size
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+.Larmv7_wbinv_next:
+ mcr p15, 0, r0, c7, c14, 1 /* Purge D cache SE with VA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bhi .Larmv7_wbinv_next
+ dsb /* data synchronization barrier */
+ RET
+
+/*
+ * Note, we must not invalidate everything. If the range is too big we
+ * must use wb-inv of the entire cache.
+ */
+ENTRY(armv7_dcache_inv_range)
+ ldr ip, .Larmv7_line_size
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+.Larmv7_inv_next:
+ mcr p15, 0, r0, c7, c6, 1 /* Invalidate D cache SE with VA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bhi .Larmv7_inv_next
+ dsb /* data synchronization barrier */
+ RET
+
+ENTRY(armv7_idcache_wbinv_range)
+ ldr ip, .Larmv7_line_size
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+.Larmv7_id_wbinv_next:
+ mcr p15, 0, r0, c7, c5, 1 /* Invalidate I cache SE with VA */
+ mcr p15, 0, r0, c7, c14, 1 /* Purge D cache SE with VA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bhi .Larmv7_id_wbinv_next
+ isb /* instruction synchronization barrier */
+ dsb /* data synchronization barrier */
+ RET
+
+ENTRY_NP(armv7_icache_sync_range)
+ ldr ip, .Larmv7_line_size
+.Larmv7_sync_next:
+ mcr p15, 0, r0, c7, c5, 1 /* Invalidate I cache SE with VA */
+ mcr p15, 0, r0, c7, c10, 1 /* Clean D cache SE with VA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bhi .Larmv7_sync_next
+ isb /* instruction synchronization barrier */
+ dsb /* data synchronization barrier */
+ RET
+
+ENTRY(armv7_cpu_sleep)
+ dsb /* data synchronization barrier */
+ wfi /* wait for interrupt */
+ RET
+
+ENTRY(armv7_context_switch)
+ dsb
+ orr r0, r0, #PT_ATTR
+
+ mcr p15, 0, r0, c2, c0, 0 /* set the new TTB */
+ mcr p15, 0, r0, c8, c7, 0 /* and flush the I+D tlbs */
+ dsb
+ isb
+ RET
+
+ENTRY(armv7_drain_writebuf)
+ dsb
+ RET
+
+ENTRY(armv7_sev)
+ dsb
+ sev
+ nop
+ RET
+
+ENTRY(armv7_auxctrl)
+ mrc p15, 0, r2, c1, c0, 1
+ bic r3, r2, r0 /* Clear bits */
+ eor r3, r3, r1 /* XOR bits */
+
+ teq r2, r3
+ mcrne p15, 0, r3, c1, c0, 1
+ mov r0, r2
+ RET
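
armv7_dcache_wbinv_all above implements the set/way clean-and-invalidate walk described in the ARM Architecture Reference Manual, iterating over the levels recorded in arm_cache_loc and arm_cache_type. The value it feeds to the DCCISW operation packs the way index at the top of the word, the set index just above the line offset, and the cache level in bits [3:1]. A rough C sketch of one level of that walk, illustrative only: dccisw() is a hypothetical wrapper for "mcr p15, 0, <r>, c7, c14, 2", and fls() is the libkern helper.

    extern void dccisw(uint32_t val);   /* mcr p15, 0, val, c7, c14, 2 */

    static void
    dcache_wbinv_level(u_int level, uint32_t ccsidr)
    {
            u_int line_shift = (ccsidr & 0x7) + 4;             /* log2(line size in bytes) */
            u_int ways = ((ccsidr >> 3) & 0x3ff) + 1;
            u_int sets = ((ccsidr >> 13) & 0x7fff) + 1;
            u_int way_shift = (ways > 1) ? 32 - fls(ways - 1) : 0;
            u_int way, set;

            for (set = 0; set < sets; set++)
                    for (way = 0; way < ways; way++)
                            dccisw((way << way_shift) |
                                (set << line_shift) | (level << 1));
    }

As the comment above armv7_dcache_inv_range notes, an invalidate-only operation must stay within the requested range; if a range is too large to walk by MVA, the fallback has to be a clean-and-invalidate of the whole cache, never a blanket invalidate, or dirty lines outside the range would be lost.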
diff --git a/sys/arm/arm/cpufunc_asm_pj4b.S b/sys/arm/arm/cpufunc_asm_pj4b.S
new file mode 100644
index 0000000..f6890d9
--- /dev/null
+++ b/sys/arm/arm/cpufunc_asm_pj4b.S
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (C) 2011 MARVELL INTERNATIONAL LTD.
+ * All rights reserved.
+ *
+ * Developed by Semihalf.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of MARVELL nor the names of contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#include <machine/param.h>
+
+.Lpj4b_cache_line_size:
+ .word _C_LABEL(arm_pdcache_line_size)
+
+ENTRY(pj4b_setttb)
+ /* Cache synchronization is not required as this core has PIPT caches */
+ mcr p15, 0, r1, c7, c10, 4 /* drain the write buffer */
+#ifdef SMP
+ orr r0, r0, #2 /* Set TTB shared memory flag */
+#endif
+ mcr p15, 0, r0, c2, c0, 0 /* load new TTB */
+ mcr p15, 0, r0, c8, c7, 0 /* invalidate I+D TLBs */
+ RET
+
+ENTRY_NP(armv6_icache_sync_all)
+ /*
+ * We assume that the code here can never be out of sync with the
+ * dcache, so that we can safely flush the Icache and fall through
+ * into the Dcache cleaning code.
+ */
+ mov r0, #0
+ mcr p15, 0, r0, c7, c5, 0 /* Invalidate ICache */
+ mcr p15, 0, r0, c7, c10, 0 /* Clean (don't invalidate) DCache */
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_icache_sync_range)
+ sub r1, r1, #1
+ add r1, r0, r1
+ mcrr p15, 0, r1, r0, c5 /* invalidate IC range */
+ mcrr p15, 0, r1, r0, c12 /* clean DC range */
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_dcache_inv_range)
+ ldr ip, .Lpj4b_cache_line_size
+ ldr ip, [ip]
+ sub r1, r1, #1 /* Don't overrun */
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+
+ mcr p15, 0, r0, c7, c10, 5 /* Data Memory Barrier err:4413 */
+1:
+ mcr p15, 0, r0, c7, c6, 1
+ add r0, r0, ip
+ subs r1, r1, ip
+ bpl 1b
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(armv6_idcache_wbinv_all)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c5, 0 /* invalidate ICache */
+ mcr p15, 0, r0, c7, c14, 0 /* clean and invalidate DCache */
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(armv6_dcache_wbinv_all)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c14, 0 /* clean and invalidate DCache */
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_idcache_wbinv_range)
+ ldr ip, .Lpj4b_cache_line_size
+ ldr ip, [ip]
+ sub r1, r1, #1 /* Don't overrun */
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+
+ mcr p15, 0, r0, c7, c10, 5 /* Data Memory Barrier err:4611 */
+1:
+#ifdef SMP
+ /* Request for ownership */
+ ldr r2, [r0]
+ str r2, [r0]
+#endif
+ mcr p15, 0, r0, c7, c5, 1
+ mcr p15, 0, r0, c7, c14, 1 /* L2C clean and invalidate entry */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bpl 1b
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_dcache_wbinv_range)
+ ldr ip, .Lpj4b_cache_line_size
+ ldr ip, [ip]
+ sub r1, r1, #1 /* Don't overrun */
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+
+ mcr p15, 0, r0, c7, c10, 5 /* Data Memory Barrier err:4611 */
+1:
+#ifdef SMP
+ /* Request for ownership */
+ ldr r2, [r0]
+ str r2, [r0]
+#endif
+ mcr p15, 0, r0, c7, c14, 1
+ add r0, r0, ip
+ subs r1, r1, ip
+ bpl 1b
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_dcache_wb_range)
+ ldr ip, .Lpj4b_cache_line_size
+ ldr ip, [ip]
+ sub r1, r1, #1 /* Don't overrun */
+ sub r3, ip, #1
+ and r2, r0, r3
+ add r1, r1, r2
+ bic r0, r0, r3
+
+ mcr p15, 0, r0, c7, c10, 5 /* Data Memory Barrier err:4611 */
+1:
+#ifdef SMP
+ /* Request for ownership */
+ ldr r2, [r0]
+ str r2, [r0]
+#endif
+ mcr p15, 0, r0, c7, c10, 1 /* L2C clean single entry by MVA */
+ add r0, r0, ip
+ subs r1, r1, ip
+ bpl 1b
+ mcr p15, 0, r0, c7, c10, 4 /* drain the write buffer */
+ RET
+
+ENTRY(pj4b_drain_readbuf)
+ mcr p15, 0, r0, c7, c5, 4 /* flush prefetch buffers */
+ RET
+
+ENTRY(pj4b_flush_brnchtgt_all)
+ mcr p15, 0, r0, c7, c5, 6 /* flush entire branch target cache */
+ RET
+
+ENTRY(pj4b_flush_brnchtgt_va)
+ mcr p15, 0, r0, c7, c5, 7 /* flush branch target cache by VA */
+ RET
+
+ENTRY(get_core_id)
+ mrc p15, 0, r0, c0, c0, 5
+ RET
+
+ENTRY(pj4b_config)
+ /* Set Auxiliary Debug Modes Control 2 register */
+ mrc p15, 1, r0, c15, c1, 2
+ bic r0, r0, #(1 << 23)
+ orr r0, r0, #(1 << 25)
+ orr r0, r0, #(1 << 27)
+ orr r0, r0, #(1 << 29)
+ orr r0, r0, #(1 << 30)
+ mcr p15, 1, r0, c15, c1, 2
+#if defined(SMP)
+ /* Set SMP mode in Auxiliary Control Register */
+ mrc p15, 0, r0, c1, c0, 1
+ orr r0, r0, #(1 << 5)
+ mcr p15, 0, r0, c1, c0, 1
+#endif
+ RET
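
The pj4b range operations above all share one shape: align the start address down to a cache-line boundary, stretch the length so the original last byte is still covered, then issue one maintenance instruction per line. Roughly, in C, illustrative only; dcache_line_op() stands in for the per-line mcr (clean, invalidate, or clean+invalidate by MVA, depending on the routine).

    extern int arm_pdcache_line_size;       /* the asm reads this via .Lpj4b_cache_line_size */
    extern void dcache_line_op(vm_offset_t va);

    static void
    dcache_op_range(vm_offset_t va, vm_size_t len)
    {
            u_int line = arm_pdcache_line_size;
            vm_offset_t eva = va + len;             /* first byte past the range */

            va &= ~(vm_offset_t)(line - 1);         /* align down to a line boundary */
            for (; va < eva; va += line)
                    dcache_line_op(va);
    }

Under SMP the routines additionally load and store the first word of each line ("request for ownership") before the maintenance instruction, pulling the line into the local cache so the operation applies to the up-to-date copy.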
diff --git a/sys/arm/arm/elf_trampoline.c b/sys/arm/arm/elf_trampoline.c
index 121bd56..c2b9eaf 100644
--- a/sys/arm/arm/elf_trampoline.c
+++ b/sys/arm/arm/elf_trampoline.c
@@ -72,15 +72,25 @@ void __startC(void);
#define cpu_idcache_wbinv_all xscale_cache_purgeID
#elif defined(CPU_XSCALE_81342)
#define cpu_idcache_wbinv_all xscalec3_cache_purgeID
+#elif defined(CPU_MV_PJ4B)
+#if !defined(SOC_MV_ARMADAXP)
+#define cpu_idcache_wbinv_all armv6_idcache_wbinv_all
+#else
+#define cpu_idcache_wbinv_all() armadaxp_idcache_wbinv_all()
#endif
+#endif /* CPU_MV_PJ4B */
#ifdef CPU_XSCALE_81342
#define cpu_l2cache_wbinv_all xscalec3_l2cache_purge
#elif defined(SOC_MV_KIRKWOOD) || defined(SOC_MV_DISCOVERY)
#define cpu_l2cache_wbinv_all sheeva_l2cache_wbinv_all
+#elif defined(CPU_CORTEXA)
+#define cpu_idcache_wbinv_all armv7_idcache_wbinv_all
+#define cpu_l2cache_wbinv_all()
#else
#define cpu_l2cache_wbinv_all()
#endif
+static void armadaxp_idcache_wbinv_all(void);
int arm_picache_size;
int arm_picache_line_size;
@@ -96,6 +106,10 @@ int arm_pcache_unified;
int arm_dcache_align;
int arm_dcache_align_mask;
+u_int arm_cache_level;
+u_int arm_cache_type[14];
+u_int arm_cache_loc;
+
/* Additional cache information local to this file. Log2 of some of the
above numbers. */
static int arm_dcache_l2_nsets;
@@ -221,8 +235,6 @@ _startC(void)
if ((cpufunc_id() & 0x0000f000) == 0x00009000)
arm9_setup();
#endif
- cpu_idcache_wbinv_all();
- cpu_l2cache_wbinv_all();
#endif
__start();
}
@@ -230,68 +242,102 @@ _startC(void)
static void
get_cachetype_cp15()
{
- u_int ctype, isize, dsize;
+ u_int ctype, isize, dsize, cpuid;
+ u_int clevel, csize, i, sel;
u_int multiplier;
+ u_char type;
__asm __volatile("mrc p15, 0, %0, c0, c0, 1"
- : "=r" (ctype));
-
+ : "=r" (ctype));
+
+ cpuid = cpufunc_id();
/*
* ...and thus spake the ARM ARM:
*
- * If an <opcode2> value corresponding to an unimplemented or
+ * If an <opcode2> value corresponding to an unimplemented or
* reserved ID register is encountered, the System Control
* processor returns the value of the main ID register.
*/
- if (ctype == cpufunc_id())
+ if (ctype == cpuid)
goto out;
-
- if ((ctype & CPU_CT_S) == 0)
- arm_pcache_unified = 1;
- /*
- * If you want to know how this code works, go read the ARM ARM.
- */
-
- arm_pcache_type = CPU_CT_CTYPE(ctype);
- if (arm_pcache_unified == 0) {
- isize = CPU_CT_ISIZE(ctype);
- multiplier = (isize & CPU_CT_xSIZE_M) ? 3 : 2;
- arm_picache_line_size = 1U << (CPU_CT_xSIZE_LEN(isize) + 3);
- if (CPU_CT_xSIZE_ASSOC(isize) == 0) {
- if (isize & CPU_CT_xSIZE_M)
- arm_picache_line_size = 0; /* not present */
+ if (CPU_CT_FORMAT(ctype) == CPU_CT_ARMV7) {
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 1"
+ : "=r" (clevel));
+ arm_cache_level = clevel;
+ arm_cache_loc = CPU_CLIDR_LOC(arm_cache_level) + 1;
+ i = 0;
+ while ((type = (clevel & 0x7)) && i < 7) {
+ if (type == CACHE_DCACHE || type == CACHE_UNI_CACHE ||
+ type == CACHE_SEP_CACHE) {
+ sel = i << 1;
+ __asm __volatile("mcr p15, 2, %0, c0, c0, 0"
+ : : "r" (sel));
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 0"
+ : "=r" (csize));
+ arm_cache_type[sel] = csize;
+ }
+ if (type == CACHE_ICACHE || type == CACHE_SEP_CACHE) {
+ sel = (i << 1) | 1;
+ __asm __volatile("mcr p15, 2, %0, c0, c0, 0"
+ : : "r" (sel));
+ __asm __volatile("mrc p15, 1, %0, c0, c0, 0"
+ : "=r" (csize));
+ arm_cache_type[sel] = csize;
+ }
+ i++;
+ clevel >>= 3;
+ }
+ } else {
+ if ((ctype & CPU_CT_S) == 0)
+ arm_pcache_unified = 1;
+
+ /*
+ * If you want to know how this code works, go read the ARM ARM.
+ */
+
+ arm_pcache_type = CPU_CT_CTYPE(ctype);
+
+ if (arm_pcache_unified == 0) {
+ isize = CPU_CT_ISIZE(ctype);
+ multiplier = (isize & CPU_CT_xSIZE_M) ? 3 : 2;
+ arm_picache_line_size = 1U << (CPU_CT_xSIZE_LEN(isize) + 3);
+ if (CPU_CT_xSIZE_ASSOC(isize) == 0) {
+ if (isize & CPU_CT_xSIZE_M)
+ arm_picache_line_size = 0; /* not present */
+ else
+ arm_picache_ways = 1;
+ } else {
+ arm_picache_ways = multiplier <<
+ (CPU_CT_xSIZE_ASSOC(isize) - 1);
+ }
+ arm_picache_size = multiplier << (CPU_CT_xSIZE_SIZE(isize) + 8);
+ }
+
+ dsize = CPU_CT_DSIZE(ctype);
+ multiplier = (dsize & CPU_CT_xSIZE_M) ? 3 : 2;
+ arm_pdcache_line_size = 1U << (CPU_CT_xSIZE_LEN(dsize) + 3);
+ if (CPU_CT_xSIZE_ASSOC(dsize) == 0) {
+ if (dsize & CPU_CT_xSIZE_M)
+ arm_pdcache_line_size = 0; /* not present */
else
- arm_picache_ways = 1;
+ arm_pdcache_ways = 1;
} else {
- arm_picache_ways = multiplier <<
- (CPU_CT_xSIZE_ASSOC(isize) - 1);
+ arm_pdcache_ways = multiplier <<
+ (CPU_CT_xSIZE_ASSOC(dsize) - 1);
}
- arm_picache_size = multiplier << (CPU_CT_xSIZE_SIZE(isize) + 8);
- }
-
- dsize = CPU_CT_DSIZE(ctype);
- multiplier = (dsize & CPU_CT_xSIZE_M) ? 3 : 2;
- arm_pdcache_line_size = 1U << (CPU_CT_xSIZE_LEN(dsize) + 3);
- if (CPU_CT_xSIZE_ASSOC(dsize) == 0) {
- if (dsize & CPU_CT_xSIZE_M)
- arm_pdcache_line_size = 0; /* not present */
- else
- arm_pdcache_ways = 1;
- } else {
- arm_pdcache_ways = multiplier <<
- (CPU_CT_xSIZE_ASSOC(dsize) - 1);
+ arm_pdcache_size = multiplier << (CPU_CT_xSIZE_SIZE(dsize) + 8);
+
+ arm_dcache_align = arm_pdcache_line_size;
+
+ arm_dcache_l2_assoc = CPU_CT_xSIZE_ASSOC(dsize) + multiplier - 2;
+ arm_dcache_l2_linesize = CPU_CT_xSIZE_LEN(dsize) + 3;
+ arm_dcache_l2_nsets = 6 + CPU_CT_xSIZE_SIZE(dsize) -
+ CPU_CT_xSIZE_ASSOC(dsize) - CPU_CT_xSIZE_LEN(dsize);
+
+ out:
+ arm_dcache_align_mask = arm_dcache_align - 1;
}
- arm_pdcache_size = multiplier << (CPU_CT_xSIZE_SIZE(dsize) + 8);
-
- arm_dcache_align = arm_pdcache_line_size;
-
- arm_dcache_l2_assoc = CPU_CT_xSIZE_ASSOC(dsize) + multiplier - 2;
- arm_dcache_l2_linesize = CPU_CT_xSIZE_LEN(dsize) + 3;
- arm_dcache_l2_nsets = 6 + CPU_CT_xSIZE_SIZE(dsize) -
- CPU_CT_xSIZE_ASSOC(dsize) - CPU_CT_xSIZE_LEN(dsize);
- out:
- arm_dcache_align_mask = arm_dcache_align - 1;
}
static void
@@ -306,7 +352,18 @@ arm9_setup(void)
arm9_dcache_index_max = 0U - arm9_dcache_index_inc;
}
+static void
+armadaxp_idcache_wbinv_all(void)
+{
+ uint32_t feat;
+ __asm __volatile("mrc p15, 0, %0, c0, c1, 0" : "=r" (feat));
+ if (feat & ARM_PFR0_THUMBEE_MASK)
+ armv7_idcache_wbinv_all();
+ else
+ armv6_idcache_wbinv_all();
+
+}
#ifdef KZIP
static unsigned char *orig_input, *i_input, *i_output;
diff --git a/sys/arm/arm/fusu.S b/sys/arm/arm/fusu.S
index 02e4870..edf1a63 100644
--- a/sys/arm/arm/fusu.S
+++ b/sys/arm/arm/fusu.S
@@ -39,12 +39,15 @@
#include "assym.s"
__FBSDID("$FreeBSD$");
-#ifdef MULTIPROCESSOR
-.Lcpu_info:
- .word _C_LABEL(cpu_info)
+#ifdef _ARM_ARCH_6
+#define GET_PCB(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4; \
+ add tmp, tmp, #(PC_CURPCB)
#else
.Lcurpcb:
.word _C_LABEL(__pcpu) + PC_CURPCB
+#define GET_PCB(tmp) \
+ ldr tmp, .Lcurpcb
#endif
/*
@@ -54,18 +57,8 @@ __FBSDID("$FreeBSD$");
ENTRY_NP(casuword32)
ENTRY(casuword)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r14}
-#else
- ldr r3, .Lcurpcb
+ GET_PCB(r3)
ldr r3, [r3]
-#endif
#ifdef DIAGNOSTIC
teq r3, #0x00000000
@@ -101,18 +94,8 @@ ENTRY(casuword)
ENTRY_NP(fuword32)
ENTRY(fuword)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -135,18 +118,8 @@ ENTRY(fuword)
*/
ENTRY(fusword)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -180,18 +153,8 @@ ENTRY(fuswintr)
mvnne r0, #0x00000000
RETne
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -229,18 +192,8 @@ _C_LABEL(block_userspace_access):
*/
ENTRY(fubyte)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -303,18 +256,8 @@ fusupcbfaulttext:
ENTRY_NP(suword32)
ENTRY(suword)
-#ifdef MULTIPROCESSOR
- /* XXX Probably not appropriate for non-Hydra SMPs */
- stmfd sp!, {r0, r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -343,17 +286,8 @@ ENTRY(suswintr)
mvnne r0, #0x00000000
RETne
-#ifdef MULTIPROCESSOR
- stmfd sp!, {r0, r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -382,17 +316,8 @@ ENTRY(suswintr)
*/
ENTRY(susword)
-#ifdef MULTIPROCESSOR
- stmfd sp!, {r0, r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
teq r2, #0x00000000
@@ -421,17 +346,8 @@ ENTRY(susword)
*/
ENTRY(subyte)
-#ifdef MULTIPROCESSOR
- stmfd sp!, {r0, r1, r14}
- bl _C_LABEL(cpu_number)
- ldr r2, .Lcpu_info
- ldr r2, [r2, r0, lsl #2]
- ldr r2, [r2, #CI_CURPCB]
- ldmfd sp!, {r0, r1, r14}
-#else
- ldr r2, .Lcurpcb
+ GET_PCB(r2)
ldr r2, [r2]
-#endif
#ifdef DIAGNOSTIC
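
The GET_PCB() macro introduced in this file replaces the old MULTIPROCESSOR path that called cpu_number() and indexed cpu_info[]: on ARMv6 and later the per-CPU data pointer lives in the CP15 software thread ID register TPIDRPRW (c13, c0, 4), so the macro just reads that register and adds the PC_CURPCB offset. In C terms the lookup is roughly the following, illustrative only; get_tpidrprw() is a hypothetical wrapper for the mrc instruction.

    extern struct pcpu *get_tpidrprw(void);     /* mrc p15, 0, <r>, c13, c0, 4 */

    static struct pcb *
    get_curpcb(void)
    {
            struct pcpu *pc = get_tpidrprw();   /* per-CPU area, stored at CPU start-up */

            return (pc->pc_curpcb);             /* the asm adds PC_CURPCB and loads through it */
    }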
diff --git a/sys/arm/arm/genassym.c b/sys/arm/arm/genassym.c
index 40b2988..4b8f4cc 100644
--- a/sys/arm/arm/genassym.c
+++ b/sys/arm/arm/genassym.c
@@ -34,7 +34,9 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/pmap.h>
+#include <vm/vm_map.h>
#include <machine/vmparam.h>
#include <machine/armreg.h>
#include <machine/pcb.h>
@@ -105,9 +107,22 @@ ASSYM(TF_PC, offsetof(struct trapframe, tf_pc));
ASSYM(P_PID, offsetof(struct proc, p_pid));
ASSYM(P_FLAG, offsetof(struct proc, p_flag));
+#ifdef ARM_TP_ADDRESS
ASSYM(ARM_TP_ADDRESS, ARM_TP_ADDRESS);
ASSYM(ARM_RAS_START, ARM_RAS_START);
ASSYM(ARM_RAS_END, ARM_RAS_END);
+#endif
+
+#ifdef ARM_VFP_SUPPORT
+ASSYM(PCB_VFPSTATE, offsetof(struct pcb, pcb_vfpstate));
+ASSYM(PCB_VFPCPU, offsetof(struct pcb, pcb_vfpcpu));
+
+ASSYM(PC_VFPCTHREAD, offsetof(struct pcpu, pc_vfpcthread));
+ASSYM(PC_CPU, offsetof(struct pcpu, pc_cpu));
+
+ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
+#endif
+
ASSYM(PAGE_SIZE, PAGE_SIZE);
ASSYM(PDESIZE, PDESIZE);
ASSYM(PMAP_DOMAIN_KERNEL, PMAP_DOMAIN_KERNEL);
diff --git a/sys/arm/arm/gic.c b/sys/arm/arm/gic.c
new file mode 100644
index 0000000..ba25c1d
--- /dev/null
+++ b/sys/arm/arm/gic.c
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Developed by Damjan Marion <damjan.marion@gmail.com>
+ *
+ * Based on OMAP4 GIC code by Ben Gray
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company nor the name of the author may be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/rman.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/cpuset.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <machine/bus.h>
+#include <machine/intr.h>
+#include <machine/smp.h>
+
+#include <dev/fdt/fdt_common.h>
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+/* We are using GICv2 register naming */
+
+/* Distributor Registers */
+#define GICD_CTLR 0x000 /* v1 ICDDCR */
+#define GICD_TYPER 0x004 /* v1 ICDICTR */
+#define GICD_IIDR 0x008 /* v1 ICDIIDR */
+#define GICD_IGROUPR(n) (0x0080 + ((n) * 4)) /* v1 ICDISER */
+#define GICD_ISENABLER(n) (0x0100 + ((n) * 4)) /* v1 ICDISER */
+#define GICD_ICENABLER(n) (0x0180 + ((n) * 4)) /* v1 ICDICER */
+#define GICD_ISPENDR(n) (0x0200 + ((n) * 4)) /* v1 ICDISPR */
+#define GICD_ICPENDR(n) (0x0280 + ((n) * 4)) /* v1 ICDICPR */
+#define GICD_ICACTIVER(n) (0x0380 + ((n) * 4)) /* v1 ICDABR */
+#define GICD_IPRIORITYR(n) (0x0400 + ((n) * 4)) /* v1 ICDIPR */
+#define GICD_ITARGETSR(n) (0x0800 + ((n) * 4)) /* v1 ICDIPTR */
+#define GICD_ICFGR(n) (0x0C00 + ((n) * 4)) /* v1 ICDICFR */
+#define GICD_SGIR(n) (0x0F00 + ((n) * 4)) /* v1 ICDSGIR */
+
+/* CPU Registers */
+#define GICC_CTLR 0x0000 /* v1 ICCICR */
+#define GICC_PMR 0x0004 /* v1 ICCPMR */
+#define GICC_BPR 0x0008 /* v1 ICCBPR */
+#define GICC_IAR 0x000C /* v1 ICCIAR */
+#define GICC_EOIR 0x0010 /* v1 ICCEOIR */
+#define GICC_RPR 0x0014 /* v1 ICCRPR */
+#define GICC_HPPIR 0x0018 /* v1 ICCHPIR */
+#define GICC_ABPR 0x001C /* v1 ICCABPR */
+#define GICC_IIDR 0x00FC /* v1 ICCIIDR*/
+
+struct arm_gic_softc {
+ struct resource * gic_res[3];
+ bus_space_tag_t gic_c_bst;
+ bus_space_tag_t gic_d_bst;
+ bus_space_handle_t gic_c_bsh;
+ bus_space_handle_t gic_d_bsh;
+ uint8_t ver;
+};
+
+static struct resource_spec arm_gic_spec[] = {
+ { SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Distributor registers */
+ { SYS_RES_MEMORY, 1, RF_ACTIVE }, /* CPU Interrupt Intf. registers */
+ { -1, 0 }
+};
+
+static struct arm_gic_softc *arm_gic_sc = NULL;
+
+#define gic_c_read_4(reg) \
+ bus_space_read_4(arm_gic_sc->gic_c_bst, arm_gic_sc->gic_c_bsh, reg)
+#define gic_c_write_4(reg, val) \
+ bus_space_write_4(arm_gic_sc->gic_c_bst, arm_gic_sc->gic_c_bsh, reg, val)
+#define gic_d_read_4(reg) \
+ bus_space_read_4(arm_gic_sc->gic_d_bst, arm_gic_sc->gic_d_bsh, reg)
+#define gic_d_write_4(reg, val) \
+ bus_space_write_4(arm_gic_sc->gic_d_bst, arm_gic_sc->gic_d_bsh, reg, val)
+
+static void gic_post_filter(void *);
+
+static int
+arm_gic_probe(device_t dev)
+{
+ if (!ofw_bus_is_compatible(dev, "arm,gic"))
+ return (ENXIO);
+ device_set_desc(dev, "ARM Generic Interrupt Controller");
+ return (BUS_PROBE_DEFAULT);
+}
+
+void
+gic_init_secondary(void)
+{
+ int nirqs;
+
+ /* Get the number of interrupts */
+ nirqs = gic_d_read_4(GICD_TYPER);
+ nirqs = 32 * ((nirqs & 0x1f) + 1);
+
+ for (int i = 0; i < nirqs; i += 4)
+ gic_d_write_4(GICD_IPRIORITYR(i >> 2), 0);
+ /* Enable CPU interface */
+ gic_c_write_4(GICC_CTLR, 1);
+
+ /* Enable interrupt distribution */
+ gic_d_write_4(GICD_CTLR, 0x01);
+
+ /* Activate IRQ 29, i.e. the private timer IRQ */
+ gic_d_write_4(GICD_ISENABLER(29 >> 5), (1UL << (29 & 0x1F)));
+}
+
+static int
+arm_gic_attach(device_t dev)
+{
+ struct arm_gic_softc *sc = device_get_softc(dev);
+ int i;
+ uint32_t icciidr;
+ uint32_t nirqs;
+
+ if (arm_gic_sc)
+ return (ENXIO);
+
+ if (bus_alloc_resources(dev, arm_gic_spec, sc->gic_res)) {
+ device_printf(dev, "could not allocate resources\n");
+ return (ENXIO);
+ }
+
+ arm_post_filter = gic_post_filter;
+
+ /* Distributor Interface */
+ sc->gic_d_bst = rman_get_bustag(sc->gic_res[0]);
+ sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[0]);
+
+ /* CPU Interface */
+ sc->gic_c_bst = rman_get_bustag(sc->gic_res[1]);
+ sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[1]);
+
+ arm_gic_sc = sc;
+
+ /* Disable interrupt forwarding to the CPU interface */
+ gic_d_write_4(GICD_CTLR, 0x00);
+
+ /* Get the number of interrupts */
+ nirqs = gic_d_read_4(GICD_TYPER);
+ nirqs = 32 * ((nirqs & 0x1f) + 1);
+
+ icciidr = gic_c_read_4(GICC_IIDR);
+ device_printf(dev,"pn 0x%x, arch 0x%x, rev 0x%x, implementer 0x%x nirqs %u\n",
+ icciidr>>20, (icciidr>>16) & 0xF, (icciidr>>12) & 0xf,
+ (icciidr & 0xfff), nirqs);
+
+ /* Set all global interrupts to be level triggered, active low. */
+ for (i = 32; i < nirqs; i += 32) {
+ gic_d_write_4(GICD_ICFGR(i >> 5), 0x00000000);
+ }
+
+ /* Disable all interrupts. */
+ for (i = 32; i < nirqs; i += 32) {
+ gic_d_write_4(GICD_ICENABLER(i >> 5), 0xFFFFFFFF);
+ }
+
+ for (i = 0; i < nirqs; i += 4) {
+ gic_d_write_4(GICD_IPRIORITYR(i >> 2), 0);
+ gic_d_write_4(GICD_ITARGETSR(i >> 2), 0xffffffff);
+ }
+
+ /* Enable CPU interface */
+ gic_c_write_4(GICC_CTLR, 1);
+
+ /* Enable interrupt distribution */
+ gic_d_write_4(GICD_CTLR, 0x01);
+
+ return (0);
+}
+
+static device_method_t arm_gic_methods[] = {
+ DEVMETHOD(device_probe, arm_gic_probe),
+ DEVMETHOD(device_attach, arm_gic_attach),
+ { 0, 0 }
+};
+
+static driver_t arm_gic_driver = {
+ "gic",
+ arm_gic_methods,
+ sizeof(struct arm_gic_softc),
+};
+
+static devclass_t arm_gic_devclass;
+
+DRIVER_MODULE(gic, simplebus, arm_gic_driver, arm_gic_devclass, 0, 0);
+
+static void
+gic_post_filter(void *arg)
+{
+ uintptr_t irq = (uintptr_t) arg;
+
+ gic_c_write_4(GICC_EOIR, irq);
+}
+
+int
+arm_get_next_irq(int last_irq)
+{
+ uint32_t active_irq;
+
+ active_irq = gic_c_read_4(GICC_IAR);
+
+ /*
+ * Immediately EOIR the SGIs, because doing so requires the other
+ * bits (i.e. the CPU number), not just the IRQ number, and we do not
+ * have this information later.
+ */
+
+ if ((active_irq & 0x3ff) < 16)
+ gic_c_write_4(GICC_EOIR, active_irq);
+ active_irq &= 0x3FF;
+
+ if (active_irq == 0x3FF) {
+ if (last_irq == -1)
+ printf("Spurious interrupt detected [0x%08x]\n", active_irq);
+ return -1;
+ }
+ gic_c_write_4(GICC_EOIR, active_irq);
+
+ return active_irq;
+}
+
+void
+arm_mask_irq(uintptr_t nb)
+{
+ gic_d_write_4(GICD_ICENABLER(nb >> 5), (1UL << (nb & 0x1F)));
+}
+
+void
+arm_unmask_irq(uintptr_t nb)
+{
+
+ gic_c_write_4(GICC_EOIR, nb);
+ gic_d_write_4(GICD_ISENABLER(nb >> 5), (1UL << (nb & 0x1F)));
+}
+
+#ifdef SMP
+void
+pic_ipi_send(cpuset_t cpus, u_int ipi)
+{
+ uint32_t val = 0, i;
+
+ for (i = 0; i < MAXCPU; i++)
+ if (CPU_ISSET(i, &cpus))
+ val |= 1 << (16 + i);
+ gic_d_write_4(GICD_SGIR(0), val | ipi);
+
+}
+
+int
+pic_ipi_get(int i)
+{
+
+ if (i != -1) {
+ /*
+ * The intr code will automagically give the frame pointer
+ * if the interrupt argument is 0.
+ */
+ if ((unsigned int)i > 16)
+ return (0);
+ return (i);
+ }
+ return (0x3ff);
+}
+
+void
+pic_ipi_clear(int ipi)
+{
+}
+#endif
+
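arm_mask_irq() and arm_unmask_irq() above rely on the GIC's write-one semantics: writing a 1 to a bit of GICD_ISENABLER enables the corresponding interrupt and writing a 1 to the matching bit of GICD_ICENABLER disables it, so no read-modify-write of the enable state is needed. The same indexing, written out in C with the gic_d_write_4() macro defined above, illustrative only:

    static void
    gic_irq_set_enable(u_int irq, int enable)
    {
            u_int reg = irq >> 5;                   /* 32 interrupt bits per register */
            uint32_t bit = 1UL << (irq & 0x1f);

            if (enable)
                    gic_d_write_4(GICD_ISENABLER(reg), bit);    /* write 1 to enable */
            else
                    gic_d_write_4(GICD_ICENABLER(reg), bit);    /* write 1 to disable */
    }

Similarly, pic_ipi_send() places the target CPU mask in bits [23:16] of GICD_SGIR and the SGI number in bits [3:0], and arm_get_next_irq() acknowledges SGIs immediately because the EOI for an SGI needs the extra source-CPU bits from the IAR value, not just the interrupt number.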
diff --git a/sys/arm/arm/identcpu.c b/sys/arm/arm/identcpu.c
index f996f1a..94e2707 100644
--- a/sys/arm/arm/identcpu.c
+++ b/sys/arm/arm/identcpu.c
@@ -236,6 +236,17 @@ const struct cpuidtab cpuids[] = {
{ CPU_ID_ARM1026EJS, CPU_CLASS_ARM10EJ, "ARM1026EJ-S",
generic_steppings },
+ { CPU_ID_CORTEXA8R1, CPU_CLASS_CORTEXA, "Cortex A8-r1",
+ generic_steppings },
+ { CPU_ID_CORTEXA8R2, CPU_CLASS_CORTEXA, "Cortex A8-r2",
+ generic_steppings },
+ { CPU_ID_CORTEXA8R3, CPU_CLASS_CORTEXA, "Cortex A8-r3",
+ generic_steppings },
+ { CPU_ID_CORTEXA9R1, CPU_CLASS_CORTEXA, "Cortex A9-r1",
+ generic_steppings },
+ { CPU_ID_CORTEXA9R2, CPU_CLASS_CORTEXA, "Cortex A9-r2",
+ generic_steppings },
+
{ CPU_ID_SA110, CPU_CLASS_SA1, "SA-110",
sa110_steppings },
{ CPU_ID_SA1100, CPU_CLASS_SA1, "SA-1100",
@@ -302,8 +313,17 @@ const struct cpuidtab cpuids[] = {
{ CPU_ID_MV88FR571_VD, CPU_CLASS_MARVELL, "Feroceon 88FR571-VD",
generic_steppings },
-
- { CPU_ID_MV88FR571_41, CPU_CLASS_MARVELL, "Early Feroceon 88FR571",
+ { CPU_ID_MV88SV581X_V6, CPU_CLASS_MARVELL, "Sheeva 88SV581x",
+ generic_steppings },
+ { CPU_ID_ARM_88SV581X_V6, CPU_CLASS_MARVELL, "Sheeva 88SV581x",
+ generic_steppings },
+ { CPU_ID_MV88SV581X_V7, CPU_CLASS_MARVELL, "Sheeva 88SV581x",
+ generic_steppings },
+ { CPU_ID_ARM_88SV581X_V7, CPU_CLASS_MARVELL, "Sheeva 88SV581x",
+ generic_steppings },
+ { CPU_ID_MV88SV584X, CPU_CLASS_MARVELL, "Sheeva 88SV584x",
+ generic_steppings },
+ { CPU_ID_ARM_88SV584X, CPU_CLASS_MARVELL, "Sheeva 88SV584x",
generic_steppings },
{ 0, CPU_CLASS_NONE, NULL, NULL }
@@ -328,6 +348,7 @@ const struct cpu_classtab cpu_classes[] = {
{ "ARM9EJ-S", "CPU_ARM9E" }, /* CPU_CLASS_ARM9EJS */
{ "ARM10E", "CPU_ARM10" }, /* CPU_CLASS_ARM10E */
{ "ARM10EJ", "CPU_ARM10" }, /* CPU_CLASS_ARM10EJ */
+ { "Cortex-A", "CPU_CORTEXA" }, /* CPU_CLASS_CORTEXA */
{ "SA-1", "CPU_SA110" }, /* CPU_CLASS_SA1 */
{ "XScale", "CPU_XSCALE_..." }, /* CPU_CLASS_XSCALE */
{ "ARM11J", "CPU_ARM11" }, /* CPU_CLASS_ARM11J */
@@ -359,13 +380,81 @@ static const char * const wtnames[] = {
"**unknown 15**",
};
+static void
+print_enadis(int enadis, char *s)
+{
+
+ printf(" %s %sabled", s, (enadis == 0) ? "dis" : "en");
+}
extern int ctrl;
enum cpu_class cpu_class = CPU_CLASS_NONE;
+
+u_int cpu_pfr(int num)
+{
+ u_int feat;
+
+ switch (num) {
+ case 0:
+ __asm __volatile("mrc p15, 0, %0, c0, c1, 0"
+ : "=r" (feat));
+ break;
+ case 1:
+ __asm __volatile("mrc p15, 0, %0, c0, c1, 1"
+ : "=r" (feat));
+ break;
+ default:
+ panic("Processor Feature Register %d not implemented", num);
+ break;
+ }
+
+ return (feat);
+}
+
+static void
+identify_armv7(void)
+{
+ u_int feature;
+
+ printf("Supported features:");
+ /* Get Processor Feature Register 0 */
+ feature = cpu_pfr(0);
+
+ if (feature & ARM_PFR0_ARM_ISA_MASK)
+ printf(" ARM_ISA");
+
+ if (feature & ARM_PFR0_THUMB2)
+ printf(" THUMB2");
+ else if (feature & ARM_PFR0_THUMB)
+ printf(" THUMB");
+
+ if (feature & ARM_PFR0_JAZELLE_MASK)
+ printf(" JAZELLE");
+
+ if (feature & ARM_PFR0_THUMBEE_MASK)
+ printf(" THUMBEE");
+
+
+ /* Get Processor Feature Register 1 */
+ feature = cpu_pfr(1);
+
+ if (feature & ARM_PFR1_ARMV4_MASK)
+ printf(" ARMv4");
+
+ if (feature & ARM_PFR1_SEC_EXT_MASK)
+ printf(" Security_Ext");
+
+ if (feature & ARM_PFR1_MICROCTRL_MASK)
+ printf(" M_profile");
+
+ printf("\n");
+}
+
void
identify_arm_cpu(void)
{
- u_int cpuid;
+ u_int cpuid, reg, size, sets, ways;
+ u_int8_t type, linesize;
int i;
cpuid = cpu_id();
@@ -389,74 +478,130 @@ identify_arm_cpu(void)
printf("unknown CPU (ID = 0x%x)\n", cpuid);
printf(" ");
- switch (cpu_class) {
- case CPU_CLASS_ARM6:
- case CPU_CLASS_ARM7:
- case CPU_CLASS_ARM7TDMI:
- case CPU_CLASS_ARM8:
- if ((ctrl & CPU_CONTROL_IDC_ENABLE) == 0)
- printf(" IDC disabled");
- else
- printf(" IDC enabled");
- break;
- case CPU_CLASS_ARM9TDMI:
- case CPU_CLASS_ARM9ES:
- case CPU_CLASS_ARM9EJS:
- case CPU_CLASS_ARM10E:
- case CPU_CLASS_ARM10EJ:
- case CPU_CLASS_SA1:
- case CPU_CLASS_XSCALE:
- case CPU_CLASS_ARM11J:
- case CPU_CLASS_MARVELL:
- if ((ctrl & CPU_CONTROL_DC_ENABLE) == 0)
- printf(" DC disabled");
- else
- printf(" DC enabled");
- if ((ctrl & CPU_CONTROL_IC_ENABLE) == 0)
- printf(" IC disabled");
+
+ if ((cpuid & CPU_ID_ARCH_MASK) == CPU_ID_CPUID_SCHEME) {
+ identify_armv7();
+ } else {
+ if (ctrl & CPU_CONTROL_BEND_ENABLE)
+ printf(" Big-endian");
else
- printf(" IC enabled");
+ printf(" Little-endian");
+
+ switch (cpu_class) {
+ case CPU_CLASS_ARM6:
+ case CPU_CLASS_ARM7:
+ case CPU_CLASS_ARM7TDMI:
+ case CPU_CLASS_ARM8:
+ print_enadis(ctrl & CPU_CONTROL_IDC_ENABLE, "IDC");
+ break;
+ case CPU_CLASS_ARM9TDMI:
+ case CPU_CLASS_ARM9ES:
+ case CPU_CLASS_ARM9EJS:
+ case CPU_CLASS_ARM10E:
+ case CPU_CLASS_ARM10EJ:
+ case CPU_CLASS_SA1:
+ case CPU_CLASS_XSCALE:
+ case CPU_CLASS_ARM11J:
+ case CPU_CLASS_MARVELL:
+ print_enadis(ctrl & CPU_CONTROL_DC_ENABLE, "DC");
+ print_enadis(ctrl & CPU_CONTROL_IC_ENABLE, "IC");
#ifdef CPU_XSCALE_81342
- if ((ctrl & CPU_CONTROL_L2_ENABLE) == 0)
- printf(" L2 disabled");
- else
- printf(" L2 enabled");
+ print_enadis(ctrl & CPU_CONTROL_L2_ENABLE, "L2");
#endif
- break;
- default:
- break;
+#if defined(SOC_MV_KIRKWOOD) || defined(SOC_MV_DISCOVERY)
+ i = sheeva_control_ext(0, 0);
+ print_enadis(i & MV_WA_ENABLE, "WA");
+ print_enadis(i & MV_DC_STREAM_ENABLE, "DC streaming");
+ printf("\n ");
+ print_enadis((i & MV_BTB_DISABLE) == 0, "BTB");
+ print_enadis(i & MV_L2_ENABLE, "L2");
+ print_enadis((i & MV_L2_PREFETCH_DISABLE) == 0,
+ "L2 prefetch");
+ printf("\n ");
+#endif
+ break;
+ default:
+ break;
+ }
}
- if ((ctrl & CPU_CONTROL_WBUF_ENABLE) == 0)
- printf(" WB disabled");
- else
- printf(" WB enabled");
+ print_enadis(ctrl & CPU_CONTROL_WBUF_ENABLE, "WB");
if (ctrl & CPU_CONTROL_LABT_ENABLE)
printf(" LABT");
else
printf(" EABT");
- if (ctrl & CPU_CONTROL_BPRD_ENABLE)
- printf(" branch prediction enabled");
-
+ print_enadis(ctrl & CPU_CONTROL_BPRD_ENABLE, "branch prediction");
printf("\n");
- /* Print cache info. */
- if (arm_picache_line_size == 0 && arm_pdcache_line_size == 0)
- return;
-
- if (arm_pcache_unified) {
- printf(" %dKB/%dB %d-way %s unified cache\n",
- arm_pdcache_size / 1024,
- arm_pdcache_line_size, arm_pdcache_ways,
- wtnames[arm_pcache_type]);
+
+ if (arm_cache_level) {
+ printf("LoUU:%d LoC:%d LoUIS:%d \n", CPU_CLIDR_LOUU(arm_cache_level) + 1,
+ arm_cache_loc, CPU_CLIDR_LOUIS(arm_cache_level) + 1);
+ i = 0;
+ while (((type = CPU_CLIDR_CTYPE(arm_cache_level, i)) != 0) && i < 7) {
+ printf("Cache level %d: \n", i + 1);
+ if (type == CACHE_DCACHE || type == CACHE_UNI_CACHE ||
+ type == CACHE_SEP_CACHE) {
+ reg = arm_cache_type[2 * i];
+ ways = CPUV7_CT_xSIZE_ASSOC(reg) + 1;
+ sets = CPUV7_CT_xSIZE_SET(reg) + 1;
+ linesize = 1 << (CPUV7_CT_xSIZE_LEN(reg) + 4);
+ size = (ways * sets * linesize) / 1024;
+
+ if (type == CACHE_UNI_CACHE)
+ printf(" %dKB/%dB %d-way unified cache", size, linesize,ways);
+ else
+ printf(" %dKB/%dB %d-way data cache", size, linesize, ways);
+ if (reg & CPUV7_CT_CTYPE_WT)
+ printf(" WT");
+ if (reg & CPUV7_CT_CTYPE_WB)
+ printf(" WB");
+ if (reg & CPUV7_CT_CTYPE_RA)
+ printf(" Read-Alloc");
+ if (reg & CPUV7_CT_CTYPE_WA)
+ printf(" Write-Alloc");
+ printf("\n");
+ }
+
+ if (type == CACHE_ICACHE || type == CACHE_SEP_CACHE) {
+ reg = arm_cache_type[(2 * i) + 1];
+
+ ways = CPUV7_CT_xSIZE_ASSOC(reg) + 1;
+ sets = CPUV7_CT_xSIZE_SET(reg) + 1;
+ linesize = 1 << (CPUV7_CT_xSIZE_LEN(reg) + 4);
+ size = (ways * sets * linesize) / 1024;
+
+ printf(" %dKB/%dB %d-way instruction cache", size, linesize, ways);
+ if (reg & CPUV7_CT_CTYPE_WT)
+ printf(" WT");
+ if (reg & CPUV7_CT_CTYPE_WB)
+ printf(" WB");
+ if (reg & CPUV7_CT_CTYPE_RA)
+ printf(" Read-Alloc");
+ if (reg & CPUV7_CT_CTYPE_WA)
+ printf(" Write-Alloc");
+ printf("\n");
+ }
+ i++;
+ }
} else {
- printf(" %dKB/%dB %d-way Instruction cache\n",
- arm_picache_size / 1024,
- arm_picache_line_size, arm_picache_ways);
- printf(" %dKB/%dB %d-way %s Data cache\n",
- arm_pdcache_size / 1024,
- arm_pdcache_line_size, arm_pdcache_ways,
- wtnames[arm_pcache_type]);
+ /* Print cache info. */
+ if (arm_picache_line_size == 0 && arm_pdcache_line_size == 0)
+ return;
+
+ if (arm_pcache_unified) {
+ printf(" %dKB/%dB %d-way %s unified cache\n",
+ arm_pdcache_size / 1024,
+ arm_pdcache_line_size, arm_pdcache_ways,
+ wtnames[arm_pcache_type]);
+ } else {
+ printf(" %dKB/%dB %d-way instruction cache\n",
+ arm_picache_size / 1024,
+ arm_picache_line_size, arm_picache_ways);
+ printf(" %dKB/%dB %d-way %s data cache\n",
+ arm_pdcache_size / 1024,
+ arm_pdcache_line_size, arm_pdcache_ways,
+ wtnames[arm_pcache_type]);
+ }
}
}
-
diff --git a/sys/arm/arm/locore.S b/sys/arm/arm/locore.S
index b6e049e..e81912c 100644
--- a/sys/arm/arm/locore.S
+++ b/sys/arm/arm/locore.S
@@ -1,6 +1,7 @@
/* $NetBSD: locore.S,v 1.14 2003/04/20 16:21:40 thorpej Exp $ */
/*-
+ * Copyright 2011 Semihalf
* Copyright (C) 1994-1997 Mark Brinicombe
* Copyright (C) 1994 Brini
* All rights reserved.
@@ -41,7 +42,7 @@
__FBSDID("$FreeBSD$");
/* What size should this really be ? It is only used by initarm() */
-#define INIT_ARM_STACK_SIZE 2048
+#define INIT_ARM_STACK_SIZE (2048 * 4)
#define CPWAIT_BRANCH \
sub pc, pc, #4
@@ -161,15 +162,26 @@ Lunmapped:
orrne r5, r5, #PHYSADDR
movne pc, r5
+#if defined(SMP)
+ orr r0, r0, #2 /* Set TTB shared memory flag */
+#endif
mcr p15, 0, r0, c2, c0, 0 /* Set TTB */
mcr p15, 0, r0, c8, c7, 0 /* Flush TLB */
+#if defined(CPU_ARM11) || defined(CPU_CORTEXA) || defined(CPU_MV_PJ4B)
+ mov r0, #0
+ mcr p15, 0, r0, c13, c0, 1 /* Set ASID to 0 */
+#endif
+
/* Set the Domain Access register. Very important! */
mov r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT)
mcr p15, 0, r0, c3, c0, 0
/* Enable MMU */
mrc p15, 0, r0, c1, c0, 0
- orr r0, r0, #CPU_CONTROL_MMU_ENABLE
+#if defined(CPU_ARM11) || defined(CPU_CORTEXA) || defined(CPU_MV_PJ4B)
+ orr r0, r0, #CPU_CONTROL_V6_EXTPAGE
+#endif
+ orr r0, r0, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE)
mcr p15, 0, r0, c1, c0, 0
nop
nop
@@ -225,13 +237,23 @@ Lend:
.word _edata
Lstartup_pagetable:
.word STARTUP_PAGETABLE_ADDR
+#ifdef SMP
+Lstartup_pagetable_secondary:
+ .word temp_pagetable
+#endif
mmu_init_table:
/* fill all table VA==PA */
/* map SDRAM VA==PA, WT cacheable */
+#if !defined(SMP)
MMU_INIT(PHYSADDR, PHYSADDR , 64, L1_TYPE_S|L1_S_C|L1_S_AP(AP_KRW))
/* map VA 0xc0000000..0xc3ffffff to PA */
MMU_INIT(KERNBASE, PHYSADDR, 64, L1_TYPE_S|L1_S_C|L1_S_AP(AP_KRW))
-
+#else
+ MMU_INIT(PHYSADDR, PHYSADDR , 64, L1_TYPE_S|L1_SHARED|L1_S_C|L1_S_AP(AP_KRW))
+ /* map VA 0xc0000000..0xc3ffffff to PA */
+ MMU_INIT(KERNBASE, PHYSADDR, 64, L1_TYPE_S|L1_SHARED|L1_S_C|L1_S_AP(AP_KRW))
+ MMU_INIT(0x48000000, 0x48000000, 1, L1_TYPE_S|L1_SHARED|L1_S_C|L1_S_AP(AP_KRW))
+#endif
.word 0 /* end of table */
#endif
.Lstart:
@@ -241,6 +263,11 @@ mmu_init_table:
.Lvirt_done:
.word virt_done
+#if defined(SMP)
+.Lmpvirt_done:
+ .word mpvirt_done
+#endif
+
.Lmainreturned:
.asciz "main() returned"
.align 0
@@ -255,6 +282,133 @@ svcstk:
.Lcpufuncs:
.word _C_LABEL(cpufuncs)
+#if defined(SMP)
+Lsramaddr:
+ .word 0xffff0080
+
+#if 0
+#define AP_DEBUG(tmp) \
+ mrc p15, 0, r1, c0, c0, 5; \
+ ldr r0, Lsramaddr; \
+ add r0, r1, lsl #2; \
+ mov r1, tmp; \
+ str r1, [r0], #0x0000;
+#else
+#define AP_DEBUG(tmp)
+#endif
+
+
+ASENTRY_NP(mptramp)
+ mov r0, #0
+ mcr p15, 0, r0, c7, c7, 0
+
+ AP_DEBUG(#1)
+
+ mrs r3, cpsr_all
+ bic r3, r3, #(PSR_MODE)
+ orr r3, r3, #(PSR_SVC32_MODE)
+ msr cpsr_all, r3
+
+ mrc p15, 0, r0, c0, c0, 5
+ and r0, #0x0f /* Get CPU ID */
+
+ /* Read boot address for CPU */
+ mov r1, #0x100
+ mul r2, r0, r1
+ ldr r1, Lpmureg
+ add r0, r2, r1
+ ldr r1, [r0], #0x00
+
+ mov pc, r1
+
+Lpmureg:
+ .word 0xd0022124
+
+ASENTRY_NP(mpentry)
+
+ AP_DEBUG(#2)
+
+ /* Make sure interrupts are disabled. */
+ mrs r7, cpsr
+ orr r7, r7, #(I32_bit|F32_bit)
+ msr cpsr_c, r7
+
+
+ adr r7, Ltag
+ bic r7, r7, #0xf0000000
+ orr r7, r7, #PHYSADDR
+
+ /* Disable MMU for a while */
+ mrc p15, 0, r2, c1, c0, 0
+ bic r2, r2, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |\
+ CPU_CONTROL_WBUF_ENABLE)
+ bic r2, r2, #(CPU_CONTROL_IC_ENABLE)
+ bic r2, r2, #(CPU_CONTROL_BPRD_ENABLE)
+ mcr p15, 0, r2, c1, c0, 0
+
+ nop
+ nop
+ nop
+
+ AP_DEBUG(#3)
+
+Ltag:
+ ldr r0, Lstartup_pagetable_secondary
+ bic r0, r0, #0xf0000000
+ orr r0, r0, #PHYSADDR
+ ldr r0, [r0]
+#if defined(SMP)
+ orr r0, r0, #0 /* Set TTB shared memory flag */
+#endif
+ mcr p15, 0, r0, c2, c0, 0 /* Set TTB */
+ mcr p15, 0, r0, c8, c7, 0 /* Flush TLB */
+
+#if defined(CPU_ARM11) || defined(CPU_MV_PJ4B) || defined(CPU_CORTEXA)
+ mov r0, #0
+ mcr p15, 0, r0, c13, c0, 1 /* Set ASID to 0 */
+#endif
+
+ AP_DEBUG(#4)
+
+ /* Set the Domain Access register. Very important! */
+ mov r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT)
+ mcr p15, 0, r0, c3, c0, 0
+ /* Enable MMU */
+ mrc p15, 0, r0, c1, c0, 0
+#if defined(CPU_ARM11) || defined(CPU_MV_PJ4B) || defined(CPU_CORTEXA)
+ orr r0, r0, #CPU_CONTROL_V6_EXTPAGE
+#endif
+ orr r0, r0, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE)
+ mcr p15, 0, r0, c1, c0, 0
+ nop
+ nop
+ nop
+ CPWAIT(r0)
+
+ adr r1, .Lstart
+ ldmia r1, {r1, r2, sp} /* Set initial stack and */
+ mrc p15, 0, r0, c0, c0, 5
+ and r0, r0, #15
+ mov r1, #2048
+ mul r2, r1, r0
+ sub sp, sp, r2
+ str r1, [sp]
+ ldr pc, .Lmpvirt_done
+
+mpvirt_done:
+
+ mov fp, #0 /* trace back starts here */
+ bl _C_LABEL(init_secondary) /* Off we go */
+
+ adr r0, .Lmpreturned
+ b _C_LABEL(panic)
+ /* NOTREACHED */
+
+.Lmpreturned:
+ .asciz "main() returned"
+ .align 0
+#endif
+
ENTRY_NP(cpu_halt)
mrs r2, cpsr
bic r2, r2, #(PSR_MODE)
diff --git a/sys/arm/arm/machdep.c b/sys/arm/arm/machdep.c
index 07f892a..95b0275 100644
--- a/sys/arm/arm/machdep.c
+++ b/sys/arm/arm/machdep.c
@@ -44,6 +44,7 @@
#include "opt_compat.h"
#include "opt_ddb.h"
+#include "opt_timer.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -93,8 +94,10 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
#include <machine/sysarch.h>
-static struct trapframe proc0_tf;
+struct pcpu __pcpu[MAXCPU];
+struct pcpu *pcpup = &__pcpu[0];
+static struct trapframe proc0_tf;
uint32_t cpu_reset_address = 0;
int cold = 1;
vm_offset_t vector_page;
@@ -278,9 +281,11 @@ static void
cpu_startup(void *dummy)
{
struct pcb *pcb = thread0.td_pcb;
+#ifdef ARM_TP_ADDRESS
#ifndef ARM_CACHE_LOCK_ENABLE
vm_page_t m;
#endif
+#endif
cpu_setup("");
identify_arm_cpu();
@@ -322,6 +327,7 @@ cpu_startup(void *dummy)
vector_page_setprot(VM_PROT_READ);
pmap_set_pcb_pagedir(pmap_kernel(), pcb);
pmap_postinit();
+#ifdef ARM_TP_ADDRESS
#ifdef ARM_CACHE_LOCK_ENABLE
pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS);
arm_lock_cache_line(ARM_TP_ADDRESS);
@@ -331,6 +337,7 @@ cpu_startup(void *dummy)
#endif
*(uint32_t *)ARM_RAS_START = 0;
*(uint32_t *)ARM_RAS_END = 0xffffffff;
+#endif
}
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
@@ -358,7 +365,20 @@ cpu_est_clockrate(int cpu_id, uint64_t *rate)
void
cpu_idle(int busy)
{
+
+#ifndef NO_EVENTTIMERS
+ if (!busy) {
+ critical_enter();
+ cpu_idleclock();
+ }
+#endif
cpu_sleep(0);
+#ifndef NO_EVENTTIMERS
+ if (!busy) {
+ cpu_activeclock();
+ critical_exit();
+ }
+#endif
}
int
@@ -768,6 +788,19 @@ fake_preload_metadata(struct arm_boot_params *abp __unused)
return (lastaddr);
}
+void
+pcpu0_init(void)
+{
+#if ARM_ARCH_7A || defined(CPU_MV_PJ4B)
+ set_pcpu(pcpup);
+#endif
+ pcpu_init(pcpup, 0, sizeof(struct pcpu));
+ PCPU_SET(curthread, &thread0);
+#ifdef ARM_VFP_SUPPORT
+ PCPU_SET(cpu, 0);
+#endif
+}
+
#if defined(LINUX_BOOT_ABI)
vm_offset_t
linux_parse_boot_param(struct arm_boot_params *abp)
diff --git a/sys/arm/arm/mp_machdep.c b/sys/arm/arm/mp_machdep.c
new file mode 100644
index 0000000..30e6b63
--- /dev/null
+++ b/sys/arm/arm/mp_machdep.c
@@ -0,0 +1,393 @@
+/*-
+ * Copyright (c) 2011 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/pcpu.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/ktr.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+#include <machine/pcb.h>
+#include <machine/pte.h>
+#include <machine/intr.h>
+#include <machine/vmparam.h>
+
+#include "opt_smp.h"
+
+void *temp_pagetable;
+extern struct pcpu __pcpu[];
+/* used to hold the AP's until we are ready to release them */
+struct mtx ap_boot_mtx;
+struct pcb stoppcbs[MAXCPU];
+
+/* # of Applications processors */
+volatile int mp_naps;
+
+/* Set to 1 once we're ready to let the APs out of the pen. */
+volatile int aps_ready = 0;
+
+static int ipi_handler(void *arg);
+void set_stackptrs(int cpu);
+
+/* Temporary variables for init_secondary() */
+void *dpcpu[MAXCPU - 1];
+
+/* Determine if we are running on an MP machine */
+int
+cpu_mp_probe(void)
+{
+ CPU_SETOF(0, &all_cpus);
+
+ return (platform_mp_probe());
+}
+
+/* Start Application Processor via platform specific function */
+static int
+check_ap(void)
+{
+ uint32_t ms;
+
+ for (ms = 0; ms < 2000; ++ms) {
+ if ((mp_naps + 1) == mp_ncpus)
+ return (0); /* success */
+ else
+ DELAY(1000);
+ }
+
+ return (-2);
+}
+
+extern unsigned char _end[];
+
+/* Initialize and fire up non-boot processors */
+void
+cpu_mp_start(void)
+{
+ int error, i;
+ vm_offset_t temp_pagetable_va;
+ vm_paddr_t addr, addr_end;
+
+ mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+
+ /* Reserve memory for application processors */
+ for(i = 0; i < (mp_ncpus - 1); i++)
+ dpcpu[i] = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
+ temp_pagetable_va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE,
+ M_TEMP, 0, 0x0, 0xffffffff, L1_TABLE_SIZE, 0);
+ addr = KERNPHYSADDR;
+ addr_end = (vm_offset_t)&_end - KERNVIRTADDR + KERNPHYSADDR;
+ addr_end &= ~L1_S_OFFSET;
+ addr_end += L1_S_SIZE;
+ bzero((void *)temp_pagetable_va, L1_TABLE_SIZE);
+ for (addr = KERNPHYSADDR; addr <= addr_end; addr += L1_S_SIZE) {
+ ((int *)(temp_pagetable_va))[addr >> L1_S_SHIFT] =
+ L1_TYPE_S|L1_SHARED|L1_S_C|L1_S_AP(AP_KRW)|L1_S_DOM(PMAP_DOMAIN_KERNEL)|addr;
+ ((int *)(temp_pagetable_va))[(addr -
+ KERNPHYSADDR + KERNVIRTADDR) >> L1_S_SHIFT] =
+ L1_TYPE_S|L1_SHARED|L1_S_C|L1_S_AP(AP_KRW)|L1_S_DOM(PMAP_DOMAIN_KERNEL)|addr;
+ }
+ temp_pagetable = (void*)(vtophys(temp_pagetable_va));
+ cpu_idcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+
+ /* Initialize boot code and start up processors */
+ platform_mp_start_ap();
+
+	/* Check if the APs started properly */
+ error = check_ap();
+ if (error)
+		printf("WARNING: Some APs failed to start\n");
+ else
+ for (i = 1; i < mp_ncpus; i++)
+ CPU_SET(i, &all_cpus);
+
+ contigfree((void *)temp_pagetable_va, L1_TABLE_SIZE, M_TEMP);
+}
+
+/* Introduce rest of cores to the world */
+void
+cpu_mp_announce(void)
+{
+
+}
+
+extern vm_paddr_t pmap_pa;
+void
+init_secondary(int cpu)
+{
+ struct pcpu *pc;
+ uint32_t loop_counter;
+ int start = 0, end = 0;
+
+ cpu_setup(NULL);
+ setttb(pmap_pa);
+ cpu_tlb_flushID();
+
+ pc = &__pcpu[cpu];
+ set_pcpu(pc);
+ pcpu_init(pc, cpu, sizeof(struct pcpu));
+
+ dpcpu_init(dpcpu[cpu - 1], cpu);
+
+ /* Provide stack pointers for other processor modes. */
+ set_stackptrs(cpu);
+
+ /* Signal our startup to BSP */
+ atomic_add_rel_32(&mp_naps, 1);
+
+ /* Spin until the BSP releases the APs */
+ while (!aps_ready)
+ ;
+
+ /* Initialize curthread */
+ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
+ pc->pc_curthread = pc->pc_idlethread;
+ pc->pc_curpcb = pc->pc_idlethread->td_pcb;
+
+ mtx_lock_spin(&ap_boot_mtx);
+
+ atomic_add_rel_32(&smp_cpus, 1);
+
+ if (smp_cpus == mp_ncpus) {
+ /* enable IPI's, tlb shootdown, freezes etc */
+ atomic_store_rel_int(&smp_started, 1);
+ smp_active = 1;
+ }
+
+ mtx_unlock_spin(&ap_boot_mtx);
+
+ /* Enable ipi */
+#ifdef IPI_IRQ_START
+ start = IPI_IRQ_START;
+#ifdef IPI_IRQ_END
+ end = IPI_IRQ_END;
+#else
+ end = IPI_IRQ_START;
+#endif
+#endif
+
+ for (int i = start; i <= end; i++)
+ arm_unmask_irq(i);
+ enable_interrupts(I32_bit);
+
+ loop_counter = 0;
+ while (smp_started == 0) {
+ DELAY(100);
+ loop_counter++;
+ if (loop_counter == 1000)
+ CTR0(KTR_SMP, "AP still wait for smp_started");
+ }
+ /* Start per-CPU event timers. */
+ cpu_initclocks_ap();
+
+ CTR0(KTR_SMP, "go into scheduler");
+ platform_mp_init_secondary();
+
+ /* Enter the scheduler */
+ sched_throw(NULL);
+
+ panic("scheduler returned us to %s", __func__);
+ /* NOTREACHED */
+}
+
+static int
+ipi_handler(void *arg)
+{
+ u_int cpu, ipi;
+
+ cpu = PCPU_GET(cpuid);
+
+ ipi = pic_ipi_get((int)arg);
+
+ while ((ipi != 0x3ff)) {
+ switch (ipi) {
+ case IPI_RENDEZVOUS:
+ CTR0(KTR_SMP, "IPI_RENDEZVOUS");
+ smp_rendezvous_action();
+ break;
+
+ case IPI_AST:
+ CTR0(KTR_SMP, "IPI_AST");
+ break;
+
+ case IPI_STOP:
+ case IPI_STOP_HARD:
+ /*
+ * IPI_STOP_HARD is mapped to IPI_STOP so it is not
+ * necessary to add it in the switch.
+ */
+ CTR0(KTR_SMP, "IPI_STOP or IPI_STOP_HARD");
+
+ savectx(&stoppcbs[cpu]);
+
+ /* Indicate we are stopped */
+ CPU_SET_ATOMIC(cpu, &stopped_cpus);
+
+ /* Wait for restart */
+ while (!CPU_ISSET(cpu, &started_cpus))
+ cpu_spinwait();
+
+ CPU_CLR_ATOMIC(cpu, &started_cpus);
+ CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+ CTR0(KTR_SMP, "IPI_STOP (restart)");
+ break;
+ case IPI_PREEMPT:
+ CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__);
+ sched_preempt(curthread);
+ break;
+ case IPI_HARDCLOCK:
+ CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__);
+ hardclockintr();
+ break;
+ case IPI_TLB:
+ CTR1(KTR_SMP, "%s: IPI_TLB", __func__);
+ cpufuncs.cf_tlb_flushID();
+ break;
+ default:
+ panic("Unknown IPI 0x%0x on cpu %d", ipi, curcpu);
+ }
+
+ pic_ipi_clear(ipi);
+ ipi = pic_ipi_get(-1);
+ }
+
+ return (FILTER_HANDLED);
+}
+
+static void
+release_aps(void *dummy __unused)
+{
+ uint32_t loop_counter;
+ int start = 0, end = 0;
+
+ if (mp_ncpus == 1)
+ return;
+#ifdef IPI_IRQ_START
+ start = IPI_IRQ_START;
+#ifdef IPI_IRQ_END
+ end = IPI_IRQ_END;
+#else
+ end = IPI_IRQ_START;
+#endif
+#endif
+
+ for (int i = start; i <= end; i++) {
+ /*
+ * IPI handler
+ */
+ /*
+		 * Use 0xdeadbeef as the argument value for irq 0;
+		 * if we used 0, the intr code would give the trap frame
+		 * pointer instead.
+ */
+ arm_setup_irqhandler("ipi", ipi_handler, NULL, (void *)i, i,
+ INTR_TYPE_MISC | INTR_EXCL, NULL);
+
+ /* Enable ipi */
+ arm_unmask_irq(i);
+ }
+ atomic_store_rel_int(&aps_ready, 1);
+
+ printf("Release APs\n");
+
+ for (loop_counter = 0; loop_counter < 2000; loop_counter++) {
+ if (smp_started)
+ return;
+ DELAY(1000);
+ }
+	printf("APs not started\n");
+}
+
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
+
+struct cpu_group *
+cpu_topo(void)
+{
+
+ return (smp_topo_1level(CG_SHARE_L2, 1, 0));
+}
+
+void
+cpu_mp_setmaxid(void)
+{
+
+ platform_mp_setmaxid();
+}
+
+/* Sending IPI */
+void
+ipi_all_but_self(u_int ipi)
+{
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+ platform_ipi_send(other_cpus, ipi);
+}
+
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+ cpuset_t cpus;
+
+ CPU_ZERO(&cpus);
+ CPU_SET(cpu, &cpus);
+
+ CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x", __func__, cpu, ipi);
+ platform_ipi_send(cpus, ipi);
+}
+
+void
+ipi_selected(cpuset_t cpus, u_int ipi)
+{
+
+ CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+ platform_ipi_send(cpus, ipi);
+}
+
+void
+tlb_broadcast(int ipi)
+{
+
+ if (smp_started)
+ ipi_all_but_self(ipi);
+}
diff --git a/sys/arm/arm/mpcore_timer.c b/sys/arm/arm/mpcore_timer.c
new file mode 100644
index 0000000..363e7ec
--- /dev/null
+++ b/sys/arm/arm/mpcore_timer.c
@@ -0,0 +1,431 @@
+/*-
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Developed by Ben Gray <ben.r.gray@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company nor the name of the author may be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/**
+ * The ARM Cortex-A9 core can support a global timer plus a private and
+ * watchdog timer per core. This driver reserves memory and interrupt
+ * resources for accessing both timer register sets, these resources are
+ * stored globally and used to setup the timecount and eventtimer.
+ *
+ * The timecount timer uses the global 64-bit counter, whereas the
+ * per-CPU eventtimer uses the private 32-bit counters.
+ *
+ *
+ * REF: ARM Cortex-A9 MPCore, Technical Reference Manual (rev. r2p2)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rman.h>
+#include <sys/timeet.h>
+#include <sys/timetc.h>
+#include <sys/watchdog.h>
+#include <machine/bus.h>
+#include <machine/cpu.h>
+#include <machine/frame.h>
+#include <machine/intr.h>
+
+#include <dev/fdt/fdt_common.h>
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+#include <machine/bus.h>
+#include <machine/fdt.h>
+
+/* Private (per-CPU) timer register map */
+#define PRV_TIMER_LOAD 0x0000
+#define PRV_TIMER_COUNT 0x0004
+#define PRV_TIMER_CTRL 0x0008
+#define PRV_TIMER_INTR 0x000C
+
+#define PRV_TIMER_CTR_PRESCALER_SHIFT 8
+#define PRV_TIMER_CTRL_IRQ_ENABLE (1UL << 2)
+#define PRV_TIMER_CTRL_AUTO_RELOAD (1UL << 1)
+#define PRV_TIMER_CTRL_TIMER_ENABLE (1UL << 0)
+
+#define PRV_TIMER_INTR_EVENT (1UL << 0)
+
+/* Global timer register map */
+#define GBL_TIMER_COUNT_LOW 0x0000
+#define GBL_TIMER_COUNT_HIGH 0x0004
+#define GBL_TIMER_CTRL 0x0008
+#define GBL_TIMER_INTR 0x000C
+
+#define GBL_TIMER_CTR_PRESCALER_SHIFT 8
+#define GBL_TIMER_CTRL_AUTO_INC (1UL << 3)
+#define GBL_TIMER_CTRL_IRQ_ENABLE (1UL << 2)
+#define GBL_TIMER_CTRL_COMP_ENABLE (1UL << 1)
+#define GBL_TIMER_CTRL_TIMER_ENABLE (1UL << 0)
+
+#define GBL_TIMER_INTR_EVENT (1UL << 0)
+
+struct arm_tmr_softc {
+ struct resource * tmr_res[4];
+ bus_space_tag_t prv_bst;
+ bus_space_tag_t gbl_bst;
+ bus_space_handle_t prv_bsh;
+ bus_space_handle_t gbl_bsh;
+ uint32_t clkfreq;
+ struct eventtimer et;
+};
+
+static struct resource_spec arm_tmr_spec[] = {
+ { SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Global registers */
+ { SYS_RES_IRQ, 0, RF_ACTIVE }, /* Global timer interrupt (unused) */
+ { SYS_RES_MEMORY, 1, RF_ACTIVE }, /* Private (per-CPU) registers */
+ { SYS_RES_IRQ, 1, RF_ACTIVE }, /* Private timer interrupt */
+ { -1, 0 }
+};
+
+static struct arm_tmr_softc *arm_tmr_sc = NULL;
+
+#define tmr_prv_read_4(reg) \
+ bus_space_read_4(arm_tmr_sc->prv_bst, arm_tmr_sc->prv_bsh, reg)
+#define tmr_prv_write_4(reg, val) \
+ bus_space_write_4(arm_tmr_sc->prv_bst, arm_tmr_sc->prv_bsh, reg, val)
+#define tmr_gbl_read_4(reg) \
+ bus_space_read_4(arm_tmr_sc->gbl_bst, arm_tmr_sc->gbl_bsh, reg)
+#define tmr_gbl_write_4(reg, val) \
+ bus_space_write_4(arm_tmr_sc->gbl_bst, arm_tmr_sc->gbl_bsh, reg, val)
+
+
+static timecounter_get_t arm_tmr_get_timecount;
+
+static struct timecounter arm_tmr_timecount = {
+	.tc_name           = "ARM MPCore Timecounter",
+ .tc_get_timecount = arm_tmr_get_timecount,
+ .tc_poll_pps = NULL,
+ .tc_counter_mask = ~0u,
+ .tc_frequency = 0,
+ .tc_quality = 1000,
+};
+
+/**
+ * arm_tmr_get_timecount - reads the timecount (global) timer
+ * @tc: pointer to arm_tmr_timecount struct
+ *
+ * We only read the lower 32 bits; the timecount code only uses 32 bits,
+ * so (for now?) the upper 32 bits are ignored.
+ *
+ * RETURNS
+ * The lower 32-bits of the counter.
+ */
+static unsigned
+arm_tmr_get_timecount(struct timecounter *tc)
+{
+ return (tmr_gbl_read_4(GBL_TIMER_COUNT_LOW));
+}
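
A minimal userland-style sketch of the usual high/low/high sequence for sampling the full 64-bit global counter, should that ever be needed; the read_lo()/read_hi() helpers are hypothetical stand-ins for tmr_gbl_read_4(GBL_TIMER_COUNT_LOW) and tmr_gbl_read_4(GBL_TIMER_COUNT_HIGH), and the counter is simulated so the carry/retry path can be exercised without hardware.

#include <stdint.h>
#include <stdio.h>

/* Simulated 64-bit counter; it advances on each low-word read so the
 * carry/retry path below can be exercised without real hardware. */
static uint64_t sim_counter = 0x00000001fffffffeULL;

static uint32_t read_lo(void) { return ((uint32_t)sim_counter++); }
static uint32_t read_hi(void) { return ((uint32_t)(sim_counter >> 32)); }

/* Classic high/low/high sequence: retry if the high word changed while
 * the low word was being read, i.e. a carry happened in between. */
static uint64_t
read_counter64(void)
{
	uint32_t hi, lo, hi2;

	do {
		hi = read_hi();
		lo = read_lo();
		hi2 = read_hi();
	} while (hi != hi2);
	return (((uint64_t)hi << 32) | lo);
}

int
main(void)
{
	printf("counter = 0x%016llx\n", (unsigned long long)read_counter64());
	return (0);
}
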
+
+/**
+ * arm_tmr_start - starts the eventtimer (private) timer
+ * @et: pointer to eventtimer struct
+ * @first:  the delay (seconds and fractional seconds) before the first event
+ * @period: the period (in seconds and fractional seconds) to set
+ *
+ * If the eventtimer is required to be in oneshot mode, period will be
+ * NULL and first will point to the time to trigger. If in periodic mode
+ * period will contain the time period and first may optionally contain
+ * the time for the first period.
+ *
+ * RETURNS
+ * Always returns 0
+ */
+static int
+arm_tmr_start(struct eventtimer *et, struct bintime *first,
+ struct bintime *period)
+{
+ struct arm_tmr_softc *sc = (struct arm_tmr_softc *)et->et_priv;
+ uint32_t load, count;
+ uint32_t ctrl;
+
+ ctrl = PRV_TIMER_CTRL_IRQ_ENABLE | PRV_TIMER_CTRL_TIMER_ENABLE;
+
+ if (period != NULL) {
+ load = (et->et_frequency * (period->frac >> 32)) >> 32;
+ if (period->sec > 0)
+ load += et->et_frequency * period->sec;
+ ctrl |= PRV_TIMER_CTRL_AUTO_RELOAD;
+ } else {
+ load = 0;
+ }
+
+ if (first != NULL) {
+ count = (sc->et.et_frequency * (first->frac >> 32)) >> 32;
+ if (first->sec != 0)
+ count += sc->et.et_frequency * first->sec;
+ } else {
+ count = load;
+ }
+
+ tmr_prv_write_4(PRV_TIMER_LOAD, load);
+ tmr_prv_write_4(PRV_TIMER_COUNT, count);
+
+ tmr_prv_write_4(PRV_TIMER_CTRL, ctrl);
+ return (0);
+}
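
The load/count computation above is bintime fixed-point arithmetic: frac is a 64-bit binary fraction of a second, and the driver keeps only the top 32 bits of the fraction and of the product. A small sketch of the same conversion, assuming a 250 MHz clock (the real frequency comes from the FDT clock-frequency property):

#include <stdint.h>
#include <stdio.h>

/* ticks ~= freq * (frac / 2^64), computed as in arm_tmr_start() above. */
static uint32_t
bintime_frac_to_ticks(uint64_t freq, uint64_t frac)
{
	return ((uint32_t)((freq * (frac >> 32)) >> 32));
}

int
main(void)
{
	uint64_t freq = 250000000ULL;		/* assumed peripheral clock */
	uint64_t one_ms = ~(uint64_t)0 / 1000;	/* ~2^64 / 1000, i.e. 1 ms */

	/* Expect roughly 250000 ticks for a 1 ms period at 250 MHz. */
	printf("1 ms at %llu Hz -> %u ticks\n",
	    (unsigned long long)freq, bintime_frac_to_ticks(freq, one_ms));
	return (0);
}
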
+
+/**
+ * arm_tmr_stop - stops the eventtimer (private) timer
+ * @et: pointer to eventtimer struct
+ *
+ * Simply stops the private timer by clearing all bits in the ctrl register.
+ *
+ * RETURNS
+ * Always returns 0
+ */
+static int
+arm_tmr_stop(struct eventtimer *et)
+{
+ tmr_prv_write_4(PRV_TIMER_CTRL, 0);
+ return (0);
+}
+
+/**
+ * arm_tmr_intr - ISR for the eventtimer (private) timer
+ * @arg: pointer to arm_tmr_softc struct
+ *
+ * Clears the event register and then calls the eventtimer callback.
+ *
+ * RETURNS
+ * Always returns FILTER_HANDLED
+ */
+static int
+arm_tmr_intr(void *arg)
+{
+ struct arm_tmr_softc *sc = (struct arm_tmr_softc *)arg;
+
+ tmr_prv_write_4(PRV_TIMER_INTR, PRV_TIMER_INTR_EVENT);
+
+ if (sc->et.et_active)
+ sc->et.et_event_cb(&sc->et, sc->et.et_arg);
+
+ return (FILTER_HANDLED);
+}
+
+
+
+
+/**
+ * arm_tmr_probe - timer probe routine
+ * @dev: new device
+ *
+ * The probe function returns success when probed with the fdt compatible
+ * string set to "arm,mpcore-timers".
+ *
+ * RETURNS
+ * BUS_PROBE_DEFAULT if the fdt device is compatible, otherwise ENXIO.
+ */
+static int
+arm_tmr_probe(device_t dev)
+{
+ if (!ofw_bus_is_compatible(dev, "arm,mpcore-timers"))
+ return (ENXIO);
+
+ device_set_desc(dev, "ARM Generic MPCore Timers");
+ return (BUS_PROBE_DEFAULT);
+}
+
+/**
+ * arm_tmr_attach - attaches the timer to the simplebus
+ * @dev: new device
+ *
+ * Reserves memory and interrupt resources, stores the softc structure
+ * globally and registers both the timecount and eventtimer objects.
+ *
+ * RETURNS
+ *	Zero on success or ENXIO if an error occurred.
+ */
+static int
+arm_tmr_attach(device_t dev)
+{
+ struct arm_tmr_softc *sc = device_get_softc(dev);
+ phandle_t node;
+ pcell_t clock;
+ void *ihl;
+
+ if (arm_tmr_sc)
+ return (ENXIO);
+
+ /* Get the base clock frequency */
+ node = ofw_bus_get_node(dev);
+ if ((OF_getprop(node, "clock-frequency", &clock, sizeof(clock))) <= 0) {
+ device_printf(dev, "missing clock-frequency attribute in FDT\n");
+ return (ENXIO);
+ }
+ sc->clkfreq = fdt32_to_cpu(clock);
+
+
+ if (bus_alloc_resources(dev, arm_tmr_spec, sc->tmr_res)) {
+ device_printf(dev, "could not allocate resources\n");
+ return (ENXIO);
+ }
+
+ /* Global timer interface */
+ sc->gbl_bst = rman_get_bustag(sc->tmr_res[0]);
+ sc->gbl_bsh = rman_get_bushandle(sc->tmr_res[0]);
+
+ /* Private per-CPU timer interface */
+ sc->prv_bst = rman_get_bustag(sc->tmr_res[2]);
+ sc->prv_bsh = rman_get_bushandle(sc->tmr_res[2]);
+
+ arm_tmr_sc = sc;
+
+ /* Disable both timers to start off */
+ tmr_prv_write_4(PRV_TIMER_CTRL, 0x00000000);
+ tmr_gbl_write_4(GBL_TIMER_CTRL, 0x00000000);
+
+ /* Setup and enable the global timer to use as the timecounter */
+ tmr_gbl_write_4(GBL_TIMER_CTRL, (0x00 << GBL_TIMER_CTR_PRESCALER_SHIFT) |
+ GBL_TIMER_CTRL_TIMER_ENABLE);
+
+ arm_tmr_timecount.tc_frequency = sc->clkfreq;
+ tc_init(&arm_tmr_timecount);
+
+ /* Setup and enable the timer */
+ if (bus_setup_intr(dev, sc->tmr_res[3], INTR_TYPE_CLK, arm_tmr_intr,
+ NULL, sc, &ihl) != 0) {
+ bus_release_resources(dev, arm_tmr_spec, sc->tmr_res);
+ device_printf(dev, "Unable to setup the clock irq handler.\n");
+ return (ENXIO);
+ }
+
+ sc->et.et_name = "ARM MPCore Eventtimer";
+ sc->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU;
+ sc->et.et_quality = 1000;
+
+ sc->et.et_frequency = sc->clkfreq;
+ sc->et.et_min_period.sec = 0;
+ sc->et.et_min_period.frac =
+ ((0x00000002LLU << 32) / sc->et.et_frequency) << 32;
+ sc->et.et_max_period.sec = 0xfffffff0U / sc->et.et_frequency;
+ sc->et.et_max_period.frac =
+ ((0xfffffffeLLU << 32) / sc->et.et_frequency) << 32;
+ sc->et.et_start = arm_tmr_start;
+ sc->et.et_stop = arm_tmr_stop;
+ sc->et.et_priv = sc;
+ et_register(&sc->et);
+
+ return (0);
+}
+
+static device_method_t arm_tmr_methods[] = {
+ DEVMETHOD(device_probe, arm_tmr_probe),
+ DEVMETHOD(device_attach, arm_tmr_attach),
+ { 0, 0 }
+};
+
+static driver_t arm_tmr_driver = {
+ "mp_tmr",
+ arm_tmr_methods,
+ sizeof(struct arm_tmr_softc),
+};
+
+static devclass_t arm_tmr_devclass;
+
+DRIVER_MODULE(mp_tmr, simplebus, arm_tmr_driver, arm_tmr_devclass, 0, 0);
+
+/**
+ * cpu_initclocks - called by system to initialise the cpu clocks
+ *
+ * This is a boilerplate function; most of the setup has already been done
+ * when the driver was attached. Therefore this function must only be called
+ * after the driver has attached.
+ *
+ * RETURNS
+ * nothing
+ */
+void
+cpu_initclocks(void)
+{
+ if (PCPU_GET(cpuid) == 0)
+ cpu_initclocks_bsp();
+ else
+ cpu_initclocks_ap();
+}
+
+/**
+ * DELAY - Delay for at least usec microseconds.
+ * @usec: number of microseconds to delay by
+ *
+ * This function is called all over the kernel and is supposed to provide a
+ * consistent delay. This function may also be called before the console
+ * is set up, so no printfs can be called here.
+ *
+ * RETURNS:
+ * nothing
+ */
+void
+DELAY(int usec)
+{
+ int32_t counts_per_usec;
+ int32_t counts;
+ uint32_t first, last;
+
+	/* Check that the timers are set up; if not, just use a for loop in the meantime */
+ if (arm_tmr_sc == NULL) {
+ for (; usec > 0; usec--)
+ for (counts = 200; counts > 0; counts--)
+ cpufunc_nullop(); /* Prevent gcc from optimizing
+ * out the loop
+ */
+ return;
+ }
+
+ /* Get the number of times to count */
+ counts_per_usec = ((arm_tmr_timecount.tc_frequency / 1000000) + 1);
+
+ /*
+ * Clamp the timeout at a maximum value (about 32 seconds with
+ * a 66MHz clock). *Nobody* should be delay()ing for anywhere
+ * near that length of time and if they are, they should be hung
+ * out to dry.
+ */
+ if (usec >= (0x80000000U / counts_per_usec))
+ counts = (0x80000000U / counts_per_usec) - 1;
+ else
+ counts = usec * counts_per_usec;
+
+ first = tmr_gbl_read_4(GBL_TIMER_COUNT_LOW);
+
+ while (counts > 0) {
+ last = tmr_gbl_read_4(GBL_TIMER_COUNT_LOW);
+ counts -= (int32_t)(last - first);
+ first = last;
+ }
+}
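
The clamp in DELAY() above bounds usec so that usec * counts_per_usec stays below 2^31. A quick sketch of that arithmetic for the 66 MHz case mentioned in the comment:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Assumed timer frequency; the driver reads the real one from FDT. */
	uint32_t freq = 66000000;
	uint32_t counts_per_usec = (freq / 1000000) + 1;	/* 67 */
	uint32_t max_usec = 0x80000000U / counts_per_usec;

	/* Roughly 32 seconds, matching the comment in DELAY() above. */
	printf("clamp: %u usec (~%u s) at %u counts/usec\n",
	    max_usec, max_usec / 1000000, counts_per_usec);
	return (0);
}
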
diff --git a/sys/arm/arm/pl310.c b/sys/arm/arm/pl310.c
new file mode 100644
index 0000000..45ee718
--- /dev/null
+++ b/sys/arm/arm/pl310.c
@@ -0,0 +1,321 @@
+/*-
+ * Copyright (c) 2012 Olivier Houchard <cognet@FreeBSD.org>
+ * Copyright (c) 2011
+ * Ben Gray <ben.r.gray@gmail.com>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company nor the name of the author may be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BEN GRAY ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BEN GRAY BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/rman.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <machine/intr.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/pl310.h>
+#include <machine/bus.h>
+
+#include <dev/fdt/fdt_common.h>
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+/**
+ * PL310 - L2 Cache Controller register offsets.
+ *
+ */
+#define PL310_CACHE_ID 0x000
+#define PL310_CACHE_TYPE 0x004
+#define PL310_CTRL 0x100
+#define PL310_AUX_CTRL 0x104
+#define PL310_EVENT_COUNTER_CTRL 0x200
+#define PL310_EVENT_COUNTER1_CONF 0x204
+#define PL310_EVENT_COUNTER0_CONF 0x208
+#define PL310_EVENT_COUNTER1_VAL 0x20C
+#define PL310_EVENT_COUNTER0_VAL 0x210
+#define PL310_INTR_MASK 0x214
+#define PL310_MASKED_INTR_STAT 0x218
+#define PL310_RAW_INTR_STAT 0x21C
+#define PL310_INTR_CLEAR 0x220
+#define PL310_CACHE_SYNC 0x730
+#define PL310_INV_LINE_PA 0x770
+#define PL310_INV_WAY 0x77C
+#define PL310_CLEAN_LINE_PA 0x7B0
+#define PL310_CLEAN_LINE_IDX 0x7B8
+#define PL310_CLEAN_WAY 0x7BC
+#define PL310_CLEAN_INV_LINE_PA 0x7F0
+#define PL310_CLEAN_INV_LINE_IDX 0x7F8
+#define PL310_CLEAN_INV_WAY 0x7FC
+#define PL310_LOCKDOWN_D_WAY(x) (0x900 + ((x) * 8))
+#define PL310_LOCKDOWN_I_WAY(x) (0x904 + ((x) * 8))
+#define PL310_LOCKDOWN_LINE_ENABLE 0x950
+#define PL310_UNLOCK_ALL_LINES_WAY 0x954
+#define PL310_ADDR_FILTER_START 0xC00
+#define PL310_ADDR_FILTER_END 0xC04
+#define PL310_DEBUG_CTRL 0xF40
+
+
+#define PL310_AUX_CTRL_MASK 0xc0000fff
+#define PL310_AUX_CTRL_ASSOCIATIVITY_SHIFT 16
+#define PL310_AUX_CTRL_WAY_SIZE_SHIFT 17
+#define PL310_AUX_CTRL_WAY_SIZE_MASK (0x7 << 17)
+#define PL310_AUX_CTRL_SHARE_OVERRIDE_SHIFT 22
+#define PL310_AUX_CTRL_NS_LOCKDOWN_SHIFT 26
+#define PL310_AUX_CTRL_NS_INT_CTRL_SHIFT 27
+#define PL310_AUX_CTRL_DATA_PREFETCH_SHIFT 28
+#define PL310_AUX_CTRL_INSTR_PREFETCH_SHIFT 29
+#define PL310_AUX_CTRL_EARLY_BRESP_SHIFT 30
+
+
+void omap4_l2cache_wbinv_range(vm_paddr_t physaddr, vm_size_t size);
+void omap4_l2cache_inv_range(vm_paddr_t physaddr, vm_size_t size);
+void omap4_l2cache_wb_range(vm_paddr_t physaddr, vm_size_t size);
+void omap4_l2cache_wbinv_all(void);
+void omap4_l2cache_inv_all(void);
+void omap4_l2cache_wb_all(void);
+
+static uint32_t g_l2cache_way_mask;
+
+static const uint32_t g_l2cache_line_size = 32;
+static const uint32_t g_l2cache_align_mask = (32 - 1);
+
+static uint32_t g_l2cache_size;
+
+static struct pl310_softc *pl310_softc;
+
+/**
+ * pl310_read4 - read a 32-bit value from the PL310 registers
+ * pl310_write4 - write a 32-bit value from the PL310 registers
+ * @off: byte offset within the register set to read from
+ * @val: the value to write into the register
+ *
+ *
+ * LOCKING:
+ * None
+ *
+ * RETURNS:
+ *	Nothing for the write function; the read function returns the value read.
+ */
+static __inline uint32_t
+pl310_read4(bus_size_t off)
+{
+ return bus_read_4(pl310_softc->sc_mem_res, off);
+}
+static __inline void
+pl310_write4(bus_size_t off, uint32_t val)
+{
+ bus_write_4(pl310_softc->sc_mem_res, off, val);
+}
+
+static __inline void
+pl310_wait_background_op(uint32_t off, uint32_t mask)
+{
+ while (pl310_read4(off) & mask);
+}
+
+
+/**
+ * pl310_cache_sync - performs a cache sync operation
+ *
+ * According to the TRM:
+ *
+ * "Before writing to any other register you must perform an explicit
+ * Cache Sync operation. This is particularly important when the cache is
+ * enabled and changes to how the cache allocates new lines are to be made."
+ *
+ *
+ */
+static __inline void
+pl310_cache_sync(void)
+{
+ pl310_write4(PL310_CACHE_SYNC, 0);
+}
+
+
+static void
+pl310_wbinv_all(void)
+{
+#if 1
+ pl310_write4(PL310_DEBUG_CTRL, 3);
+#endif
+ pl310_write4(PL310_CLEAN_INV_WAY, g_l2cache_way_mask);
+ pl310_wait_background_op(PL310_CLEAN_INV_WAY, g_l2cache_way_mask);
+ pl310_cache_sync();
+#if 1
+ pl310_write4(PL310_DEBUG_CTRL, 0);
+#endif
+
+}
+
+static void
+pl310_wbinv_range(vm_paddr_t start, vm_size_t size)
+{
+
+ if (size & g_l2cache_align_mask) {
+ size &= ~g_l2cache_align_mask;
+ size += g_l2cache_line_size;
+ }
+#if 1
+
+ pl310_write4(PL310_DEBUG_CTRL, 3);
+#endif
+ while (size > 0) {
+#if 1
+ /*
+ * Errata 588369 says that clean + inv may keep the
+		 * cache line if it was clean; the recommended workaround
+		 * is to clean, then invalidate, the cache line with
+		 * write-back and cache linefill disabled.
+ */
+
+ pl310_write4(PL310_CLEAN_LINE_PA, start);
+ pl310_write4(PL310_INV_LINE_PA, start);
+#else
+ pl310_write4(PL310_CLEAN_INV_LINE_PA, start);
+#endif
+ start += g_l2cache_line_size;
+ size -= g_l2cache_line_size;
+ }
+#if 1
+ pl310_write4(PL310_DEBUG_CTRL, 0);
+#endif
+ pl310_wait_background_op(PL310_CLEAN_INV_LINE_PA, 1);
+ pl310_cache_sync();
+
+}
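
The maintenance-by-PA registers used above operate on whole 32-byte L2 lines, which is why the size is rounded up to a line multiple. A minimal, general sketch of rounding an arbitrary physical range out to line boundaries (the 0x80000013/0x48 values are arbitrary examples, not taken from this driver):

#include <stdint.h>
#include <stdio.h>

#define L2_LINE_SIZE	32			/* PL310 line size used above */
#define L2_ALIGN_MASK	(L2_LINE_SIZE - 1)

/*
 * Round an arbitrary [pa, pa + size) range out to whole cache lines,
 * which is the granularity the maintenance-by-PA registers work on.
 */
static void
l2_line_bounds(uint32_t pa, uint32_t size, uint32_t *first, uint32_t *last)
{
	*first = pa & ~L2_ALIGN_MASK;				/* round start down */
	*last = (pa + size + L2_ALIGN_MASK) & ~L2_ALIGN_MASK;	/* round end up */
}

int
main(void)
{
	uint32_t first, last;

	l2_line_bounds(0x80000013, 0x48, &first, &last);
	/* Expect 3 lines, starting at 0x80000000 and ending at 0x80000060. */
	printf("lines 0x%08x .. 0x%08x (%u lines)\n", first, last,
	    (last - first) / L2_LINE_SIZE);
	return (0);
}
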
+
+static void
+pl310_wb_range(vm_paddr_t start, vm_size_t size)
+{
+
+ if (size & g_l2cache_align_mask) {
+ size &= ~g_l2cache_align_mask;
+ size += g_l2cache_line_size;
+ }
+ while (size > 0) {
+ pl310_write4(PL310_CLEAN_LINE_PA, start);
+ start += g_l2cache_line_size;
+ size -= g_l2cache_line_size;
+ }
+ pl310_cache_sync();
+ pl310_wait_background_op(PL310_CLEAN_LINE_PA, 1);
+
+}
+
+static void
+pl310_inv_range(vm_paddr_t start, vm_size_t size)
+{
+
+ if (size & g_l2cache_align_mask) {
+ size &= ~g_l2cache_align_mask;
+ size += g_l2cache_line_size;
+ }
+ while (size > 0) {
+ pl310_write4(PL310_INV_LINE_PA, start);
+ start += g_l2cache_line_size;
+ size -= g_l2cache_line_size;
+ }
+ pl310_cache_sync();
+ pl310_wait_background_op(PL310_INV_LINE_PA, 1);
+
+}
+
+static int
+pl310_probe(device_t dev)
+{
+
+ if (!ofw_bus_is_compatible(dev, "arm,pl310"))
+ return (ENXIO);
+ device_set_desc(dev, "PL310 L2 cache controller");
+ return (0);
+}
+
+static int
+pl310_attach(device_t dev)
+{
+ struct pl310_softc *sc = device_get_softc(dev);
+ int rid = 0;
+ uint32_t aux_value;
+ uint32_t way_size;
+ uint32_t ways_assoc;
+ uint32_t ctrl_value;
+
+ sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
+ RF_ACTIVE);
+ if (sc->sc_mem_res == NULL)
+ panic("%s: Cannot map registers", device_get_name(dev));
+ pl310_softc = sc;
+
+ platform_init_pl310(sc);
+ aux_value = pl310_read4(PL310_AUX_CTRL);
+ way_size = (aux_value & PL310_AUX_CTRL_WAY_SIZE_MASK) >>
+ PL310_AUX_CTRL_WAY_SIZE_SHIFT;
+ way_size = 1 << (way_size + 13);
+ if (aux_value & (1 << PL310_AUX_CTRL_ASSOCIATIVITY_SHIFT))
+ ways_assoc = 16;
+ else
+ ways_assoc = 8;
+ g_l2cache_way_mask = (1 << ways_assoc) - 1;
+ g_l2cache_size = way_size * ways_assoc;
+ /* Print the information */
+ printf(" L2 Cache: %uKB/%dB %d ways\n", (g_l2cache_size / 1024),
+ g_l2cache_line_size, ways_assoc);
+ ctrl_value = pl310_read4(PL310_CTRL);
+ if (!(ctrl_value & 0x1)) {
+ /* Enable the L2 cache if disabled */
+		pl310_write4(PL310_CTRL, ctrl_value | 0x1);
+ }
+ pl310_wbinv_all();
+
+ /* Set the l2 functions in the set of cpufuncs */
+ cpufuncs.cf_l2cache_wbinv_all = pl310_wbinv_all;
+ cpufuncs.cf_l2cache_wbinv_range = pl310_wbinv_range;
+ cpufuncs.cf_l2cache_inv_range = pl310_inv_range;
+ cpufuncs.cf_l2cache_wb_range = pl310_wb_range;
+ return (0);
+}
+
+static device_method_t pl310_methods[] = {
+ DEVMETHOD(device_probe, pl310_probe),
+ DEVMETHOD(device_attach, pl310_attach),
+ {0, 0},
+};
+
+static driver_t pl310_driver = {
+ "l2cache",
+ pl310_methods,
+ sizeof(struct pl310_softc),
+};
+static devclass_t pl310_devclass;
+
+DRIVER_MODULE(pl310, simplebus, pl310_driver, pl310_devclass, 0, 0);
+
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
new file mode 100644
index 0000000..aa7cdf0
--- /dev/null
+++ b/sys/arm/arm/pmap-v6.c
@@ -0,0 +1,3780 @@
+/* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */
+/*-
+ * Copyright 2011 Semihalf
+ * Copyright 2004 Olivier Houchard.
+ * Copyright 2003 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed for the NetBSD Project by
+ * Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ * or promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: FreeBSD: src/sys/arm/arm/pmap.c,v 1.113 2009/07/24 13:50:29
+ */
+
+/*-
+ * Copyright (c) 2002-2003 Wasabi Systems, Inc.
+ * Copyright (c) 2001 Richard Earnshaw
+ * Copyright (c) 2001-2002 Christopher Gilbert
+ * All rights reserved.
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company nor the name of the author may be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1999 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1994-1998 Mark Brinicombe.
+ * Copyright (c) 1994 Brini.
+ * All rights reserved.
+ *
+ * This code is derived from software written for Brini by Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Mark Brinicombe.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ *
+ * RiscBSD kernel project
+ *
+ * pmap.c
+ *
+ * Machine dependent vm stuff
+ *
+ * Created : 20/09/94
+ */
+
+/*
+ * Special compilation symbols
+ * PMAP_DEBUG - Build in pmap_debug_level code
+ */
+/* Include header files */
+
+#include "opt_vm.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/msgbuf.h>
+#include <sys/vmmeter.h>
+#include <sys/mman.h>
+#include <sys/smp.h>
+#include <sys/sched.h>
+
+#include <vm/vm.h>
+#include <vm/uma.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/pcb.h>
+
+#ifdef PMAP_DEBUG
+#define PDEBUG(_lev_,_stat_) \
+ if (pmap_debug_level >= (_lev_)) \
+ ((_stat_))
+#define dprintf printf
+
+int pmap_debug_level = 0;
+#define PMAP_INLINE
+#else /* PMAP_DEBUG */
+#define PDEBUG(_lev_,_stat_) /* Nothing */
+#define dprintf(x, arg...)
+#define PMAP_INLINE __inline
+#endif /* PMAP_DEBUG */
+
+extern struct pv_addr systempage;
+
+/*
+ * Internal function prototypes
+ */
+static void pmap_free_pv_entry (pv_entry_t);
+static pv_entry_t pmap_get_pv_entry(void);
+
+static void pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t,
+ vm_prot_t, boolean_t, int);
+static void pmap_alloc_l1(pmap_t);
+static void pmap_free_l1(pmap_t);
+
+static int pmap_clearbit(struct vm_page *, u_int);
+
+static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t);
+static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t);
+static void pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int);
+static vm_offset_t kernel_pt_lookup(vm_paddr_t);
+
+static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1");
+
+vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
+vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
+vm_offset_t pmap_curmaxkvaddr;
+vm_paddr_t kernel_l1pa;
+
+extern void *end;
+vm_offset_t kernel_vm_end = 0;
+
+struct pmap kernel_pmap_store;
+
+static pt_entry_t *csrc_pte, *cdst_pte;
+static vm_offset_t csrcp, cdstp;
+static struct mtx cmtx;
+
+static void pmap_init_l1(struct l1_ttable *, pd_entry_t *);
+/*
+ * These routines are called when the CPU type is identified to set up
+ * the PTE prototypes, cache modes, etc.
+ *
+ * The variables are always here, just in case LKMs need to reference
+ * them (though, they shouldn't).
+ */
+static void pmap_set_prot(pt_entry_t *pte, vm_prot_t prot, uint8_t user);
+pt_entry_t pte_l1_s_cache_mode;
+pt_entry_t pte_l1_s_cache_mode_pt;
+
+pt_entry_t pte_l2_l_cache_mode;
+pt_entry_t pte_l2_l_cache_mode_pt;
+
+pt_entry_t pte_l2_s_cache_mode;
+pt_entry_t pte_l2_s_cache_mode_pt;
+
+/*
+ * Which pmap is currently 'live' in the cache
+ *
+ * XXXSCW: Fix for SMP ...
+ */
+union pmap_cache_state *pmap_cache_state;
+
+struct msgbuf *msgbufp = 0;
+
+/*
+ * Crashdump maps.
+ */
+static caddr_t crashdumpmap;
+
+extern void bcopy_page(vm_offset_t, vm_offset_t);
+extern void bzero_page(vm_offset_t);
+
+extern vm_offset_t alloc_firstaddr;
+
+char *_tmppt;
+
+/*
+ * Metadata for L1 translation tables.
+ */
+struct l1_ttable {
+ /* Entry on the L1 Table list */
+ SLIST_ENTRY(l1_ttable) l1_link;
+
+ /* Entry on the L1 Least Recently Used list */
+ TAILQ_ENTRY(l1_ttable) l1_lru;
+
+ /* Track how many domains are allocated from this L1 */
+ volatile u_int l1_domain_use_count;
+
+ /*
+ * A free-list of domain numbers for this L1.
+ * We avoid using ffs() and a bitmap to track domains since ffs()
+ * is slow on ARM.
+ */
+ u_int8_t l1_domain_first;
+ u_int8_t l1_domain_free[PMAP_DOMAINS];
+
+ /* Physical address of this L1 page table */
+ vm_paddr_t l1_physaddr;
+
+ /* KVA of this L1 page table */
+ pd_entry_t *l1_kva;
+};
+
+/*
+ * Convert a virtual address into its L1 table index. That is, the
+ * index used to locate the L2 descriptor table pointer in an L1 table.
+ * This is basically used to index l1->l1_kva[].
+ *
+ * Each L2 descriptor table represents 1MB of VA space.
+ */
+#define L1_IDX(va) (((vm_offset_t)(va)) >> L1_S_SHIFT)
+
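
As a worked example of the macro above, assuming L1_S_SHIFT == 20 (1 MB sections):

#include <stdint.h>
#include <stdio.h>

/* Assumes L1_S_SHIFT == 20, i.e. each L1 slot covers a 1 MB section. */
#define L1_S_SHIFT	20
#define L1_IDX(va)	(((uint32_t)(va)) >> L1_S_SHIFT)

int
main(void)
{
	uint32_t kva = 0xc0100000;	/* a kernel VA */
	uint32_t uva = 0x00012000;	/* a user VA */

	/* 0xc0100000 >> 20 == 0xc01, 0x00012000 >> 20 == 0x000 */
	printf("L1_IDX(0x%08x) = 0x%03x\n", kva, L1_IDX(kva));
	printf("L1_IDX(0x%08x) = 0x%03x\n", uva, L1_IDX(uva));
	return (0);
}
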
+/*
+ * L1 Page Tables are tracked using a Least Recently Used list.
+ * - New L1s are allocated from the HEAD.
+ * - Freed L1s are added to the TAIl.
+ * - Recently accessed L1s (where an 'access' is some change to one of
+ * the userland pmaps which owns this L1) are moved to the TAIL.
+ */
+static TAILQ_HEAD(, l1_ttable) l1_lru_list;
+/*
+ * A list of all L1 tables
+ */
+static SLIST_HEAD(, l1_ttable) l1_list;
+static struct mtx l1_lru_lock;
+
+/*
+ * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots.
+ *
+ * This is normally 16MB worth of L2 page descriptors for any given pmap.
+ * Reference counts are maintained for L2 descriptors so they can be
+ * freed when empty.
+ */
+struct l2_dtable {
+ /* The number of L2 page descriptors allocated to this l2_dtable */
+ u_int l2_occupancy;
+
+ /* List of L2 page descriptors */
+ struct l2_bucket {
+ pt_entry_t *l2b_kva; /* KVA of L2 Descriptor Table */
+ vm_paddr_t l2b_phys; /* Physical address of same */
+ u_short l2b_l1idx; /* This L2 table's L1 index */
+ u_short l2b_occupancy; /* How many active descriptors */
+ } l2_bucket[L2_BUCKET_SIZE];
+};
+
+/* pmap_kenter_internal flags */
+#define KENTER_CACHE 0x1
+#define KENTER_USER 0x2
+
+/*
+ * Given an L1 table index, calculate the corresponding l2_dtable index
+ * and bucket index within the l2_dtable.
+ */
+#define L2_IDX(l1idx) (((l1idx) >> L2_BUCKET_LOG2) & \
+ (L2_SIZE - 1))
+#define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1))
+
+/*
+ * Given a virtual address, this macro returns the
+ * virtual address required to drop into the next L2 bucket.
+ */
+#define L2_NEXT_BUCKET(va) (((va) & L1_S_FRAME) + L1_S_SIZE)
+
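
A worked example of how a VA lands in a particular l2_dtable and bucket, using assumed values L2_BUCKET_LOG2 == 4 (so each l2_dtable covers 16 MB) and L2_SIZE == 256:

#include <stdint.h>
#include <stdio.h>

/* Assumed values: 16 buckets per l2_dtable (16 MB), 256 l2_dtables per 4 GB. */
#define L2_BUCKET_LOG2	4
#define L2_BUCKET_SIZE	(1 << L2_BUCKET_LOG2)
#define L2_SIZE		256

#define L1_IDX(va)	((uint32_t)(va) >> 20)
#define L2_IDX(l1idx)	(((l1idx) >> L2_BUCKET_LOG2) & (L2_SIZE - 1))
#define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1))

int
main(void)
{
	uint32_t va = 0xc0312000;
	uint32_t l1idx = L1_IDX(va);		/* 0xc03 */

	/* 0xc03 >> 4 = 0xc0 -> l2_dtable 0xc0, bucket 0xc03 & 0xf = 3 */
	printf("va 0x%08x: l1idx 0x%03x, l2_dtable %u, bucket %u\n",
	    va, l1idx, L2_IDX(l1idx), L2_BUCKET(l1idx));
	return (0);
}
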
+/*
+ * L2 allocation.
+ */
+#define pmap_alloc_l2_dtable() \
+ (void*)uma_zalloc(l2table_zone, M_NOWAIT|M_USE_RESERVE)
+#define pmap_free_l2_dtable(l2) \
+ uma_zfree(l2table_zone, l2)
+
+/*
+ * We try to map the page tables write-through, if possible. However, not
+ * all CPUs have a write-through cache mode, so on those we have to sync
+ * the cache when we frob page tables.
+ *
+ * We try to evaluate this at compile time, if possible. However, it's
+ * not always possible to do that, hence this run-time var.
+ */
+int pmap_needs_pte_sync;
+
+/*
+ * Macro to determine if a mapping might be resident in the
+ * instruction cache and/or TLB
+ */
+#define PV_BEEN_EXECD(f) (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC))
+
+/*
+ * Macro to determine if a mapping might be resident in the
+ * data cache and/or TLB
+ */
+#define PV_BEEN_REFD(f) (((f) & PVF_REF) != 0)
+
+#ifndef PMAP_SHPGPERPROC
+#define PMAP_SHPGPERPROC 200
+#endif
+
+#define pmap_is_current(pm) ((pm) == pmap_kernel() || \
+ curproc->p_vmspace->vm_map.pmap == (pm))
+static uma_zone_t pvzone = NULL;
+uma_zone_t l2zone;
+static uma_zone_t l2table_zone;
+static vm_offset_t pmap_kernel_l2dtable_kva;
+static vm_offset_t pmap_kernel_l2ptp_kva;
+static vm_paddr_t pmap_kernel_l2ptp_phys;
+static struct vm_object pvzone_obj;
+static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
+
+int l1_mem_types[] = {
+ ARM_L1S_STRONG_ORD,
+ ARM_L1S_DEVICE_NOSHARE,
+ ARM_L1S_DEVICE_SHARE,
+ ARM_L1S_NRML_NOCACHE,
+ ARM_L1S_NRML_IWT_OWT,
+ ARM_L1S_NRML_IWB_OWB,
+ ARM_L1S_NRML_IWBA_OWBA
+};
+
+int l2l_mem_types[] = {
+ ARM_L2L_STRONG_ORD,
+ ARM_L2L_DEVICE_NOSHARE,
+ ARM_L2L_DEVICE_SHARE,
+ ARM_L2L_NRML_NOCACHE,
+ ARM_L2L_NRML_IWT_OWT,
+ ARM_L2L_NRML_IWB_OWB,
+ ARM_L2L_NRML_IWBA_OWBA
+};
+
+int l2s_mem_types[] = {
+ ARM_L2S_STRONG_ORD,
+ ARM_L2S_DEVICE_NOSHARE,
+ ARM_L2S_DEVICE_SHARE,
+ ARM_L2S_NRML_NOCACHE,
+ ARM_L2S_NRML_IWT_OWT,
+ ARM_L2S_NRML_IWB_OWB,
+ ARM_L2S_NRML_IWBA_OWBA
+};
+
+/*
+ * This list exists for the benefit of pmap_map_chunk(). It keeps track
+ * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can
+ * find them as necessary.
+ *
+ * Note that the data on this list MUST remain valid after initarm() returns,
+ * as pmap_bootstrap() uses it to construct L2 table metadata.
+ */
+SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list);
+
+static void
+pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt)
+{
+ int i;
+
+ l1->l1_kva = l1pt;
+ l1->l1_domain_use_count = 0;
+ l1->l1_domain_first = 0;
+
+ for (i = 0; i < PMAP_DOMAINS; i++)
+ l1->l1_domain_free[i] = i + 1;
+
+ /*
+ * Copy the kernel's L1 entries to each new L1.
+ */
+ if (l1pt != pmap_kernel()->pm_l1->l1_kva)
+ memcpy(l1pt, pmap_kernel()->pm_l1->l1_kva, L1_TABLE_SIZE);
+
+ if ((l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt)) == 0)
+ panic("pmap_init_l1: can't get PA of L1 at %p", l1pt);
+ SLIST_INSERT_HEAD(&l1_list, l1, l1_link);
+ TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
+}
+
+static vm_offset_t
+kernel_pt_lookup(vm_paddr_t pa)
+{
+ struct pv_addr *pv;
+
+ SLIST_FOREACH(pv, &kernel_pt_list, pv_list) {
+ if (pv->pv_pa == pa)
+ return (pv->pv_va);
+ }
+ return (0);
+}
+
+void
+pmap_pte_init_mmu_v6(void)
+{
+
+ if (PTE_PAGETABLE >= 3)
+ pmap_needs_pte_sync = 1;
+ pte_l1_s_cache_mode = l1_mem_types[PTE_CACHE];
+ pte_l2_l_cache_mode = l2l_mem_types[PTE_CACHE];
+ pte_l2_s_cache_mode = l2s_mem_types[PTE_CACHE];
+
+ pte_l1_s_cache_mode_pt = l1_mem_types[PTE_PAGETABLE];
+ pte_l2_l_cache_mode_pt = l2l_mem_types[PTE_PAGETABLE];
+ pte_l2_s_cache_mode_pt = l2s_mem_types[PTE_PAGETABLE];
+
+}
+
+/*
+ * Allocate an L1 translation table for the specified pmap.
+ * This is called at pmap creation time.
+ */
+static void
+pmap_alloc_l1(pmap_t pm)
+{
+ struct l1_ttable *l1;
+ u_int8_t domain;
+
+ /*
+ * Remove the L1 at the head of the LRU list
+ */
+ mtx_lock(&l1_lru_lock);
+ l1 = TAILQ_FIRST(&l1_lru_list);
+ TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
+
+ /*
+ * Pick the first available domain number, and update
+ * the link to the next number.
+ */
+ domain = l1->l1_domain_first;
+ l1->l1_domain_first = l1->l1_domain_free[domain];
+
+ /*
+ * If there are still free domain numbers in this L1,
+ * put it back on the TAIL of the LRU list.
+ */
+ if (++l1->l1_domain_use_count < PMAP_DOMAINS)
+ TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
+
+ mtx_unlock(&l1_lru_lock);
+
+ /*
+ * Fix up the relevant bits in the pmap structure
+ */
+ pm->pm_l1 = l1;
+ pm->pm_domain = domain + 1;
+}
+
+/*
+ * Free an L1 translation table.
+ * This is called at pmap destruction time.
+ */
+static void
+pmap_free_l1(pmap_t pm)
+{
+ struct l1_ttable *l1 = pm->pm_l1;
+
+ mtx_lock(&l1_lru_lock);
+
+ /*
+ * If this L1 is currently on the LRU list, remove it.
+ */
+ if (l1->l1_domain_use_count < PMAP_DOMAINS)
+ TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
+
+ /*
+ * Free up the domain number which was allocated to the pmap
+ */
+ l1->l1_domain_free[pm->pm_domain - 1] = l1->l1_domain_first;
+ l1->l1_domain_first = pm->pm_domain - 1;
+ l1->l1_domain_use_count--;
+
+ /*
+ * The L1 now must have at least 1 free domain, so add
+ * it back to the LRU list. If the use count is zero,
+ * put it at the head of the list, otherwise it goes
+ * to the tail.
+ */
+ if (l1->l1_domain_use_count == 0) {
+ TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru);
+ } else
+ TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
+
+ mtx_unlock(&l1_lru_lock);
+}
+
+/*
+ * Returns a pointer to the L2 bucket associated with the specified pmap
+ * and VA, or NULL if no L2 bucket exists for the address.
+ */
+static PMAP_INLINE struct l2_bucket *
+pmap_get_l2_bucket(pmap_t pm, vm_offset_t va)
+{
+ struct l2_dtable *l2;
+ struct l2_bucket *l2b;
+ u_short l1idx;
+
+ l1idx = L1_IDX(va);
+
+ if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL ||
+ (l2b = &l2->l2_bucket[L2_BUCKET(l1idx)])->l2b_kva == NULL)
+ return (NULL);
+
+ return (l2b);
+}
+
+/*
+ * Returns a pointer to the L2 bucket associated with the specified pmap
+ * and VA.
+ *
+ * If no L2 bucket exists, perform the necessary allocations to put an L2
+ * bucket/page table in place.
+ *
+ * Note that if a new L2 bucket/page was allocated, the caller *must*
+ * increment the bucket occupancy counter appropriately *before*
+ * releasing the pmap's lock to ensure no other thread or cpu deallocates
+ * the bucket/page in the meantime.
+ */
+static struct l2_bucket *
+pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va)
+{
+ struct l2_dtable *l2;
+ struct l2_bucket *l2b;
+ u_short l1idx;
+
+ l1idx = L1_IDX(va);
+
+ PMAP_ASSERT_LOCKED(pm);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
+ /*
+ * No mapping at this address, as there is
+ * no entry in the L1 table.
+ * Need to allocate a new l2_dtable.
+ */
+again_l2table:
+ PMAP_UNLOCK(pm);
+ vm_page_unlock_queues();
+ if ((l2 = pmap_alloc_l2_dtable()) == NULL) {
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ return (NULL);
+ }
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ if (pm->pm_l2[L2_IDX(l1idx)] != NULL) {
+ PMAP_UNLOCK(pm);
+ vm_page_unlock_queues();
+ uma_zfree(l2table_zone, l2);
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ l2 = pm->pm_l2[L2_IDX(l1idx)];
+ if (l2 == NULL)
+ goto again_l2table;
+ /*
+ * Someone already allocated the l2_dtable while
+ * we were doing the same.
+ */
+ } else {
+ bzero(l2, sizeof(*l2));
+ /*
+ * Link it into the parent pmap
+ */
+ pm->pm_l2[L2_IDX(l1idx)] = l2;
+ }
+ }
+
+ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
+
+ /*
+ * Fetch pointer to the L2 page table associated with the address.
+ */
+ if (l2b->l2b_kva == NULL) {
+ pt_entry_t *ptep;
+
+ /*
+ * No L2 page table has been allocated. Chances are, this
+ * is because we just allocated the l2_dtable, above.
+ */
+again_ptep:
+ PMAP_UNLOCK(pm);
+ vm_page_unlock_queues();
+ ptep = (void*)uma_zalloc(l2zone, M_NOWAIT|M_USE_RESERVE);
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ if (l2b->l2b_kva != 0) {
+ /* We lost the race. */
+ PMAP_UNLOCK(pm);
+ vm_page_unlock_queues();
+ uma_zfree(l2zone, ptep);
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ if (l2b->l2b_kva == 0)
+ goto again_ptep;
+ return (l2b);
+ }
+		if (ptep == NULL) {
+			/*
+			 * Oops, no more L2 page tables available at this
+			 * time. We may need to deallocate the l2_dtable
+			 * if we allocated a new one above.
+			 */
+			if (l2->l2_occupancy == 0) {
+				pm->pm_l2[L2_IDX(l1idx)] = NULL;
+				pmap_free_l2_dtable(l2);
+			}
+			return (NULL);
+		}
+		l2b->l2b_phys = vtophys(ptep);
+
+ l2->l2_occupancy++;
+ l2b->l2b_kva = ptep;
+ l2b->l2b_l1idx = l1idx;
+ }
+
+ return (l2b);
+}
+
+static PMAP_INLINE void
+pmap_free_l2_ptp(pt_entry_t *l2)
+{
+ uma_zfree(l2zone, l2);
+}
+
+/*
+ * One or more mappings in the specified L2 descriptor table have just been
+ * invalidated.
+ *
+ * Garbage collect the metadata and descriptor table itself if necessary.
+ *
+ * The pmap lock must be acquired when this is called (not necessary
+ * for the kernel pmap).
+ */
+static void
+pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count)
+{
+ struct l2_dtable *l2;
+ pd_entry_t *pl1pd, l1pd;
+ pt_entry_t *ptep;
+ u_short l1idx;
+
+
+ /*
+ * Update the bucket's reference count according to how many
+ * PTEs the caller has just invalidated.
+ */
+ l2b->l2b_occupancy -= count;
+
+ /*
+ * Note:
+ *
+ * Level 2 page tables allocated to the kernel pmap are never freed
+ * as that would require checking all Level 1 page tables and
+ * removing any references to the Level 2 page table. See also the
+ * comment elsewhere about never freeing bootstrap L2 descriptors.
+ *
+ * We make do with just invalidating the mapping in the L2 table.
+ *
+ * This isn't really a big deal in practice and, in fact, leads
+ * to a performance win over time as we don't need to continually
+ * alloc/free.
+ */
+ if (l2b->l2b_occupancy > 0 || pm == pmap_kernel())
+ return;
+
+ /*
+ * There are no more valid mappings in this level 2 page table.
+ * Go ahead and NULL-out the pointer in the bucket, then
+ * free the page table.
+ */
+ l1idx = l2b->l2b_l1idx;
+ ptep = l2b->l2b_kva;
+ l2b->l2b_kva = NULL;
+
+ pl1pd = &pm->pm_l1->l1_kva[l1idx];
+
+ /*
+ * If the L1 slot matches the pmap's domain
+ * number, then invalidate it.
+ */
+ l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK);
+ if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) {
+ *pl1pd = 0;
+ PTE_SYNC(pl1pd);
+ }
+
+ /*
+ * Release the L2 descriptor table back to the pool cache.
+ */
+ pmap_free_l2_ptp(ptep);
+
+ /*
+ * Update the reference count in the associated l2_dtable
+ */
+ l2 = pm->pm_l2[L2_IDX(l1idx)];
+ if (--l2->l2_occupancy > 0)
+ return;
+
+ /*
+ * There are no more valid mappings in any of the Level 1
+ * slots managed by this l2_dtable. Go ahead and NULL-out
+ * the pointer in the parent pmap and free the l2_dtable.
+ */
+ pm->pm_l2[L2_IDX(l1idx)] = NULL;
+ pmap_free_l2_dtable(l2);
+}
+
+/*
+ * Pool cache constructors for L2 descriptor tables, metadata and pmap
+ * structures.
+ */
+static int
+pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep, pte;
+ vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK;
+
+ /*
+ * The mappings for these page tables were initially made using
+ * pmap_kenter() by the pool subsystem. Therefore, the cache-
+ * mode will not be right for page table mappings. To avoid
+ * polluting the pmap_kenter() code with a special case for
+ * page tables, we simply fix up the cache-mode here if it's not
+ * correct.
+ */
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ pte = *ptep;
+
+ cpu_idcache_wbinv_range(va, PAGE_SIZE);
+#ifdef ARM_L2_PIPT
+ cpu_l2cache_wbinv_range(pte & L2_S_FRAME, PAGE_SIZE);
+#else
+ cpu_l2cache_wbinv_range(va, PAGE_SIZE);
+#endif
+ if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
+ /*
+ * Page tables must have the cache-mode set to
+ * Write-Thru.
+ */
+ *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
+ PTE_SYNC(ptep);
+ cpu_tlb_flushD_SE(va);
+ cpu_cpwait();
+ }
+
+ memset(mem, 0, L2_TABLE_SIZE_REAL);
+ return (0);
+}
+
+/*
+ * Modify pte bits for all ptes corresponding to the given physical address.
+ * We use `maskbits' rather than `clearbits' because we're always passing
+ * constants and the latter would require an extra inversion at run-time.
+ */
+static int
+pmap_clearbit(struct vm_page *pg, u_int maskbits)
+{
+ struct l2_bucket *l2b;
+ struct pv_entry *pv;
+ pt_entry_t *ptep, npte, opte;
+ pmap_t pm;
+ vm_offset_t va;
+ u_int oflags;
+ int count = 0;
+
+ vm_page_lock_queues();
+
+ if (maskbits & PVF_WRITE)
+ maskbits |= PVF_MOD;
+ /*
+ * Clear saved attributes (modify, reference)
+ */
+ pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF));
+
+ if (TAILQ_EMPTY(&pg->md.pv_list)) {
+ vm_page_unlock_queues();
+ return (0);
+ }
+
+ /*
+	 * Loop over all current mappings, setting/clearing as appropriate
+ */
+ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
+ va = pv->pv_va;
+ pm = pv->pv_pmap;
+ oflags = pv->pv_flags;
+ pv->pv_flags &= ~maskbits;
+
+ PMAP_LOCK(pm);
+
+ l2b = pmap_get_l2_bucket(pm, va);
+
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ npte = opte = *ptep;
+
+ if ((maskbits & (PVF_WRITE|PVF_MOD)) && L2_S_WRITABLE(opte)) {
+ vm_page_dirty(pg);
+
+ /* make the pte read only */
+ npte |= L2_APX;
+ }
+
+ if (maskbits & PVF_REF) {
+ /*
+ * Make the PTE invalid so that we will take a
+ * page fault the next time the mapping is
+ * referenced.
+ */
+ npte &= ~L2_TYPE_MASK;
+ npte |= L2_TYPE_INV;
+ }
+
+ CTR4(KTR_PMAP,"clearbit: pmap:%p bits:%x pte:%x->%x",
+ pm, maskbits, opte, npte);
+ if (npte != opte) {
+ count++;
+ *ptep = npte;
+ PTE_SYNC(ptep);
+ /* Flush the TLB entry if a current pmap. */
+ if (PV_BEEN_EXECD(oflags))
+ cpu_tlb_flushID_SE(pv->pv_va);
+ else if (PV_BEEN_REFD(oflags))
+ cpu_tlb_flushD_SE(pv->pv_va);
+ }
+
+ PMAP_UNLOCK(pm);
+
+ }
+
+ if (maskbits & PVF_WRITE)
+ vm_page_aflag_clear(pg, PGA_WRITEABLE);
+ vm_page_unlock_queues();
+ return (count);
+}
+
+/*
+ * main pv_entry manipulation functions:
+ * pmap_enter_pv: enter a mapping onto a vm_page list
+ *   pmap_remove_pv: remove a mapping from a vm_page list
+ *
+ * NOTE: pmap_enter_pv expects to lock the pvh itself
+ *       pmap_remove_pv expects the caller to lock the pvh before calling
+ */
+
+/*
+ * pmap_enter_pv: enter a mapping onto a vm_page list
+ *
+ * => caller should hold the proper lock on pmap_main_lock
+ * => caller should have pmap locked
+ * => we will gain the lock on the vm_page and allocate the new pv_entry
+ * => caller should adjust ptp's wire_count before calling
+ * => caller should not adjust pmap's wire_count
+ */
+static void
+pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm,
+ vm_offset_t va, u_int flags)
+{
+
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+
+ PMAP_ASSERT_LOCKED(pm);
+ pve->pv_pmap = pm;
+ pve->pv_va = va;
+ pve->pv_flags = flags;
+
+ TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list);
+ TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist);
+ pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD);
+ if (pve->pv_flags & PVF_WIRED)
+ ++pm->pm_stats.wired_count;
+ vm_page_aflag_set(pg, PGA_REFERENCED);
+}
+
+/*
+ *
+ * pmap_find_pv: Find a pv entry
+ *
+ * => caller should hold lock on vm_page
+ */
+static PMAP_INLINE struct pv_entry *
+pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
+{
+ struct pv_entry *pv;
+
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list)
+ if (pm == pv->pv_pmap && va == pv->pv_va)
+ break;
+ return (pv);
+}
+
+/*
+ * vector_page_setprot:
+ *
+ * Manipulate the protection of the vector page.
+ */
+void
+vector_page_setprot(int prot)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep;
+
+ l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page);
+
+ ptep = &l2b->l2b_kva[l2pte_index(vector_page)];
+
+ pmap_set_prot(ptep, prot|VM_PROT_EXECUTE, 0);
+
+ cpu_tlb_flushD_SE(vector_page);
+ cpu_cpwait();
+}
+
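+/*
+ * Encode the access permissions in 'prot' into the given PTE: the entry
+ * is made read-only unless VM_PROT_WRITE is requested, execute-never is
+ * set unless VM_PROT_EXECUTE is requested, and 'user' additionally grants
+ * user-mode access.  The caller is responsible for PTE_SYNC and any TLB
+ * maintenance.
+ */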
+static void
+pmap_set_prot(pt_entry_t *ptep, vm_prot_t prot, uint8_t user)
+{
+
+ *ptep &= ~L2_S_PROT_MASK;
+
+ if (!(prot & VM_PROT_EXECUTE))
+ *ptep |= L2_XN;
+
+ *ptep |= L2_S_PROT_R;
+
+ if (user)
+ *ptep |= L2_S_PROT_U;
+
+ if (prot & VM_PROT_WRITE)
+ *ptep &= ~(L2_APX);
+}
+
+/*
+ * pmap_remove_pv: try to remove a mapping from a pv_list
+ *
+ * => caller should hold proper lock on pmap_main_lock
+ * => pmap should be locked
+ * => caller should hold lock on vm_page [so that attrs can be adjusted]
+ * => caller should adjust ptp's wire_count and free PTP if needed
+ * => caller should NOT adjust pmap's wire_count
+ * => we return the removed pve
+ */
+
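+/*
+ * pmap_nuke_pv: unlink a pv entry from its vm_page and pmap lists and
+ * update the page's attribute bits; the caller is responsible for
+ * freeing the pv entry itself.
+ */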
+static void
+pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve)
+{
+
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ PMAP_ASSERT_LOCKED(pm);
+
+ TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list);
+ TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist);
+
+ if (pve->pv_flags & PVF_WIRED)
+ --pm->pm_stats.wired_count;
+
+ if (pg->md.pvh_attrs & PVF_MOD)
+ vm_page_dirty(pg);
+
+ if (TAILQ_FIRST(&pg->md.pv_list) == NULL)
+ pg->md.pvh_attrs &= ~PVF_REF;
+ else
+ vm_page_aflag_set(pg, PGA_REFERENCED);
+
+ if (pve->pv_flags & PVF_WRITE) {
+ TAILQ_FOREACH(pve, &pg->md.pv_list, pv_list)
+ if (pve->pv_flags & PVF_WRITE)
+ break;
+ if (!pve) {
+ pg->md.pvh_attrs &= ~PVF_MOD;
+ vm_page_aflag_clear(pg, PGA_WRITEABLE);
+ }
+ }
+}
+
+static struct pv_entry *
+pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
+{
+ struct pv_entry *pve;
+
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ pve = TAILQ_FIRST(&pg->md.pv_list);
+
+ while (pve) {
+ if (pve->pv_pmap == pm && pve->pv_va == va) { /* match? */
+ pmap_nuke_pv(pg, pm, pve);
+ break;
+ }
+ pve = TAILQ_NEXT(pve, pv_list);
+ }
+
+ return(pve); /* return removed pve */
+}
+
+/*
+ *
+ * pmap_modify_pv: Update pv flags
+ *
+ * => caller should hold lock on vm_page [so that attrs can be adjusted]
+ * => caller should NOT adjust pmap's wire_count
+ * => we return the old flags
+ *
+ * Modify a physical-virtual mapping in the pv table
+ */
+static u_int
+pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va,
+ u_int clr_mask, u_int set_mask)
+{
+ struct pv_entry *npv;
+ u_int flags, oflags;
+
+ PMAP_ASSERT_LOCKED(pm);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ if ((npv = pmap_find_pv(pg, pm, va)) == NULL)
+ return (0);
+
+ /*
+ * There is at least one VA mapping this page.
+ */
+
+ if (clr_mask & (PVF_REF | PVF_MOD))
+ pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD);
+
+ oflags = npv->pv_flags;
+ npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask;
+
+ if ((flags ^ oflags) & PVF_WIRED) {
+ if (flags & PVF_WIRED)
+ ++pm->pm_stats.wired_count;
+ else
+ --pm->pm_stats.wired_count;
+ }
+ if ((oflags & PVF_WRITE) && !(flags & PVF_WRITE)) {
+ TAILQ_FOREACH(npv, &pg->md.pv_list, pv_list) {
+ if (npv->pv_flags & PVF_WRITE)
+ break;
+ }
+ if (!npv) {
+ pg->md.pvh_attrs &= ~PVF_MOD;
+ vm_page_aflag_clear(pg, PGA_WRITEABLE);
+ }
+ }
+
+ return (oflags);
+}
+
+/* Function to set the debug level of the pmap code */
+#ifdef PMAP_DEBUG
+void
+pmap_debug(int level)
+{
+ pmap_debug_level = level;
+ dprintf("pmap_debug: level=%d\n", pmap_debug_level);
+}
+#endif /* PMAP_DEBUG */
+
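+/*
+ * Initialize the pmap for process 0 by duplicating the kernel pmap and
+ * giving it a freshly initialized lock.
+ */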
+void
+pmap_pinit0(struct pmap *pmap)
+{
+ PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap));
+
+ dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n",
+ (u_int32_t) pmap, (u_int32_t) pmap->pm_pdir);
+ bcopy(kernel_pmap, pmap, sizeof(*pmap));
+ bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx));
+ PMAP_LOCK_INIT(pmap);
+}
+
+/*
+ * Initialize a vm_page's machine-dependent fields.
+ */
+void
+pmap_page_init(vm_page_t m)
+{
+
+ TAILQ_INIT(&m->md.pv_list);
+}
+
+/*
+ * Initialize the pmap module.
+ * Called by vm_init, to initialize any structures that the pmap
+ * system needs to map virtual memory.
+ */
+void
+pmap_init(void)
+{
+ int shpgperproc = PMAP_SHPGPERPROC;
+
+ PDEBUG(1, printf("pmap_init: phys_start = %08x\n", PHYSADDR));
+
+ /*
+ * init the pv free list
+ */
+ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
+ /*
+ * Now it is safe to enable pv_table recording.
+ */
+ PDEBUG(1, printf("pmap_init: done!\n"));
+
+ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
+
+ pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
+ pv_entry_high_water = 9 * (pv_entry_max / 10);
+ l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
+ l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_VM | UMA_ZONE_NOFREE);
+
+ uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
+
+}
+
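+/*
+ * Fix up a fault on the given pmap/VA pair: perform the referenced and
+ * modified bit emulation encoded in the PTE and repair a stale or missing
+ * L1 entry.  Returns non-zero if the fault was handled here, zero if it
+ * must be resolved by vm_fault().
+ */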
+int
+pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user)
+{
+ struct l2_dtable *l2;
+ struct l2_bucket *l2b;
+ pd_entry_t *pl1pd, l1pd;
+ pt_entry_t *ptep, pte;
+ vm_paddr_t pa;
+ u_int l1idx;
+ int rv = 0;
+
+ l1idx = L1_IDX(va);
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+
+ /*
+ * If there is no l2_dtable for this address, then the process
+ * has no business accessing it.
+ *
+ * Note: This will catch userland processes trying to access
+ * kernel addresses.
+ */
+ l2 = pm->pm_l2[L2_IDX(l1idx)];
+ if (l2 == NULL)
+ goto out;
+
+ /*
+ * Likewise if there is no L2 descriptor table
+ */
+ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
+ if (l2b->l2b_kva == NULL)
+ goto out;
+
+ /*
+ * Check the PTE itself.
+ */
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ pte = *ptep;
+ if (pte == 0)
+ goto out;
+
+ /*
+ * Catch a userland access to the vector page mapped at 0x0
+ */
+ if (user && ((pte & L2_S_PROT_MASK) == L2_S_PROT_U))
+ goto out;
+ if (va == vector_page)
+ goto out;
+
+ pa = l2pte_pa(pte);
+ CTR5(KTR_PMAP, "pmap_fault_fix: pmap:%p va:%x pte:0x%x ftype:%x user:%x",
+ pm, va, pte, ftype, user);
+ if ((ftype & VM_PROT_WRITE) && !(L2_S_WRITABLE(pte))) {
+ /*
+ * This looks like a good candidate for "page modified"
+ * emulation...
+ */
+ struct pv_entry *pv;
+ struct vm_page *pg;
+
+ /* Extract the physical address of the page */
+ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) {
+ goto out;
+ }
+ /* Get the current flags for this page. */
+
+ pv = pmap_find_pv(pg, pm, va);
+ if (pv == NULL) {
+ goto out;
+ }
+
+ /*
+ * Do the flags say this page is writable? If not then it
+ * is a genuine write fault. If yes then the write fault is
+ * our fault as we did not reflect the write access in the
+		 * PTE. Now that we know a write has occurred, we can correct
+		 * this and also set the modified bit.
+ */
+ if ((pv->pv_flags & PVF_WRITE) == 0) {
+ goto out;
+ }
+ pg->md.pvh_attrs |= PVF_REF | PVF_MOD;
+ vm_page_dirty(pg);
+ pv->pv_flags |= PVF_REF | PVF_MOD;
+
+ /* Re-enable write permissions for the page */
+ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
+ pmap_set_prot(ptep, VM_PROT_WRITE, *ptep & L2_S_PROT_U);
+ CTR1(KTR_PMAP, "pmap_fault_fix: new pte:0x%x", pte);
+ PTE_SYNC(ptep);
+ rv = 1;
+ } else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) {
+ /*
+ * This looks like a good candidate for "page referenced"
+ * emulation.
+ */
+ struct pv_entry *pv;
+ struct vm_page *pg;
+
+ /* Extract the physical address of the page */
+ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
+ goto out;
+ /* Get the current flags for this page. */
+
+ pv = pmap_find_pv(pg, pm, va);
+ if (pv == NULL)
+ goto out;
+
+ pg->md.pvh_attrs |= PVF_REF;
+ pv->pv_flags |= PVF_REF;
+
+
+ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
+ PTE_SYNC(ptep);
+ rv = 1;
+ }
+
+ /*
+ * We know there is a valid mapping here, so simply
+ * fix up the L1 if necessary.
+ */
+ pl1pd = &pm->pm_l1->l1_kva[l1idx];
+ l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO;
+ if (*pl1pd != l1pd) {
+ *pl1pd = l1pd;
+ PTE_SYNC(pl1pd);
+ rv = 1;
+ }
+
+#ifdef DEBUG
+ /*
+ * If 'rv == 0' at this point, it generally indicates that there is a
+ * stale TLB entry for the faulting address. This happens when two or
+ * more processes are sharing an L1. Since we don't flush the TLB on
+ * a context switch between such processes, we can take domain faults
+ * for mappings which exist at the same VA in both processes. EVEN IF
+ * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for
+ * example.
+ *
+ * This is extremely likely to happen if pmap_enter() updated the L1
+ * entry for a recently entered mapping. In this case, the TLB is
+ * flushed for the new mapping, but there may still be TLB entries for
+ * other mappings belonging to other processes in the 1MB range
+ * covered by the L1 entry.
+ *
+ * Since 'rv == 0', we know that the L1 already contains the correct
+ * value, so the fault must be due to a stale TLB entry.
+ *
+ * Since we always need to flush the TLB anyway in the case where we
+ * fixed up the L1, or frobbed the L2 PTE, we effectively deal with
+ * stale TLB entries dynamically.
+ *
+ * However, the above condition can ONLY happen if the current L1 is
+ * being shared. If it happens when the L1 is unshared, it indicates
+ * that other parts of the pmap are not doing their job WRT managing
+ * the TLB.
+ */
+ if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) {
+ extern int last_fault_code;
+ printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n",
+ pm, va, ftype);
+ printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n",
+ l2, l2b, ptep, pl1pd);
+ printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n",
+ pte, l1pd, last_fault_code);
+#ifdef DDB
+ Debugger();
+#endif
+ }
+#endif
+
+ cpu_tlb_flushID_SE(va);
+ cpu_cpwait();
+
+ rv = 1;
+
+out:
+ vm_page_unlock_queues();
+ PMAP_UNLOCK(pm);
+ return (rv);
+}
+
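+/*
+ * Allocate the additional L1 translation tables (beyond the bootstrap L1)
+ * needed so that maxproc processes can be accommodated, with PMAP_DOMAINS
+ * pmaps sharing each L1.
+ */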
+void
+pmap_postinit(void)
+{
+ struct l2_bucket *l2b;
+ struct l1_ttable *l1;
+ pd_entry_t *pl1pt;
+ pt_entry_t *ptep, pte;
+ vm_offset_t va, eva;
+ u_int loop, needed;
+
+ needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0);
+ needed -= 1;
+ l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK);
+
+ for (loop = 0; loop < needed; loop++, l1++) {
+ /* Allocate a L1 page table */
+ va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0,
+ 0xffffffff, L1_TABLE_SIZE, 0);
+
+ if (va == 0)
+ panic("Cannot allocate L1 KVM");
+
+ eva = va + L1_TABLE_SIZE;
+ pl1pt = (pd_entry_t *)va;
+
+ while (va < eva) {
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ pte = *ptep;
+ pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
+ *ptep = pte;
+ PTE_SYNC(ptep);
+ cpu_tlb_flushD_SE(va);
+
+ va += PAGE_SIZE;
+ }
+ pmap_init_l1(l1, pl1pt);
+ }
+#ifdef DEBUG
+ printf("pmap_postinit: Allocated %d static L1 descriptor tables\n",
+ needed);
+#endif
+}
+
+/*
+ * This is used to stuff certain critical values into the PCB where they
+ * can be accessed quickly from cpu_switch() et al.
+ */
+void
+pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb)
+{
+ struct l2_bucket *l2b;
+
+ pcb->pcb_pagedir = pm->pm_l1->l1_physaddr;
+ pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) |
+ (DOMAIN_CLIENT << (pm->pm_domain * 2));
+
+ if (vector_page < KERNBASE) {
+ pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)];
+ l2b = pmap_get_l2_bucket(pm, vector_page);
+ pcb->pcb_l1vec = l2b->l2b_phys | L1_C_PROTO |
+ L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL);
+ } else
+ pcb->pcb_pl1vec = NULL;
+}
+
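+/*
+ * Make the given thread's address space current: record the page
+ * directory and DACR values in its PCB and, for curthread, load them
+ * into the hardware unless they are already active, fixing up the
+ * vector page's L1 entry first.
+ */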
+void
+pmap_activate(struct thread *td)
+{
+ pmap_t pm;
+ struct pcb *pcb;
+
+ pm = vmspace_pmap(td->td_proc->p_vmspace);
+ pcb = td->td_pcb;
+
+ critical_enter();
+ pmap_set_pcb_pagedir(pm, pcb);
+
+ if (td == curthread) {
+ u_int cur_dacr, cur_ttb;
+
+ __asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb));
+ __asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr));
+
+ cur_ttb &= ~(L1_TABLE_SIZE - 1);
+
+ if (cur_ttb == (u_int)pcb->pcb_pagedir &&
+ cur_dacr == pcb->pcb_dacr) {
+ /*
+ * No need to switch address spaces.
+ */
+ critical_exit();
+ return;
+ }
+
+
+ /*
+ * We MUST, I repeat, MUST fix up the L1 entry corresponding
+ * to 'vector_page' in the incoming L1 table before switching
+ * to it otherwise subsequent interrupts/exceptions (including
+ * domain faults!) will jump into hyperspace.
+ */
+ if (pcb->pcb_pl1vec) {
+ *pcb->pcb_pl1vec = pcb->pcb_l1vec;
+ }
+
+ cpu_domains(pcb->pcb_dacr);
+ cpu_setttb(pcb->pcb_pagedir);
+ }
+ critical_exit();
+}
+
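+/*
+ * Ensure that the L1 section or L2 page table entry mapping 'va' in the
+ * kernel L1 'kl1' uses the page table cache mode.  Returns non-zero if
+ * the descriptor had to be modified.
+ */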
+static int
+pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va)
+{
+ pd_entry_t *pdep, pde;
+ pt_entry_t *ptep, pte;
+ vm_offset_t pa;
+ int rv = 0;
+
+ /*
+ * Make sure the descriptor itself has the correct cache mode
+ */
+ pdep = &kl1[L1_IDX(va)];
+ pde = *pdep;
+
+ if (l1pte_section_p(pde)) {
+ if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) {
+ *pdep = (pde & ~L1_S_CACHE_MASK) |
+ pte_l1_s_cache_mode_pt;
+ PTE_SYNC(pdep);
+ rv = 1;
+ }
+ } else {
+ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
+ ptep = (pt_entry_t *)kernel_pt_lookup(pa);
+ if (ptep == NULL)
+			panic("pmap_set_pt_cache_mode: no L2 table for va 0x%x",
+			    va);
+
+ ptep = &ptep[l2pte_index(va)];
+ pte = *ptep;
+ if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
+ *ptep = (pte & ~L2_S_CACHE_MASK) |
+ pte_l2_s_cache_mode_pt;
+ PTE_SYNC(ptep);
+ rv = 1;
+ }
+ }
+
+ return (rv);
+}
+
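+/*
+ * Carve 'pages' pages of KVA out of '*availp', returning the start of
+ * the range in '*vap' and, if 'ptep' is not NULL, a pointer to the first
+ * PTE mapping it.
+ */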
+static void
+pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap,
+ pt_entry_t **ptep)
+{
+ vm_offset_t va = *availp;
+ struct l2_bucket *l2b;
+
+ if (ptep) {
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+ if (l2b == NULL)
+ panic("pmap_alloc_specials: no l2b for 0x%x", va);
+
+ *ptep = &l2b->l2b_kva[l2pte_index(va)];
+ }
+
+ *vap = va;
+ *availp = va + (PAGE_SIZE * pages);
+}
+
+/*
+ * Bootstrap the system enough to run with virtual memory.
+ *
+ * On the arm this is called after mapping has already been enabled
+ * and just syncs the pmap module with what has already been done.
+ * [We can't call it easily with mapping off since the kernel is not
+ * mapped with PA == VA, hence we would have to relocate every address
+ * from the linked base (virtual) address "KERNBASE" to the actual
+ * (physical) address starting relative to 0]
+ */
+#define PMAP_STATIC_L2_SIZE 16
+
+void
+pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt)
+{
+ static struct l1_ttable static_l1;
+ static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE];
+ struct l1_ttable *l1 = &static_l1;
+ struct l2_dtable *l2;
+ struct l2_bucket *l2b;
+ pd_entry_t pde;
+ pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va;
+ pt_entry_t *ptep;
+ vm_paddr_t pa;
+ vm_offset_t va;
+ vm_size_t size;
+ int l1idx, l2idx, l2next = 0;
+
+ PDEBUG(1, printf("firstaddr = %08x, lastaddr = %08x\n",
+ firstaddr, lastaddr));
+
+ virtual_avail = firstaddr;
+ kernel_pmap->pm_l1 = l1;
+ kernel_l1pa = l1pt->pv_pa;
+
+ /*
+ * Scan the L1 translation table created by initarm() and create
+ * the required metadata for all valid mappings found in it.
+ */
+ for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) {
+ pde = kernel_l1pt[l1idx];
+
+ /*
+ * We're only interested in Coarse mappings.
+ * pmap_extract() can deal with section mappings without
+ * recourse to checking L2 metadata.
+ */
+ if ((pde & L1_TYPE_MASK) != L1_TYPE_C)
+ continue;
+
+ /*
+ * Lookup the KVA of this L2 descriptor table
+ */
+ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
+ ptep = (pt_entry_t *)kernel_pt_lookup(pa);
+
+ if (ptep == NULL) {
+ panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx",
+ (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa);
+ }
+
+ /*
+ * Fetch the associated L2 metadata structure.
+ * Allocate a new one if necessary.
+ */
+ if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) {
+ if (l2next == PMAP_STATIC_L2_SIZE)
+ panic("pmap_bootstrap: out of static L2s");
+ kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 =
+ &static_l2[l2next++];
+ }
+
+ /*
+ * One more L1 slot tracked...
+ */
+ l2->l2_occupancy++;
+
+ /*
+ * Fill in the details of the L2 descriptor in the
+ * appropriate bucket.
+ */
+ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
+ l2b->l2b_kva = ptep;
+ l2b->l2b_phys = pa;
+ l2b->l2b_l1idx = l1idx;
+
+ /*
+ * Establish an initial occupancy count for this descriptor
+ */
+ for (l2idx = 0;
+ l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
+ l2idx++) {
+ if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) {
+ l2b->l2b_occupancy++;
+ }
+ }
+
+ /*
+ * Make sure the descriptor itself has the correct cache mode.
+ * If not, fix it, but whine about the problem. Port-meisters
+ * should consider this a clue to fix up their initarm()
+ * function. :)
+ */
+ if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) {
+ printf("pmap_bootstrap: WARNING! wrong cache mode for "
+ "L2 pte @ %p\n", ptep);
+ }
+ }
+
+
+ /*
+ * Ensure the primary (kernel) L1 has the correct cache mode for
+ * a page table. Bitch if it is not correctly set.
+ */
+ for (va = (vm_offset_t)kernel_l1pt;
+ va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) {
+ if (pmap_set_pt_cache_mode(kernel_l1pt, va))
+ printf("pmap_bootstrap: WARNING! wrong cache mode for "
+ "primary L1 @ 0x%x\n", va);
+ }
+
+ cpu_dcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+ cpu_tlb_flushID();
+ cpu_cpwait();
+
+ PMAP_LOCK_INIT(kernel_pmap);
+ CPU_FILL(&kernel_pmap->pm_active);
+ kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL;
+ TAILQ_INIT(&kernel_pmap->pm_pvlist);
+
+ /*
+ * Reserve some special page table entries/VA space for temporary
+ * mapping of pages.
+ */
+
+ pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte);
+ pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte);
+ pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte);
+ pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte);
+ size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE;
+ pmap_alloc_specials(&virtual_avail,
+ round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE,
+ &pmap_kernel_l2ptp_kva, NULL);
+
+ size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE;
+ pmap_alloc_specials(&virtual_avail,
+ round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE,
+ &pmap_kernel_l2dtable_kva, NULL);
+
+ pmap_alloc_specials(&virtual_avail,
+ 1, (vm_offset_t*)&_tmppt, NULL);
+ pmap_alloc_specials(&virtual_avail,
+ MAXDUMPPGS, (vm_offset_t *)&crashdumpmap, NULL);
+ SLIST_INIT(&l1_list);
+ TAILQ_INIT(&l1_lru_list);
+ mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF);
+ pmap_init_l1(l1, kernel_l1pt);
+ cpu_dcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+
+ virtual_avail = round_page(virtual_avail);
+ virtual_end = lastaddr;
+ kernel_vm_end = pmap_curmaxkvaddr;
+ arm_nocache_startaddr = lastaddr;
+ mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF);
+
+ pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb);
+}
+
+/***************************************************
+ * Pmap allocation/deallocation routines.
+ ***************************************************/
+
+/*
+ * Release any resources held by the given physical map.
+ * Called when a pmap initialized by pmap_pinit is being released.
+ * Should only be called if the map contains no valid mappings.
+ */
+void
+pmap_release(pmap_t pmap)
+{
+ struct pcb *pcb;
+
+ cpu_idcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+ cpu_tlb_flushID();
+ cpu_cpwait();
+ if (vector_page < KERNBASE) {
+ struct pcb *curpcb = PCPU_GET(curpcb);
+ pcb = thread0.td_pcb;
+ if (pmap_is_current(pmap)) {
+ /*
+ * Frob the L1 entry corresponding to the vector
+ * page so that it contains the kernel pmap's domain
+ * number. This will ensure pmap_remove() does not
+ * pull the current vector page out from under us.
+ */
+ critical_enter();
+ *pcb->pcb_pl1vec = pcb->pcb_l1vec;
+ cpu_domains(pcb->pcb_dacr);
+ cpu_setttb(pcb->pcb_pagedir);
+ critical_exit();
+ }
+ pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE);
+ /*
+ * Make sure cpu_switch(), et al, DTRT. This is safe to do
+ * since this process has no remaining mappings of its own.
+ */
+ curpcb->pcb_pl1vec = pcb->pcb_pl1vec;
+ curpcb->pcb_l1vec = pcb->pcb_l1vec;
+ curpcb->pcb_dacr = pcb->pcb_dacr;
+ curpcb->pcb_pagedir = pcb->pcb_pagedir;
+
+ }
+ pmap_free_l1(pmap);
+ PMAP_LOCK_DESTROY(pmap);
+
+ dprintf("pmap_release()\n");
+}
+
+
+
+/*
+ * Helper function for pmap_grow_l2_bucket()
+ */
+static __inline int
+pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep;
+ vm_paddr_t pa;
+ struct vm_page *pg;
+
+ pg = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
+ if (pg == NULL)
+ return (1);
+ pa = VM_PAGE_TO_PHYS(pg);
+
+ if (pap)
+ *pap = pa;
+
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ *ptep = L2_S_PROTO | pa | cache_mode;
+ pmap_set_prot(ptep, VM_PROT_READ | VM_PROT_WRITE, 0);
+ PTE_SYNC(ptep);
+
+ return (0);
+}
+
+/*
+ * This is the same as pmap_alloc_l2_bucket(), except that it is only
+ * used by pmap_growkernel().
+ */
+static __inline struct l2_bucket *
+pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va)
+{
+ struct l2_dtable *l2;
+ struct l2_bucket *l2b;
+ struct l1_ttable *l1;
+ pd_entry_t *pl1pd;
+ u_short l1idx;
+ vm_offset_t nva;
+
+ l1idx = L1_IDX(va);
+
+ if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
+ /*
+ * No mapping at this address, as there is
+ * no entry in the L1 table.
+ * Need to allocate a new l2_dtable.
+ */
+ nva = pmap_kernel_l2dtable_kva;
+ if ((nva & PAGE_MASK) == 0) {
+ /*
+ * Need to allocate a backing page
+ */
+ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
+ return (NULL);
+ }
+
+ l2 = (struct l2_dtable *)nva;
+ nva += sizeof(struct l2_dtable);
+
+ if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva &
+ PAGE_MASK)) {
+ /*
+ * The new l2_dtable straddles a page boundary.
+ * Map in another page to cover it.
+ */
+ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
+ return (NULL);
+ }
+
+ pmap_kernel_l2dtable_kva = nva;
+
+ /*
+ * Link it into the parent pmap
+ */
+ pm->pm_l2[L2_IDX(l1idx)] = l2;
+ memset(l2, 0, sizeof(*l2));
+ }
+
+ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
+
+ /*
+ * Fetch pointer to the L2 page table associated with the address.
+ */
+ if (l2b->l2b_kva == NULL) {
+ pt_entry_t *ptep;
+
+ /*
+ * No L2 page table has been allocated. Chances are, this
+ * is because we just allocated the l2_dtable, above.
+ */
+ nva = pmap_kernel_l2ptp_kva;
+ ptep = (pt_entry_t *)nva;
+ if ((nva & PAGE_MASK) == 0) {
+ /*
+ * Need to allocate a backing page
+ */
+ if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt,
+ &pmap_kernel_l2ptp_phys))
+ return (NULL);
+ }
+ memset(ptep, 0, L2_TABLE_SIZE_REAL);
+ l2->l2_occupancy++;
+ l2b->l2b_kva = ptep;
+ l2b->l2b_l1idx = l1idx;
+ l2b->l2b_phys = pmap_kernel_l2ptp_phys;
+
+ pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL;
+ pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL;
+ }
+
+ /* Distribute new L1 entry to all other L1s */
+ SLIST_FOREACH(l1, &l1_list, l1_link) {
+ pl1pd = &l1->l1_kva[L1_IDX(va)];
+ *pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) |
+ L1_C_PROTO;
+ PTE_SYNC(pl1pd);
+ }
+
+ return (l2b);
+}
+
+
+/*
+ * grow the number of kernel page table entries, if needed
+ */
+void
+pmap_growkernel(vm_offset_t addr)
+{
+ pmap_t kpm = pmap_kernel();
+
+ if (addr <= pmap_curmaxkvaddr)
+ return; /* we are OK */
+
+ /*
+ * whoops! we need to add kernel PTPs
+ */
+
+ /* Map 1MB at a time */
+ for (; pmap_curmaxkvaddr < addr; pmap_curmaxkvaddr += L1_S_SIZE)
+ pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr);
+
+ /*
+	 * flush out the cache; this is expensive, but pmap_growkernel()
+	 * is called so rarely that it does not matter
+ */
+ cpu_dcache_wbinv_all();
+ cpu_l2cache_wbinv_all();
+ cpu_tlb_flushD();
+ cpu_cpwait();
+ kernel_vm_end = pmap_curmaxkvaddr;
+}
+
+
+/*
+ * Remove all pages from specified address space
+ * this aids process exit speeds. Also, this code
+ * is special cased for current process only, but
+ * can have the more generic (and slightly slower)
+ * mode enabled. This is much faster than pmap_remove
+ * in the case of running down an entire address space.
+ */
+void
+pmap_remove_pages(pmap_t pmap)
+{
+ struct pv_entry *pv, *npv;
+ struct l2_bucket *l2b = NULL;
+ vm_page_t m;
+ pt_entry_t *pt;
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
+ if (pv->pv_flags & PVF_WIRED) {
+ /* Cannot remove wired pages now. */
+ npv = TAILQ_NEXT(pv, pv_plist);
+ continue;
+ }
+ pmap->pm_stats.resident_count--;
+ l2b = pmap_get_l2_bucket(pmap, pv->pv_va);
+ KASSERT(l2b != NULL, ("No L2 bucket in pmap_remove_pages"));
+ pt = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
+ m = PHYS_TO_VM_PAGE(*pt & L2_ADDR_MASK);
+		KASSERT((vm_offset_t)m >= KERNBASE,
+		    ("Trying to access non-existent page va %x pte %x",
+		    pv->pv_va, *pt));
+ *pt = 0;
+ PTE_SYNC(pt);
+ npv = TAILQ_NEXT(pv, pv_plist);
+ pmap_nuke_pv(m, pmap, pv);
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ pmap_free_pv_entry(pv);
+ pmap_free_l2_bucket(pmap, l2b, 1);
+ }
+ vm_page_unlock_queues();
+ cpu_tlb_flushID();
+ cpu_cpwait();
+ PMAP_UNLOCK(pmap);
+}
+
+
+/***************************************************
+ * Low level mapping routines.....
+ ***************************************************/
+
+#ifdef ARM_HAVE_SUPERSECTIONS
+/* Map a super section into the KVA. */
+
+void
+pmap_kenter_supersection(vm_offset_t va, uint64_t pa, int flags)
+{
+ pd_entry_t pd = L1_S_PROTO | L1_S_SUPERSEC | (pa & L1_SUP_FRAME) |
+ (((pa >> 32) & 0xf) << 20) | L1_S_PROT(PTE_KERNEL,
+ VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
+ struct l1_ttable *l1;
+ vm_offset_t va0, va_end;
+
+ KASSERT(((va | pa) & L1_SUP_OFFSET) == 0,
+ ("Not a valid super section mapping"));
+ if (flags & SECTION_CACHE)
+ pd |= pte_l1_s_cache_mode;
+ else if (flags & SECTION_PT)
+ pd |= pte_l1_s_cache_mode_pt;
+
+ va0 = va & L1_SUP_FRAME;
+ va_end = va + L1_SUP_SIZE;
+ SLIST_FOREACH(l1, &l1_list, l1_link) {
+ va = va0;
+ for (; va < va_end; va += L1_S_SIZE) {
+ l1->l1_kva[L1_IDX(va)] = pd;
+ PTE_SYNC(&l1->l1_kva[L1_IDX(va)]);
+ }
+ }
+}
+#endif
+
+/* Map a section into the KVA. */
+
+void
+pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags)
+{
+ pd_entry_t pd = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL,
+ VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
+ struct l1_ttable *l1;
+
+ KASSERT(((va | pa) & L1_S_OFFSET) == 0,
+ ("Not a valid section mapping"));
+ if (flags & SECTION_CACHE)
+ pd |= pte_l1_s_cache_mode;
+ else if (flags & SECTION_PT)
+ pd |= pte_l1_s_cache_mode_pt;
+
+ SLIST_FOREACH(l1, &l1_list, l1_link) {
+ l1->l1_kva[L1_IDX(va)] = pd;
+ PTE_SYNC(&l1->l1_kva[L1_IDX(va)]);
+ }
+}
+
+/*
+ * Make a temporary mapping for a physical address. This is only intended
+ * to be used for panic dumps.
+ */
+void *
+pmap_kenter_temp(vm_paddr_t pa, int i)
+{
+ vm_offset_t va;
+
+ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
+ pmap_kenter(va, pa);
+ return ((void *)crashdumpmap);
+}
+
+/*
+ * add a wired page to the kva
+ * note that in order for the mapping to take effect -- you
+ * should do an invltlb after doing the pmap_kenter...
+ */
+static PMAP_INLINE void
+pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *pte;
+ pt_entry_t opte;
+
+ PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n",
+ (uint32_t) va, (uint32_t) pa));
+
+
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+ if (l2b == NULL)
+ l2b = pmap_grow_l2_bucket(pmap_kernel(), va);
+ KASSERT(l2b != NULL, ("No L2 Bucket"));
+
+ pte = &l2b->l2b_kva[l2pte_index(va)];
+ opte = *pte;
+ if (l2pte_valid(opte)) {
+ cpu_tlb_flushD_SE(va);
+ cpu_cpwait();
+ } else {
+ if (opte == 0)
+ l2b->l2b_occupancy++;
+ }
+
+ if (flags & KENTER_CACHE) {
+ *pte = L2_S_PROTO | pa | pte_l2_s_cache_mode;
+ pmap_set_prot(pte, VM_PROT_READ | VM_PROT_WRITE,
+ flags & KENTER_USER);
+ } else {
+ *pte = L2_S_PROTO | pa;
+ pmap_set_prot(pte, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE,
+ 0);
+ }
+
+ PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n",
+ (uint32_t) pte, opte, *pte));
+ PTE_SYNC(pte);
+ cpu_cpwait();
+}
+
+void
+pmap_kenter(vm_offset_t va, vm_paddr_t pa)
+{
+ pmap_kenter_internal(va, pa, KENTER_CACHE);
+}
+
+void
+pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa)
+{
+
+ pmap_kenter_internal(va, pa, 0);
+}
+
+void
+pmap_kenter_user(vm_offset_t va, vm_paddr_t pa)
+{
+
+ pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER);
+ /*
+ * Call pmap_fault_fixup now, to make sure we'll have no exception
+ * at the first use of the new address, or bad things will happen,
+ * as we use one of these addresses in the exception handlers.
+ */
+ pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1);
+}
+
+/*
+ * remove a page from the kernel pagetables
+ */
+void
+pmap_kremove(vm_offset_t va)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *pte, opte;
+
+ l2b = pmap_get_l2_bucket(pmap_kernel(), va);
+ if (!l2b)
+ return;
+ KASSERT(l2b != NULL, ("No L2 Bucket"));
+ pte = &l2b->l2b_kva[l2pte_index(va)];
+ opte = *pte;
+ if (l2pte_valid(opte)) {
+ va = va & ~PAGE_MASK;
+ cpu_tlb_flushD_SE(va);
+ cpu_cpwait();
+ *pte = 0;
+ }
+}
+
+
+/*
+ * Used to map a range of physical addresses into kernel
+ * virtual address space.
+ *
+ * The value passed in '*virt' is a suggested virtual address for
+ * the mapping. Architectures which can support a direct-mapped
+ * physical to virtual region can return the appropriate address
+ * within that region, leaving '*virt' unchanged. Other
+ * architectures should map the pages starting at '*virt' and
+ * update '*virt' with the first usable address after the mapped
+ * region.
+ */
+vm_offset_t
+pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
+{
+ vm_offset_t sva = *virt;
+ vm_offset_t va = sva;
+
+ PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, "
+ "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end,
+ prot));
+
+ while (start < end) {
+ pmap_kenter(va, start);
+ va += PAGE_SIZE;
+ start += PAGE_SIZE;
+ }
+ *virt = va;
+ return (sva);
+}
+
+/*
+ * Add a list of wired pages to the kva
+ * this routine is only used for temporary
+ * kernel mappings that do not need to have
+ * page modification or references recorded.
+ * Note that old mappings are simply written
+ * over. The page *must* be wired.
+ */
+void
+pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pmap_kenter_internal(va, VM_PAGE_TO_PHYS(m[i]),
+ KENTER_CACHE);
+ va += PAGE_SIZE;
+ }
+}
+
+
+/*
+ * this routine jerks page mappings from the
+ * kernel -- it is meant only for temporary mappings.
+ */
+void
+pmap_qremove(vm_offset_t va, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if (vtophys(va))
+ pmap_kremove(va);
+
+ va += PAGE_SIZE;
+ }
+}
+
+
+/*
+ * pmap_object_init_pt preloads the ptes for a given object
+ * into the specified pmap. This eliminates the blast of soft
+ * faults on process startup and immediately after an mmap.
+ */
+void
+pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
+ vm_pindex_t pindex, vm_size_t size)
+{
+
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
+ ("pmap_object_init_pt: non-device object"));
+}
+
+
+/*
+ * pmap_is_prefaultable:
+ *
+ *	Return whether or not the specified virtual address is eligible
+ * for prefault.
+ */
+boolean_t
+pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
+{
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+
+ if (!pmap_get_pde_pte(pmap, addr, &pde, &pte))
+ return (FALSE);
+ KASSERT(pte != NULL, ("Valid mapping but no pte ?"));
+ if (*pte == 0)
+ return (TRUE);
+ return (FALSE);
+}
+
+/*
+ * Fetch pointers to the PDE/PTE for the given pmap/VA pair.
+ * Returns TRUE if the mapping exists, else FALSE.
+ *
+ * NOTE: This function is only used by a couple of arm-specific modules.
+ * It is not safe to take any pmap locks here, since we could be right
+ * in the middle of debugging the pmap anyway...
+ *
+ * It is possible for this routine to return FALSE even though a valid
+ * mapping does exist. This is because we don't lock, so the metadata
+ * state may be inconsistent.
+ *
+ * NOTE: We can return a NULL *ptp in the case where the L1 pde is
+ * a "section" mapping.
+ */
+boolean_t
+pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp)
+{
+ struct l2_dtable *l2;
+ pd_entry_t *pl1pd, l1pd;
+ pt_entry_t *ptep;
+ u_short l1idx;
+
+ if (pm->pm_l1 == NULL)
+ return (FALSE);
+
+ l1idx = L1_IDX(va);
+ *pdp = pl1pd = &pm->pm_l1->l1_kva[l1idx];
+ l1pd = *pl1pd;
+
+ if (l1pte_section_p(l1pd)) {
+ *ptp = NULL;
+ return (TRUE);
+ }
+
+ if (pm->pm_l2 == NULL)
+ return (FALSE);
+
+ l2 = pm->pm_l2[L2_IDX(l1idx)];
+
+ if (l2 == NULL ||
+ (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) {
+ return (FALSE);
+ }
+
+ *ptp = &ptep[l2pte_index(va)];
+ return (TRUE);
+}
+
+/*
+ * Routine: pmap_remove_all
+ * Function:
+ * Removes this physical page from
+ * all physical maps in which it resides.
+ * Reflects back modify bits to the pager.
+ *
+ * Notes:
+ * Original versions of this routine were very
+ * inefficient because they iteratively called
+ * pmap_remove (slow...)
+ */
+void
+pmap_remove_all(vm_page_t m)
+{
+ pv_entry_t pv;
+ pt_entry_t *ptep;
+ struct l2_bucket *l2b;
+ boolean_t flush = FALSE;
+ pmap_t curpm;
+ int flags = 0;
+
+ KASSERT((m->flags & PG_FICTITIOUS) == 0,
+ ("pmap_remove_all: page %p is fictitious", m));
+
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ return;
+ vm_page_lock_queues();
+ pmap_remove_write(m);
+ curpm = vmspace_pmap(curproc->p_vmspace);
+ while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+ if (flush == FALSE && (pv->pv_pmap == curpm ||
+ pv->pv_pmap == pmap_kernel()))
+ flush = TRUE;
+
+ PMAP_LOCK(pv->pv_pmap);
+ l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
+ KASSERT(l2b != NULL, ("No l2 bucket"));
+ ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
+ *ptep = 0;
+ if (pmap_is_current(pv->pv_pmap))
+ PTE_SYNC(ptep);
+ pmap_free_l2_bucket(pv->pv_pmap, l2b, 1);
+ pv->pv_pmap->pm_stats.resident_count--;
+ flags |= pv->pv_flags;
+ pmap_nuke_pv(m, pv->pv_pmap, pv);
+ PMAP_UNLOCK(pv->pv_pmap);
+ pmap_free_pv_entry(pv);
+ }
+
+ if (flush) {
+ if (PV_BEEN_EXECD(flags))
+ cpu_tlb_flushID();
+ else
+ cpu_tlb_flushD();
+ }
+ vm_page_aflag_clear(m, PGA_WRITEABLE);
+ vm_page_unlock_queues();
+}
+
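+/*
+ * Change the memory attributes of the given range of kernel VA.  The
+ * 'mode' argument is currently unused; the range is simply remapped with
+ * the cacheability bits cleared, and the affected cache lines and TLB
+ * entries are flushed.
+ */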
+int
+pmap_change_attr(vm_offset_t sva, vm_size_t len, int mode)
+{
+ vm_offset_t base, offset, tmpva;
+ vm_size_t size;
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep, pte;
+ vm_offset_t next_bucket;
+
+ PMAP_LOCK(kernel_pmap);
+
+ base = trunc_page(sva);
+ offset = sva & PAGE_MASK;
+ size = roundup(offset + len, PAGE_SIZE);
+
+#ifdef checkit
+ /*
+ * Only supported on kernel virtual addresses, including the direct
+ * map but excluding the recursive map.
+ */
+ if (base < DMAP_MIN_ADDRESS)
+ return (EINVAL);
+#endif
+ for (tmpva = base; tmpva < base + size; ) {
+ next_bucket = L2_NEXT_BUCKET(tmpva);
+ if (next_bucket > base + size)
+ next_bucket = base + size;
+
+ l2b = pmap_get_l2_bucket(kernel_pmap, tmpva);
+ if (l2b == NULL) {
+ tmpva = next_bucket;
+ continue;
+ }
+
+ ptep = &l2b->l2b_kva[l2pte_index(tmpva)];
+
+		if (*ptep == 0) {
+			PMAP_UNLOCK(kernel_pmap);
+			return (EINVAL);
+		}
+
+ pte = *ptep &~ L2_S_CACHE_MASK;
+ cpu_idcache_wbinv_range(tmpva, PAGE_SIZE);
+#ifdef ARM_L2_PIPT
+ cpu_l2cache_wbinv_range(pte & L2_S_FRAME, PAGE_SIZE);
+#else
+ cpu_l2cache_wbinv_range(tmpva, PAGE_SIZE);
+#endif
+ *ptep = pte;
+ cpu_tlb_flushID_SE(tmpva);
+
+ dprintf("%s: for va:%x ptep:%x pte:%x\n",
+ __func__, tmpva, (uint32_t)ptep, pte);
+ tmpva += PAGE_SIZE;
+ }
+
+ PMAP_UNLOCK(kernel_pmap);
+
+ return (0);
+}
+
+/*
+ * Set the physical protection on the
+ * specified range of this map as requested.
+ */
+void
+pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep, pte;
+ vm_offset_t next_bucket;
+ u_int flags;
+ int flush;
+
+ if ((prot & VM_PROT_READ) == 0) {
+ pmap_remove(pm, sva, eva);
+ return;
+ }
+
+ if (prot & VM_PROT_WRITE) {
+ /*
+ * If this is a read->write transition, just ignore it and let
+ * vm_fault() take care of it later.
+ */
+ return;
+ }
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+
+ /*
+ * OK, at this point, we know we're doing write-protect operation.
+ * If the pmap is active, write-back the range.
+ */
+
+ flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1;
+ flags = 0;
+
+ while (sva < eva) {
+ next_bucket = L2_NEXT_BUCKET(sva);
+ if (next_bucket > eva)
+ next_bucket = eva;
+
+ l2b = pmap_get_l2_bucket(pm, sva);
+ if (l2b == NULL) {
+ sva = next_bucket;
+ continue;
+ }
+
+ ptep = &l2b->l2b_kva[l2pte_index(sva)];
+
+ while (sva < next_bucket) {
+ if ((pte = *ptep) != 0 && L2_S_WRITABLE(pte)) {
+ struct vm_page *pg;
+ u_int f;
+
+ pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
+ pmap_set_prot(ptep, prot, !(pm == pmap_kernel()));
+ PTE_SYNC(ptep);
+
+ f = pmap_modify_pv(pg, pm, sva,
+ PVF_WRITE, 0);
+ if (f & PVF_WRITE)
+ vm_page_dirty(pg);
+
+ if (flush >= 0) {
+ flush++;
+ flags |= f;
+ } else
+ if (PV_BEEN_EXECD(f))
+ cpu_tlb_flushID_SE(sva);
+ else
+ if (PV_BEEN_REFD(f))
+ cpu_tlb_flushD_SE(sva);
+ }
+
+ sva += PAGE_SIZE;
+ ptep++;
+ }
+ }
+
+
+ if (flush) {
+ if (PV_BEEN_EXECD(flags))
+ cpu_tlb_flushID();
+ else
+ if (PV_BEEN_REFD(flags))
+ cpu_tlb_flushD();
+ }
+ vm_page_unlock_queues();
+
+ PMAP_UNLOCK(pm);
+}
+
+
+/*
+ * Insert the given physical page (p) at
+ * the specified virtual address (v) in the
+ * target physical map with the protection requested.
+ *
+ * If specified, the page will be wired down, meaning
+ * that the related pte can not be reclaimed.
+ *
+ * NB: This is the only routine which MAY NOT lazy-evaluate
+ * or lose information. That is, this routine must actually
+ * insert this page into the given map NOW.
+ */
+
+void
+pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
+ vm_prot_t prot, boolean_t wired)
+{
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK);
+ PMAP_UNLOCK(pmap);
+ vm_page_unlock_queues();
+}
+
+/*
+ * The page queues and pmap must be locked.
+ */
+static void
+pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+ boolean_t wired, int flags)
+{
+ struct l2_bucket *l2b = NULL;
+ struct vm_page *opg;
+ struct pv_entry *pve = NULL;
+ pt_entry_t *ptep, npte, opte;
+ u_int nflags;
+ u_int oflags;
+ vm_paddr_t pa;
+ u_char user;
+
+ PMAP_ASSERT_LOCKED(pmap);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ if (va == vector_page) {
+ pa = systempage.pv_pa;
+ m = NULL;
+ } else {
+ KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
+ (flags & M_NOWAIT) != 0,
+ ("pmap_enter_locked: page %p is not busy", m));
+ pa = VM_PAGE_TO_PHYS(m);
+ }
+
+ user = 0;
+ /*
+ * Make sure userland mappings get the right permissions
+ */
+ if (pmap != pmap_kernel() && va != vector_page)
+ user = 1;
+
+ nflags = 0;
+
+ if (prot & VM_PROT_WRITE)
+ nflags |= PVF_WRITE;
+ if (prot & VM_PROT_EXECUTE)
+ nflags |= PVF_EXEC;
+ if (wired)
+ nflags |= PVF_WIRED;
+
+ PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, "
+ "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired));
+
+ if (pmap == pmap_kernel()) {
+ l2b = pmap_get_l2_bucket(pmap, va);
+ if (l2b == NULL)
+ l2b = pmap_grow_l2_bucket(pmap, va);
+ } else {
+do_l2b_alloc:
+ l2b = pmap_alloc_l2_bucket(pmap, va);
+ if (l2b == NULL) {
+ if (flags & M_WAITOK) {
+ PMAP_UNLOCK(pmap);
+ vm_page_unlock_queues();
+ VM_WAIT;
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ goto do_l2b_alloc;
+ }
+ return;
+ }
+ }
+
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+
+ opte = *ptep;
+ npte = pa;
+ oflags = 0;
+ if (opte) {
+ /*
+ * There is already a mapping at this address.
+ * If the physical address is different, lookup the
+ * vm_page.
+ */
+ if (l2pte_pa(opte) != pa)
+ opg = PHYS_TO_VM_PAGE(l2pte_pa(opte));
+ else
+ opg = m;
+ } else
+ opg = NULL;
+
+ if ((prot & (VM_PROT_ALL)) ||
+ (!m || m->md.pvh_attrs & PVF_REF)) {
+ /*
+ * - The access type indicates that we don't need
+ * to do referenced emulation.
+ * OR
+ * - The physical page has already been referenced
+ * so no need to re-do referenced emulation here.
+ */
+ npte |= L2_S_PROTO;
+#ifdef SMP
+ npte |= L2_SHARED;
+#endif
+
+ nflags |= PVF_REF;
+
+ if (m && ((prot & VM_PROT_WRITE) != 0 ||
+ (m->md.pvh_attrs & PVF_MOD))) {
+ /*
+ * This is a writable mapping, and the
+ * page's mod state indicates it has
+ * already been modified. Make it
+ * writable from the outset.
+ */
+ nflags |= PVF_MOD;
+ if (!(m->md.pvh_attrs & PVF_MOD))
+ vm_page_dirty(m);
+ }
+ if (m && opte)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ } else {
+ /*
+ * Need to do page referenced emulation.
+ */
+ npte |= L2_TYPE_INV;
+ }
+
+ npte |= L2_S_PROT_R;
+
+ if (prot & VM_PROT_WRITE) {
+ npte &= ~(L2_APX);
+
+ if (m != NULL &&
+ (m->oflags & VPO_UNMANAGED) == 0)
+ vm_page_aflag_set(m, PGA_WRITEABLE);
+ }
+
+ if (user)
+ npte |= L2_S_PROT_U;
+
+
+ if (!(prot & VM_PROT_EXECUTE) && m)
+ npte |= L2_XN;
+
+ npte |= pte_l2_s_cache_mode;
+
+ if (m && m == opg) {
+ /*
+ * We're changing the attrs of an existing mapping.
+ */
+ oflags = pmap_modify_pv(m, pmap, va,
+ PVF_WRITE | PVF_EXEC | PVF_WIRED |
+ PVF_MOD | PVF_REF, nflags);
+ } else {
+ /*
+ * New mapping, or changing the backing page
+ * of an existing mapping.
+ */
+ if (opg) {
+ /*
+ * Replacing an existing mapping with a new one.
+ * It is part of our managed memory so we
+ * must remove it from the PV list
+ */
+ if ((pve = pmap_remove_pv(opg, pmap, va))) {
+ oflags = pve->pv_flags;
+
+ if (m && ((m->oflags & VPO_UNMANAGED))) {
+ pmap_free_pv_entry(pve);
+ pve = NULL;
+ }
+ }
+ }
+
+ if ((m && !(m->oflags & VPO_UNMANAGED))) {
+ if ((!pve) && (pve = pmap_get_pv_entry()) == NULL)
+ panic("pmap_enter: no pv entries");
+
+ KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
+ ("pmap_enter: managed mapping within the clean submap"));
+ KASSERT(pve != NULL, ("No pv"));
+ pmap_enter_pv(m, pve, pmap, va, nflags);
+ }
+ }
+
+ /*
+ * Keep the stats up to date
+ */
+ if (opte == 0) {
+ l2b->l2b_occupancy++;
+ pmap->pm_stats.resident_count++;
+ }
+
+ CTR5(KTR_PMAP,"enter: pmap:%p va:%x prot:%x pte:%x->%x",
+ pmap, va, prot, opte, npte);
+ /*
+ * If this is just a wiring change, the two PTEs will be
+ * identical, so there's no need to update the page table.
+ */
+ if (npte != opte) {
+ boolean_t is_cached = pmap_is_current(pmap);
+
+ *ptep = npte;
+ PTE_SYNC(ptep);
+ if (is_cached) {
+ /*
+ * We only need to frob the cache/tlb if this pmap
+ * is current
+ */
+ if (L1_IDX(va) != L1_IDX(vector_page) &&
+ l2pte_valid(npte)) {
+ /*
+ * This mapping is likely to be accessed as
+ * soon as we return to userland. Fix up the
+ * L1 entry to avoid taking another
+ * page/domain fault.
+ */
+ pd_entry_t *pl1pd, l1pd;
+
+ pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)];
+ l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) |
+ L1_C_PROTO;
+ if (*pl1pd != l1pd) {
+ *pl1pd = l1pd;
+ PTE_SYNC(pl1pd);
+ }
+ }
+ }
+
+ if (PV_BEEN_EXECD(oflags))
+ cpu_tlb_flushID_SE(va);
+ else if (PV_BEEN_REFD(oflags))
+ cpu_tlb_flushD_SE(va);
+ }
+}
+
+/*
+ * Maps a sequence of resident pages belonging to the same object.
+ * The sequence begins with the given page m_start. This page is
+ * mapped at the given virtual address start. Each subsequent page is
+ * mapped at a virtual address that is offset from start by the same
+ * amount as the page is offset from m_start within the object. The
+ * last page in the sequence is the page with the largest offset from
+ * m_start that can be mapped at a virtual address less than the given
+ * virtual address end. Not every virtual page between start and end
+ * is mapped; only those for which a resident page exists with the
+ * corresponding offset from m_start are mapped.
+ */
+void
+pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
+ vm_page_t m_start, vm_prot_t prot)
+{
+ vm_page_t m;
+ vm_pindex_t diff, psize;
+
+ psize = atop(end - start);
+ m = m_start;
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
+ pmap_enter_locked(pmap, start + ptoa(diff), m, prot &
+ (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT);
+ m = TAILQ_NEXT(m, listq);
+ }
+ PMAP_UNLOCK(pmap);
+ vm_page_unlock_queues();
+}
+
+/*
+ * this code makes some *MAJOR* assumptions:
+ * 1. Current pmap & pmap exists.
+ * 2. Not wired.
+ * 3. Read access.
+ * 4. No page table pages.
+ * but is *MUCH* faster than pmap_enter...
+ */
+
+void
+pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
+ FALSE, M_NOWAIT);
+ PMAP_UNLOCK(pmap);
+ vm_page_unlock_queues();
+}
+
+/*
+ * Routine: pmap_change_wiring
+ * Function: Change the wiring attribute for a map/virtual-address
+ * pair.
+ * In/out conditions:
+ * The mapping must already exist in the pmap.
+ */
+void
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+{
+ struct l2_bucket *l2b;
+ pt_entry_t *ptep, pte;
+ vm_page_t pg;
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pmap);
+ l2b = pmap_get_l2_bucket(pmap, va);
+ KASSERT(l2b, ("No l2b bucket in pmap_change_wiring"));
+ ptep = &l2b->l2b_kva[l2pte_index(va)];
+ pte = *ptep;
+ pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
+ if (pg)
+ pmap_modify_pv(pg, pmap, va, PVF_WIRED, wired);
+ vm_page_unlock_queues();
+ PMAP_UNLOCK(pmap);
+}
+
+
+/*
+ * Copy the range specified by src_addr/len
+ * from the source map to the range dst_addr/len
+ * in the destination map.
+ *
+ * This routine is only advisory and need not do anything.
+ */
+void
+pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
+ vm_size_t len, vm_offset_t src_addr)
+{
+}
+
+
+/*
+ * Routine: pmap_extract
+ * Function:
+ * Extract the physical page address associated
+ * with the given map/virtual_address pair.
+ */
+vm_paddr_t
+pmap_extract(pmap_t pm, vm_offset_t va)
+{
+ struct l2_dtable *l2;
+ pd_entry_t l1pd;
+ pt_entry_t *ptep, pte;
+ vm_paddr_t pa;
+ u_int l1idx;
+ l1idx = L1_IDX(va);
+
+ PMAP_LOCK(pm);
+ l1pd = pm->pm_l1->l1_kva[l1idx];
+ if (l1pte_section_p(l1pd)) {
+ /*
+ * These should only happen for pmap_kernel()
+ */
+ KASSERT(pm == pmap_kernel(), ("huh"));
+ /* XXX: what to do about the bits > 32 ? */
+ if (l1pd & L1_S_SUPERSEC)
+ pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
+ else
+ pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
+ } else {
+ /*
+ * Note that we can't rely on the validity of the L1
+ * descriptor as an indication that a mapping exists.
+ * We have to look it up in the L2 dtable.
+ */
+ l2 = pm->pm_l2[L2_IDX(l1idx)];
+
+ if (l2 == NULL ||
+ (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) {
+ PMAP_UNLOCK(pm);
+ return (0);
+ }
+
+ ptep = &ptep[l2pte_index(va)];
+ pte = *ptep;
+
+ if (pte == 0) {
+ PMAP_UNLOCK(pm);
+ return (0);
+ }
+
+ switch (pte & L2_TYPE_MASK) {
+ case L2_TYPE_L:
+ pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET);
+ break;
+
+ default:
+ pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
+ break;
+ }
+ }
+
+ PMAP_UNLOCK(pm);
+ return (pa);
+}
+
+/*
+ * Atomically extract and hold the physical page with the given
+ * pmap and virtual address pair if that mapping permits the given
+ * protection.
+ *
+ */
+vm_page_t
+pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
+{
+ struct l2_dtable *l2;
+ pd_entry_t l1pd;
+ pt_entry_t *ptep, pte;
+ vm_paddr_t pa, paddr;
+ vm_page_t m = NULL;
+ u_int l1idx;
+ l1idx = L1_IDX(va);
+ paddr = 0;
+
+ PMAP_LOCK(pmap);
+retry:
+ l1pd = pmap->pm_l1->l1_kva[l1idx];
+ if (l1pte_section_p(l1pd)) {
+ /*
+ * These should only happen for pmap_kernel()
+ */
+ KASSERT(pmap == pmap_kernel(), ("huh"));
+ /* XXX: what to do about the bits > 32 ? */
+ if (l1pd & L1_S_SUPERSEC)
+ pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
+ else
+ pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
+ if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
+ goto retry;
+ if (L1_S_WRITABLE(l1pd) || (prot & VM_PROT_WRITE) == 0) {
+ m = PHYS_TO_VM_PAGE(pa);
+ vm_page_hold(m);
+ }
+ } else {
+ /*
+ * Note that we can't rely on the validity of the L1
+ * descriptor as an indication that a mapping exists.
+ * We have to look it up in the L2 dtable.
+ */
+ l2 = pmap->pm_l2[L2_IDX(l1idx)];
+
+ if (l2 == NULL ||
+ (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) {
+ PMAP_UNLOCK(pmap);
+ return (NULL);
+ }
+
+ ptep = &ptep[l2pte_index(va)];
+ pte = *ptep;
+
+ if (pte == 0) {
+ PMAP_UNLOCK(pmap);
+ return (NULL);
+ } else if ((prot & VM_PROT_WRITE) && (pte & L2_APX)) {
+ PMAP_UNLOCK(pmap);
+ return (NULL);
+ } else {
+ switch (pte & L2_TYPE_MASK) {
+ case L2_TYPE_L:
+ panic("extract and hold section mapping");
+ break;
+ default:
+ pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
+ break;
+ }
+ if (vm_page_pa_tryrelock(pmap, pa & PG_FRAME, &paddr))
+ goto retry;
+ m = PHYS_TO_VM_PAGE(pa);
+ vm_page_hold(m);
+ }
+
+ }
+
+ PMAP_UNLOCK(pmap);
+ PA_UNLOCK_COND(paddr);
+ return (m);
+}
+
+/*
+ * Initialize a preallocated and zeroed pmap structure,
+ * such as one in a vmspace structure.
+ */
+
+int
+pmap_pinit(pmap_t pmap)
+{
+ PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap));
+
+ PMAP_LOCK_INIT(pmap);
+ pmap_alloc_l1(pmap);
+ bzero(pmap->pm_l2, sizeof(pmap->pm_l2));
+
+ CPU_ZERO(&pmap->pm_active);
+
+ TAILQ_INIT(&pmap->pm_pvlist);
+ bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_stats.resident_count = 1;
+ if (vector_page < KERNBASE) {
+ pmap_enter(pmap, vector_page,
+ VM_PROT_READ, PHYS_TO_VM_PAGE(systempage.pv_pa),
+ VM_PROT_READ, 1);
+ }
+ return (1);
+}
+
+
+/***************************************************
+ * page management routines.
+ ***************************************************/
+
+
+static void
+pmap_free_pv_entry(pv_entry_t pv)
+{
+ pv_entry_count--;
+ uma_zfree(pvzone, pv);
+}
+
+
+/*
+ * get a new pv_entry, allocating a block from the system
+ * when needed.
+ * the memory allocation is performed bypassing the malloc code
+ * because of the possibility of allocations at interrupt time.
+ */
+static pv_entry_t
+pmap_get_pv_entry(void)
+{
+ pv_entry_t ret_value;
+
+ pv_entry_count++;
+ if (pv_entry_count > pv_entry_high_water)
+ pagedaemon_wakeup();
+ ret_value = uma_zalloc(pvzone, M_NOWAIT);
+ return ret_value;
+}
+
+/*
+ * Remove the given range of addresses from the specified map.
+ *
+ * It is assumed that the start and end are properly
+ * rounded to the page size.
+ */
+#define PMAP_REMOVE_CLEAN_LIST_SIZE 3
+void
+pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
+{
+ struct l2_bucket *l2b;
+ vm_offset_t next_bucket;
+ pt_entry_t *ptep;
+ u_int total;
+ u_int mappings, is_exec, is_refd;
+ int flushall = 0;
+
+
+ /*
+ * we lock in the pmap => pv_head direction
+ */
+
+ vm_page_lock_queues();
+ PMAP_LOCK(pm);
+ total = 0;
+ while (sva < eva) {
+ /*
+ * Do one L2 bucket's worth at a time.
+ */
+ next_bucket = L2_NEXT_BUCKET(sva);
+ if (next_bucket > eva)
+ next_bucket = eva;
+
+ l2b = pmap_get_l2_bucket(pm, sva);
+ if (l2b == NULL) {
+ sva = next_bucket;
+ continue;
+ }
+
+ ptep = &l2b->l2b_kva[l2pte_index(sva)];
+ mappings = 0;
+
+ while (sva < next_bucket) {
+ struct vm_page *pg;
+ pt_entry_t pte;
+ vm_paddr_t pa;
+
+ pte = *ptep;
+
+ if (pte == 0) {
+ /*
+ * Nothing here, move along
+ */
+ sva += PAGE_SIZE;
+ ptep++;
+ continue;
+ }
+
+ pm->pm_stats.resident_count--;
+ pa = l2pte_pa(pte);
+ is_exec = 0;
+ is_refd = 1;
+
+ /*
+ * Update flags. In a number of circumstances,
+ * we could cluster a lot of these and do a
+ * number of sequential pages in one go.
+ */
+ if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) {
+ struct pv_entry *pve;
+
+ pve = pmap_remove_pv(pg, pm, sva);
+ if (pve) {
+ is_exec = PV_BEEN_EXECD(pve->pv_flags);
+ is_refd = PV_BEEN_REFD(pve->pv_flags);
+ pmap_free_pv_entry(pve);
+ }
+ }
+
+ if (pmap_is_current(pm)) {
+ total++;
+ if (total < PMAP_REMOVE_CLEAN_LIST_SIZE) {
+ if (is_exec)
+ cpu_tlb_flushID_SE(sva);
+ else if (is_refd)
+ cpu_tlb_flushD_SE(sva);
+ } else if (total == PMAP_REMOVE_CLEAN_LIST_SIZE) {
+ flushall = 1;
+ }
+ }
+ *ptep = 0;
+ PTE_SYNC(ptep);
+
+ sva += PAGE_SIZE;
+ ptep++;
+ mappings++;
+ }
+
+ pmap_free_l2_bucket(pm, l2b, mappings);
+ }
+
+ vm_page_unlock_queues();
+ if (flushall)
+ cpu_tlb_flushID();
+ PMAP_UNLOCK(pm);
+}
+
+/*
+ * pmap_zero_page()
+ *
+ * Zero a given physical page by mapping it at a page hook point.
+ * In doing the zero page op, the page we zero is mapped cacheable, since
+ * on the StrongARM accesses to non-cached pages are non-burst, which makes
+ * writing _any_ bulk data very slow.
+ */
+static void
+pmap_zero_page_gen(vm_page_t pg, int off, int size)
+{
+
+ vm_paddr_t phys = VM_PAGE_TO_PHYS(pg);
+ if (!TAILQ_EMPTY(&pg->md.pv_list))
+ panic("pmap_zero_page: page has mappings");
+
+ mtx_lock(&cmtx);
+ /*
+ * Hook in the page, zero it, invalidate the TLB as needed.
+ *
+ * Note the temporary zero-page mapping must be a non-cached page in
+ * order to work without corruption when write-allocate is enabled.
+ */
+ *cdst_pte = L2_S_PROTO | phys | pte_l2_s_cache_mode;
+ pmap_set_prot(cdst_pte, VM_PROT_WRITE, 0);
+ PTE_SYNC(cdst_pte);
+ cpu_tlb_flushD_SE(cdstp);
+ cpu_cpwait();
+ if (off || size != PAGE_SIZE)
+ bzero((void *)(cdstp + off), size);
+ else
+ bzero_page(cdstp);
+
+ mtx_unlock(&cmtx);
+}
+
+/*
+ * pmap_zero_page zeros the specified hardware page by mapping
+ * the page into KVM and using bzero to clear its contents.
+ */
+void
+pmap_zero_page(vm_page_t m)
+{
+ pmap_zero_page_gen(m, 0, PAGE_SIZE);
+}
+
+
+/*
+ * pmap_zero_page_area zeros the specified hardware page by mapping
+ * the page into KVM and using bzero to clear its contents.
+ *
+ * off and size may not cover an area beyond a single hardware page.
+ */
+void
+pmap_zero_page_area(vm_page_t m, int off, int size)
+{
+
+ pmap_zero_page_gen(m, off, size);
+}
+
+
+/*
+ * pmap_zero_page_idle zeros the specified hardware page by mapping
+ * the page into KVM and using bzero to clear its contents. This
+ * is intended to be called from the vm_pagezero process only and
+ * outside of Giant.
+ */
+void
+pmap_zero_page_idle(vm_page_t m)
+{
+
+ pmap_zero_page(m);
+}
+
+/*
+ * pmap_copy_page copies the specified (machine independent)
+ * page by mapping the page into virtual memory and using
+ * bcopy to copy the page, one machine dependent page at a
+ * time.
+ */
+
+/*
+ * pmap_copy_page()
+ *
+ * Copy one physical page into another, by mapping the pages into
+ * hook points. The same comment regarding cachability as in
+ * pmap_zero_page also applies here.
+ */
+void
+pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
+{
+ /*
+ * Hold the source page's lock for the duration of the copy
+ * so that no other mappings can be created while we have a
+ * potentially aliased mapping.
+ * Map the pages into the page hook points, copy them, and purge
+ * the cache for the appropriate page. Invalidate the TLB
+ * as required.
+ */
+ mtx_lock(&cmtx);
+
+ /* For ARMv6, using the System bit is deprecated, and mapping with the AP
+ * bits set to 0x0 makes the page inaccessible. csrc_pte is mapped
+ * read/write until proper mapping defines are created for ARMv6.
+ */
+ *csrc_pte = L2_S_PROTO | src | pte_l2_s_cache_mode;
+ pmap_set_prot(csrc_pte, VM_PROT_READ, 0);
+ PTE_SYNC(csrc_pte);
+
+ *cdst_pte = L2_S_PROTO | dst | pte_l2_s_cache_mode;
+ pmap_set_prot(cdst_pte, VM_PROT_READ | VM_PROT_WRITE, 0);
+ PTE_SYNC(cdst_pte);
+ cpu_tlb_flushD_SE(csrcp);
+ cpu_tlb_flushD_SE(cdstp);
+ cpu_cpwait();
+
+ bcopy_page(csrcp, cdstp);
+
+ mtx_unlock(&cmtx);
+}
+
+void
+pmap_copy_page(vm_page_t src, vm_page_t dst)
+{
+
+ if (_arm_memcpy && PAGE_SIZE >= _min_memcpy_size &&
+ _arm_memcpy((void *)VM_PAGE_TO_PHYS(dst),
+ (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE, IS_PHYSICAL) == 0)
+ return;
+
+ pmap_copy_page_generic(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
+}
+
+/*
+ * this routine returns true if a physical page resides
+ * in the given pmap.
+ */
+boolean_t
+pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
+{
+ pv_entry_t pv;
+ int loops = 0;
+ boolean_t rv;
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_page_exists_quick: page %p is not managed", m));
+ rv = FALSE;
+ vm_page_lock_queues();
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ if (pv->pv_pmap == pmap) {
+ rv = TRUE;
+ break;
+ }
+ loops++;
+ if (loops >= 16)
+ break;
+ }
+
+ vm_page_unlock_queues();
+ return (rv);
+}
+
+/*
+ * pmap_page_wired_mappings:
+ *
+ * Return the number of managed mappings to the given physical page
+ * that are wired.
+ */
+int
+pmap_page_wired_mappings(vm_page_t m)
+{
+ pv_entry_t pv;
+ int count;
+
+ count = 0;
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ return (count);
+ vm_page_lock_queues();
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list)
+ if ((pv->pv_flags & PVF_WIRED) != 0)
+ count++;
+ vm_page_unlock_queues();
+ return (count);
+}
+
+/*
+ * pmap_is_referenced:
+ *
+ * Return whether or not the specified physical page was referenced
+ * in any physical maps.
+ */
+boolean_t
+pmap_is_referenced(vm_page_t m)
+{
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_is_referenced: page %p is not managed", m));
+ return ((m->md.pvh_attrs & PVF_REF) != 0);
+}
+
+/*
+ * pmap_ts_referenced:
+ *
+ * Return the count of reference bits for a page, clearing all of them.
+ */
+int
+pmap_ts_referenced(vm_page_t m)
+{
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_ts_referenced: page %p is not managed", m));
+ return (pmap_clearbit(m, PVF_REF));
+}
+
+
+boolean_t
+pmap_is_modified(vm_page_t m)
+{
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_is_modified: page %p is not managed", m));
+ if (m->md.pvh_attrs & PVF_MOD)
+ return (TRUE);
+
+ return(FALSE);
+}
+
+
+/*
+ * Clear the modify bits on the specified physical page.
+ */
+void
+pmap_clear_modify(vm_page_t m)
+{
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_clear_modify: page %p is not managed", m));
+ VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ KASSERT((m->oflags & VPO_BUSY) == 0,
+ ("pmap_clear_modify: page %p is busy", m));
+
+ /*
+ * If the page is not PGA_WRITEABLE, then no mappings can be modified.
+ * If the object containing the page is locked and the page is not
+ * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
+ */
+ if ((m->aflags & PGA_WRITEABLE) == 0)
+ return;
+
+ if (m->md.pvh_attrs & PVF_MOD)
+ pmap_clearbit(m, PVF_MOD);
+}
+
+
+/*
+ * pmap_clear_reference:
+ *
+ * Clear the reference bit on the specified physical page.
+ */
+void
+pmap_clear_reference(vm_page_t m)
+{
+
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_clear_reference: page %p is not managed", m));
+ if (m->md.pvh_attrs & PVF_REF)
+ pmap_clearbit(m, PVF_REF);
+}
+
+
+/*
+ * Clear the write and modified bits in each of the given page's mappings.
+ */
+void
+pmap_remove_write(vm_page_t m)
+{
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("pmap_remove_write: page %p is not managed", m));
+
+ /*
+ * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
+ * another thread while the object is locked. Thus, if PGA_WRITEABLE
+ * is clear, no page table entries need updating.
+ */
+ VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ if ((m->oflags & VPO_BUSY) != 0 ||
+ (m->aflags & PGA_WRITEABLE) != 0)
+ pmap_clearbit(m, PVF_WRITE);
+}
+
+
+/*
+ * perform the pmap work for mincore
+ */
+int
+pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
+{
+ printf("pmap_mincore()\n");
+
+ return (0);
+}
+
+void
+pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
+{
+}
+
+/*
+ * Increase the starting virtual address of the given mapping if a
+ * different alignment might result in more superpage mappings.
+ */
+void
+pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr, vm_size_t size)
+{
+}
+
+
+/*
+ * Map a set of physical memory pages into the kernel virtual
+ * address space. Return a pointer to where it is mapped. This
+ * routine is intended to be used for mapping device memory,
+ * NOT real memory.
+ */
+void *
+pmap_mapdev(vm_offset_t pa, vm_size_t size)
+{
+ vm_offset_t va, tmpva, offset;
+
+ offset = pa & PAGE_MASK;
+ size = roundup(size, PAGE_SIZE);
+
+ GIANT_REQUIRED;
+
+ va = kmem_alloc_nofault(kernel_map, size);
+ if (!va)
+ panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
+ for (tmpva = va; size > 0;) {
+ pmap_kenter_internal(tmpva, pa, 0);
+ size -= PAGE_SIZE;
+ tmpva += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ }
+
+ return ((void *)(va + offset));
+}
+
+/*
+ * pmap_map_section:
+ *
+ * Create a single section mapping.
+ */
+void
+pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
+ int prot, int cache)
+{
+ pd_entry_t *pde = (pd_entry_t *) l1pt;
+ pd_entry_t fl;
+
+ KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2"));
+
+ fl = l1_mem_types[cache];
+
+ pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
+ L1_S_PROT(PTE_KERNEL, prot) | fl | L1_S_DOM(PMAP_DOMAIN_KERNEL);
+ PTE_SYNC(&pde[va >> L1_S_SHIFT]);
+}
+
+/*
+ * pmap_link_l2pt:
+ *
+ * Link the L2 page table specified by l2pv.pv_pa into the L1
+ * page table at the slot for "va".
+ */
+void
+pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv)
+{
+ pd_entry_t *pde = (pd_entry_t *) l1pt, proto;
+ u_int slot = va >> L1_S_SHIFT;
+
+ proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO;
+
+#ifdef VERBOSE_INIT_ARM
+ printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va);
+#endif
+
+ pde[slot + 0] = proto | (l2pv->pv_pa + 0x000);
+ PTE_SYNC(&pde[slot]);
+
+ SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list);
+
+}
+
+/*
+ * pmap_map_entry
+ *
+ * Create a single page mapping.
+ */
+void
+pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
+ int cache)
+{
+ pd_entry_t *pde = (pd_entry_t *) l1pt;
+ pt_entry_t fl;
+ pt_entry_t *pte;
+
+ KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin"));
+
+ fl = l2s_mem_types[cache];
+
+ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
+ panic("pmap_map_entry: no L2 table for VA 0x%08x", va);
+
+ pte = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK);
+
+ if (pte == NULL)
+ panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va);
+
+ pte[l2pte_index(va)] = L2_S_PROTO | pa | fl;
+ pmap_set_prot(&pte[l2pte_index(va)], prot, 0);
+ PTE_SYNC(&pte[l2pte_index(va)]);
+}
+
+/*
+ * pmap_map_chunk:
+ *
+ * Map a chunk of memory using the most efficient mappings
+ * possible (section, large page, small page) into the
+ * provided L1 and L2 tables at the specified virtual address.
+ */
+vm_size_t
+pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
+ vm_size_t size, int prot, int type)
+{
+ pd_entry_t *pde = (pd_entry_t *) l1pt;
+ pt_entry_t *pte, f1, f2s, f2l;
+ vm_size_t resid;
+ int i;
+
+ resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+
+ if (l1pt == 0)
+ panic("pmap_map_chunk: no L1 table provided");
+
+#ifdef VERBOSE_INIT_ARM
+ printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x "
+ "prot=0x%x type=%d\n", pa, va, size, resid, prot, type);
+#endif
+
+ f1 = l1_mem_types[type];
+ f2l = l2l_mem_types[type];
+ f2s = l2s_mem_types[type];
+
+ size = resid;
+
+ while (resid > 0) {
+ /* See if we can use a section mapping. */
+ if (L1_S_MAPPABLE_P(va, pa, resid)) {
+#ifdef VERBOSE_INIT_ARM
+ printf("S");
+#endif
+ pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
+ L1_S_PROT(PTE_KERNEL, prot) | f1 |
+ L1_S_DOM(PMAP_DOMAIN_KERNEL);
+ PTE_SYNC(&pde[va >> L1_S_SHIFT]);
+ va += L1_S_SIZE;
+ pa += L1_S_SIZE;
+ resid -= L1_S_SIZE;
+ continue;
+ }
+
+ /*
+ * Ok, we're going to use an L2 table. Make sure
+ * one is actually in the corresponding L1 slot
+ * for the current VA.
+ */
+ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
+ panic("pmap_map_chunk: no L2 table for VA 0x%08x", va);
+
+ pte = (pt_entry_t *) kernel_pt_lookup(
+ pde[L1_IDX(va)] & L1_C_ADDR_MASK);
+ if (pte == NULL)
+ panic("pmap_map_chunk: can't find L2 table for VA"
+ "0x%08x", va);
+ /* See if we can use a L2 large page mapping. */
+ if (L2_L_MAPPABLE_P(va, pa, resid)) {
+#ifdef VERBOSE_INIT_ARM
+ printf("L");
+#endif
+ for (i = 0; i < 16; i++) {
+ pte[l2pte_index(va) + i] =
+ L2_L_PROTO | pa |
+ L2_L_PROT(PTE_KERNEL, prot) | f2l;
+ PTE_SYNC(&pte[l2pte_index(va) + i]);
+ }
+ va += L2_L_SIZE;
+ pa += L2_L_SIZE;
+ resid -= L2_L_SIZE;
+ continue;
+ }
+
+ /* Use a small page mapping. */
+#ifdef VERBOSE_INIT_ARM
+ printf("P");
+#endif
+ pte[l2pte_index(va)] = L2_S_PROTO | pa | f2s;
+ pmap_set_prot(&pte[l2pte_index(va)], prot, 0);
+ PTE_SYNC(&pte[l2pte_index(va)]);
+ va += PAGE_SIZE;
+ pa += PAGE_SIZE;
+ resid -= PAGE_SIZE;
+ }
+#ifdef VERBOSE_INIT_ARM
+ printf("\n");
+#endif
+ return (size);
+
+}
+
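As a rough worked example of the selection logic in pmap_map_chunk() above, assuming the usual ARM translation sizes (1 MB sections, 64 KB large pages expressed as 16 replicated 4 KB PTEs, 4 KB small pages): mapping 0x150000 bytes (1 MB + 320 KB) at a section-aligned VA/PA takes one section descriptor followed by five large-page runs of 16 PTEs each, and no small pages. Shifting the same request by 4 KB defeats both L1_S_MAPPABLE_P and L2_L_MAPPABLE_P at the start, so the first 15 pages fall back to small-page mappings until the address is 64 KB aligned again.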
+/********************** Static device map routines ***************************/
+
+static const struct pmap_devmap *pmap_devmap_table;
+
+/*
+ * Register the devmap table. This is provided in case early console
+ * initialization needs to register mappings created by bootstrap code
+ * before pmap_devmap_bootstrap() is called.
+ */
+void
+pmap_devmap_register(const struct pmap_devmap *table)
+{
+
+ pmap_devmap_table = table;
+}
+
+/*
+ * Map all of the static regions in the devmap table, and remember
+ * the devmap table so other parts of the kernel can look up entries
+ * later.
+ */
+void
+pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table)
+{
+ int i;
+
+ pmap_devmap_table = table;
+
+ for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) {
+#ifdef VERBOSE_INIT_ARM
+ printf("devmap: %08x -> %08x @ %08x\n",
+ pmap_devmap_table[i].pd_pa,
+ pmap_devmap_table[i].pd_pa +
+ pmap_devmap_table[i].pd_size - 1,
+ pmap_devmap_table[i].pd_va);
+#endif
+ pmap_map_chunk(l1pt, pmap_devmap_table[i].pd_va,
+ pmap_devmap_table[i].pd_pa,
+ pmap_devmap_table[i].pd_size,
+ pmap_devmap_table[i].pd_prot,
+ pmap_devmap_table[i].pd_cache);
+ }
+}
+
+const struct pmap_devmap *
+pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size)
+{
+ int i;
+
+ if (pmap_devmap_table == NULL)
+ return (NULL);
+
+ for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) {
+ if (pa >= pmap_devmap_table[i].pd_pa &&
+ pa + size <= pmap_devmap_table[i].pd_pa +
+ pmap_devmap_table[i].pd_size)
+ return (&pmap_devmap_table[i]);
+ }
+
+ return (NULL);
+}
+
+const struct pmap_devmap *
+pmap_devmap_find_va(vm_offset_t va, vm_size_t size)
+{
+ int i;
+
+ if (pmap_devmap_table == NULL)
+ return (NULL);
+
+ for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) {
+ if (va >= pmap_devmap_table[i].pd_va &&
+ va + size <= pmap_devmap_table[i].pd_va +
+ pmap_devmap_table[i].pd_size)
+ return (&pmap_devmap_table[i]);
+ }
+
+ return (NULL);
+}
+
+int
+pmap_dmap_iscurrent(pmap_t pmap)
+{
+ return(pmap_is_current(pmap));
+}
+
diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c
index 2502e86..ffde8ae 100644
--- a/sys/arm/arm/pmap.c
+++ b/sys/arm/arm/pmap.c
@@ -3243,15 +3243,13 @@ pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
*ptep = pte;
PTE_SYNC(ptep);
- if (pg != NULL) {
- if (!(pg->oflags & VPO_UNMANAGED)) {
- f = pmap_modify_pv(pg, pm, sva,
- PVF_WRITE, 0);
+ if (!(pg->oflags & VPO_UNMANAGED)) {
+ f = pmap_modify_pv(pg, pm, sva,
+ PVF_WRITE, 0);
+ if (f & PVF_WRITE)
vm_page_dirty(pg);
- } else
- f = 0;
} else
- f = PVF_REF | PVF_EXEC;
+ f = 0;
if (flush >= 0) {
flush++;
diff --git a/sys/arm/arm/swtch.S b/sys/arm/arm/swtch.S
index a293c5c..e93b948 100644
--- a/sys/arm/arm/swtch.S
+++ b/sys/arm/arm/swtch.S
@@ -85,64 +85,55 @@
#include <machine/armreg.h>
__FBSDID("$FreeBSD$");
+#define DOMAIN_CLIENT 0x01
-/*
- * New experimental definitions of IRQdisable and IRQenable
- * These keep FIQ's enabled since FIQ's are special.
- */
+#ifdef _ARM_ARCH_6
+#define GET_PCPU(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4;
+#else
+.Lcurpcpu:
+ .word _C_LABEL(__pcpu)
+
+#define GET_PCPU(tmp) \
+ ldr tmp, .Lcurpcpu
+#endif
-#define DOMAIN_CLIENT 0x01
-#define IRQdisable \
- mrs r14, cpsr ; \
- orr r14, r14, #(I32_bit) ; \
- msr cpsr_c, r14 ; \
-
-#define IRQenable \
- mrs r14, cpsr ; \
- bic r14, r14, #(I32_bit) ; \
- msr cpsr_c, r14 ; \
-
-/*
- * These are used for switching the translation table/DACR.
- * Since the vector page can be invalid for a short time, we must
- * disable both regular IRQs *and* FIQs.
- *
- * XXX: This is not necessary if the vector table is relocated.
- */
-#define IRQdisableALL \
- mrs r14, cpsr ; \
- orr r14, r14, #(I32_bit | F32_bit) ; \
- msr cpsr_c, r14
-
-#define IRQenableALL \
- mrs r14, cpsr ; \
- bic r14, r14, #(I32_bit | F32_bit) ; \
- msr cpsr_c, r14
-
-.Lcurpcb:
- .word _C_LABEL(__pcpu) + PC_CURPCB
.Lcpufuncs:
.word _C_LABEL(cpufuncs)
-.Lblock_userspace_access:
- .word _C_LABEL(block_userspace_access)
-.Lcpu_do_powersave:
- .word _C_LABEL(cpu_do_powersave)
.Lblocked_lock:
.word _C_LABEL(blocked_lock)
+
ENTRY(cpu_throw)
mov r5, r1
/*
+ * r0 = oldtd
* r5 = newtd
*/
- ldr r7, [r5, #(TD_PCB)] /* r7 = new thread's PCB */
+ GET_PCPU(r7)
+#ifdef ARM_VFP_SUPPORT
+ /*
+ * vfp_discard will clear pcpu->pc_vfpcthread and modify
+ * the control as needed.
+ */
+ ldr r4, [r7, #(PC_VFPCTHREAD)] /* this thread using vfp? */
+ cmp r0, r4
+ bne 3f
+ bl _C_LABEL(vfp_discard) /* yes, shut down vfp */
+3:
+#endif /* ARM_VFP_SUPPORT */
+
+ ldr r7, [r5, #(TD_PCB)] /* r7 = new thread's PCB */
+
/* Switch to lwp0 context */
ldr r9, .Lcpufuncs
+#if !defined(CPU_ARM11) && !defined(CPU_CORTEXA) && !defined(CPU_MV_PJ4B)
mov lr, pc
ldr pc, [r9, #CF_IDCACHE_WBINV_ALL]
+#endif
ldr r0, [r7, #(PCB_PL1VEC)]
ldr r1, [r7, #(PCB_DACR)]
/*
@@ -200,21 +191,24 @@ ENTRY(cpu_throw)
#endif
 /* We have a new curthread now so make a note of it */
- ldr r6, .Lcurthread
+ GET_CURTHREAD_PTR(r6)
str r5, [r6]
/* Set the new tp */
ldr r6, [r5, #(TD_MD + MD_TP)]
+#ifdef ARM_TP_ADDRESS
ldr r4, =ARM_TP_ADDRESS
str r6, [r4]
ldr r6, [r5, #(TD_MD + MD_RAS_START)]
str r6, [r4, #4] /* ARM_RAS_START */
ldr r6, [r5, #(TD_MD + MD_RAS_END)]
str r6, [r4, #8] /* ARM_RAS_END */
-
+#else
+ mcr p15, 0, r6, c13, c0, 3
+#endif
/* Hook in a new pcb */
- ldr r6, .Lcurpcb
- str r7, [r6]
+ GET_PCPU(r6)
+ str r7, [r6, #PC_CURPCB]
ldmfd sp!, {r4-r7, pc}
@@ -226,22 +220,15 @@ ENTRY(cpu_switch)
/* rem: r0 = old lwp */
/* rem: interrupts are disabled */
-#ifdef MULTIPROCESSOR
- /* XXX use curcpu() */
- ldr r2, .Lcpu_info_store
- str r2, [r6, #(L_CPU)]
-#endif
-
/* Process is now on a processor. */
-
 /* We have a new curthread now so make a note of it */
- ldr r7, .Lcurthread
+ GET_CURTHREAD_PTR(r7)
str r1, [r7]
/* Hook in a new pcb */
- ldr r7, .Lcurpcb
+ GET_PCPU(r7)
ldr r2, [r1, #TD_PCB]
- str r2, [r7]
+ str r2, [r7, #PC_CURPCB]
/* rem: r1 = new process */
/* rem: interrupts are enabled */
@@ -267,6 +254,7 @@ ENTRY(cpu_switch)
* NOTE: We can now use r8-r13 until it is time to restore
* them for the new process.
*/
+#ifdef ARM_TP_ADDRESS
/* Store the old tp */
ldr r3, =ARM_TP_ADDRESS
ldr r9, [r3]
@@ -283,12 +271,19 @@ ENTRY(cpu_switch)
str r9, [r3, #4]
ldr r9, [r1, #(TD_MD + MD_RAS_END)]
str r9, [r3, #8]
+#else
+ /* Store the old tp */
+ mrc p15, 0, r9, c13, c0, 3
+ str r9, [r0, #(TD_MD + MD_TP)]
+ /* Set the new tp */
+ ldr r9, [r1, #(TD_MD + MD_TP)]
+ mcr p15, 0, r9, c13, c0, 3
+#endif
+
/* Get the user structure for the new process in r9 */
ldr r9, [r1, #(TD_PCB)]
- /* r1 now free! */
-
mrs r3, cpsr
/*
* We can do that, since
@@ -300,15 +295,39 @@ ENTRY(cpu_switch)
str sp, [r2, #(PCB_UND_SP)]
msr cpsr_c, r3 /* Restore the old mode */
- /* rem: r8 = old PCB */
+ /* rem: r2 = old PCB */
/* rem: r9 = new PCB */
/* rem: interrupts are enabled */
- /* What else needs to be saved Only FPA stuff when that is supported */
+#ifdef ARM_VFP_SUPPORT
+ /*
+ * vfp_store will clear pcpu->pc_vfpcthread, save
+ * registers and state, and modify the control as needed.
+ * a future exception will bounce the backup settings in the fp unit.
+ * XXX vfp_store can't change r4
+ */
+ GET_PCPU(r7)
+ ldr r8, [r7, #(PC_VFPCTHREAD)]
+ cmp r4, r8 /* old thread used vfp? */
+ bne 1f /* no, don't save */
+ cmp r1, r4 /* same thread ? */
+ beq 1f /* yes, skip vfp store */
+#ifdef SMP
+ ldr r8, [r7, #(PC_CPU)] /* last used on this cpu? */
+ ldr r3, [r2, #(PCB_VFPCPU)]
+ cmp r8, r3 /* last cpu to use these registers? */
+ bne 1f /* no. these values are stale */
+#endif
+ add r0, r2, #(PCB_VFPSTATE)
+ bl _C_LABEL(vfp_store)
+1:
+#endif /* ARM_VFP_SUPPORT */
+
+ /* r1 now free! */
/* Third phase : restore saved context */
- /* rem: r8 = old PCB */
+ /* rem: r2 = old PCB */
/* rem: r9 = new PCB */
/* rem: interrupts are enabled */
@@ -333,6 +352,7 @@ ENTRY(cpu_switch)
cmpeq r0, r5 /* Same DACR? */
beq .Lcs_context_switched /* yes! */
+#if !defined(CPU_ARM11) && !defined(CPU_CORTEXA) && !defined(CPU_MV_PJ4B)
/*
 * Definitely need to flush the cache.
*/
@@ -340,6 +360,7 @@ ENTRY(cpu_switch)
ldr r1, .Lcpufuncs
mov lr, pc
ldr pc, [r1, #CF_IDCACHE_WBINV_ALL]
+#endif
.Lcs_cache_purge_skipped:
/* rem: r6 = lock */
/* rem: r9 = new PCB */
@@ -408,8 +429,7 @@ ENTRY(cpu_switch)
/* Release the old thread */
str r6, [r4, #TD_LOCK]
ldr r6, .Lblocked_lock
- ldr r3, .Lcurthread
- ldr r3, [r3]
+ GET_CURTHREAD_PTR(r3)
1:
ldr r4, [r3, #TD_LOCK]
@@ -484,6 +504,27 @@ ENTRY(savectx)
/* Store all the registers in the process's pcb */
add r2, r0, #(PCB_R8)
stmia r2, {r8-r13}
+#ifdef ARM_VFP_SUPPORT
+ /*
+ * vfp_store will clear pcpu->pc_vfpcthread, save
+ * registers and state, and modify the control as needed.
+ * a future exception will bounce the backup settings in the fp unit.
+ */
+ GET_PCPU(r7)
+ ldr r4, [r7, #(PC_VFPCTHREAD)] /* vfp thread */
+ ldr r2, [r7, #(PC_CURTHREAD)] /* current thread */
+ cmp r4, r2
+ bne 1f
+#ifdef SMP
+ ldr r2, [r7, #(PC_CPU)] /* last used on this cpu? */
+ ldr r3, [r0, #(PCB_VFPCPU)]
+ cmp r2, r3
+ bne 1f /* no. these values are stale */
+#endif
+ add r0, r0, #(PCB_VFPSTATE)
+ bl _C_LABEL(vfp_store)
+1:
+#endif /* ARM_VFP_SUPPORT */
ldmfd sp!, {r4-r7, pc}
ENTRY(fork_trampoline)
diff --git a/sys/arm/arm/sys_machdep.c b/sys/arm/arm/sys_machdep.c
index f673dc6..7676a96 100644
--- a/sys/arm/arm/sys_machdep.c
+++ b/sys/arm/arm/sys_machdep.c
@@ -88,7 +88,14 @@ static int
arm32_set_tp(struct thread *td, void *args)
{
- td->td_md.md_tp = (register_t)args;
+ if (td != curthread)
+ td->td_md.md_tp = (register_t)args;
+ else
+#ifndef ARM_TP_ADDRESS
+ set_tls(args);
+#else
+ *(register_t *)ARM_TP_ADDRESS = (register_t)args;
+#endif
return (0);
}
@@ -96,7 +103,14 @@ static int
arm32_get_tp(struct thread *td, void *args)
{
- td->td_retval[0] = td->td_md.md_tp;
+ if (td != curthread)
+ td->td_retval[0] = td->td_md.md_tp;
+ else
+#ifndef ARM_TP_ADDRESS
+ td->td_retval[0] = (register_t)get_tls();
+#else
+ td->td_retval[0] = *(register_t *)ARM_TP_ADDRESS;
+#endif
return (0);
}
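The set_tls()/get_tls() helpers used in the non-ARM_TP_ADDRESS branches above are defined elsewhere in this patch; on ARMv6+ they come down to accesses of the user read-only thread ID register (TPIDRURO, CP15 c13/c0/3), the same register the context-switch code writes with "mcr p15, 0, r6, c13, c0, 3". A minimal sketch of such accessors, for illustration only (exact names and placement are assumptions, not taken from this commit):

static __inline void *
get_tls(void)
{
	void *tls;

	/* Read TPIDRURO, the user read-only thread ID register. */
	__asm __volatile("mrc p15, 0, %0, c13, c0, 3" : "=r" (tls));
	return (tls);
}

static __inline void
set_tls(void *tls)
{

	/* Write TPIDRURO; userland reads it back to locate its TLS base. */
	__asm __volatile("mcr p15, 0, %0, c13, c0, 3" : : "r" (tls));
}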
diff --git a/sys/arm/arm/undefined.c b/sys/arm/arm/undefined.c
index fcb612d..9a2634a 100644
--- a/sys/arm/arm/undefined.c
+++ b/sys/arm/arm/undefined.c
@@ -237,10 +237,16 @@ undefinedinstruction(trapframe_t *frame)
* instruction trap.
*/
+ coprocessor = 0;
if ((fault_instruction & (1 << 27)) != 0)
coprocessor = (fault_instruction >> 8) & 0x0f;
- else
- coprocessor = 0;
+#ifdef ARM_VFP_SUPPORT
+ else { /* check for special instructions */
+ if (((fault_instruction & 0xfe000000) == 0xf2000000) ||
+ ((fault_instruction & 0xff100000) == 0xf4000000))
+ coprocessor = 10; /* vfp / simd */
+ }
+#endif /* ARM_VFP_SUPPORT */
if ((frame->tf_spsr & PSR_MODE) == PSR_USR32_MODE) {
/*
diff --git a/sys/arm/arm/vfp.c b/sys/arm/arm/vfp.c
new file mode 100644
index 0000000..bde566c
--- /dev/null
+++ b/sys/arm/arm/vfp.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2012 Mark Tinguely
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+
+#include <machine/fp.h>
+#include <machine/pcb.h>
+#include <machine/undefined.h>
+#include <machine/vfp.h>
+
+/* function prototypes */
+unsigned int get_coprocessorACR(void);
+int vfp_bounce(u_int, u_int, struct trapframe *, int);
+void vfp_discard(void);
+void vfp_enable(void);
+void vfp_init(void);
+void vfp_restore(struct vfp_state *);
+void vfp_store(struct vfp_state *);
+void set_coprocessorACR(u_int);
+
+boolean_t vfp_exists;
+static struct undefined_handler vfp10_uh, vfp11_uh;
+
+/* The VFMXR command using coprocessor commands */
+#define fmxr(reg, val) \
+ __asm __volatile("mcr p10, 7, %0, " #reg " , c0, 0" :: "r" (val));
+
+/* The VFMRX command using coprocessor commands */
+#define fmrx(reg) \
+({ u_int val = 0;\
+ __asm __volatile("mrc p10, 7, %0, " #reg " , c0, 0" : "=r" (val));\
+ val; \
+})
+
+u_int
+get_coprocessorACR(void)
+{
+ u_int val;
+ __asm __volatile("mrc p15, 0, %0, c1, c0, 2" : "=r" (val) : : "cc");
+ return val;
+}
+
+void
+set_coprocessorACR(u_int val)
+{
+ __asm __volatile("mcr p15, 0, %0, c1, c0, 2\n\t"
+ "isb\n\t"
+ : : "r" (val) : "cc");
+}
+
+
+ /* called for each cpu */
+void
+vfp_init(void)
+{
+ u_int fpsid, fpexc, tmp;
+ u_int coproc;
+
+ coproc = get_coprocessorACR();
+ coproc |= COPROC10 | COPROC11;
+ set_coprocessorACR(coproc);
+
+ fpsid = fmrx(cr0); /* read the vfp system id */
+ fpexc = fmrx(cr8); /* read the vfp exception reg */
+
+ if (!(fpsid & VFPSID_HARDSOFT_IMP)) {
+ vfp_exists = 1;
+ PCPU_SET(vfpsid, fpsid); /* save the VFPSID */
+ if ((fpsid & VFPSID_SUBVERSION2_MASK) == VFP_ARCH3) {
+ tmp = fmrx(cr7); /* extended registers */
+ PCPU_SET(vfpmvfr0, tmp);
+ tmp = fmrx(cr6); /* extended registers */
+ PCPU_SET(vfpmvfr1, tmp);
+ }
+ /* Initialize the coprocessor 10 and 11 handlers.
+ * These are called to restore the registers and enable
+ * the VFP hardware.
+ */
+ if (vfp10_uh.uh_handler == NULL) {
+ vfp10_uh.uh_handler = vfp_bounce;
+ vfp11_uh.uh_handler = vfp_bounce;
+ install_coproc_handler_static(10, &vfp10_uh);
+ install_coproc_handler_static(11, &vfp11_uh);
+ }
+ }
+}
+
+SYSINIT(vfp, SI_SUB_CPU, SI_ORDER_ANY, vfp_init, NULL);
+
+
+/* start VFP unit, restore the vfp registers from the PCB and retry
+ * the instruction
+ */
+int
+vfp_bounce(u_int addr, u_int insn, struct trapframe *frame, int code)
+{
+ u_int fpexc;
+ struct pcb *curpcb;
+ struct thread *vfptd;
+
+ if (!vfp_exists)
+ return 1; /* vfp does not exist */
+ fpexc = fmrx(cr8); /* read the vfp exception reg */
+ if (fpexc & VFPEXC_EN) {
+ vfptd = PCPU_GET(vfpcthread);
+ /* Did the kernel call the VFP, or did an exception occur that
+ * expects us to emulate the instruction? Newer hardware does not
+ * require emulation, so we don't emulate yet.
+ */
+#ifdef SMP
+ /* don't save if newer registers are on another processor */
+ if (vfptd /* && (vfptd == curthread) */ &&
+ (vfptd->td_pcb->pcb_vfpcpu == PCPU_GET(vfpcpu)))
+#else
+ /* someone did not save their registers, */
+ if (vfptd /* && (vfptd == curthread) */)
+#endif
+ vfp_store(&vfptd->td_pcb->pcb_vfpstate);
+
+ fpexc &= ~VFPEXC_EN;
+ fmxr(cr8, fpexc); /* turn vfp hardware off */
+ if (vfptd == curthread) {
+ /* kill the process - we do not handle emulation */
+ killproc(curthread->td_proc, "vfp emulation");
+ return 1;
+ }
+ /* should not happen. someone did not save their context */
+ printf("vfp_bounce: vfpcthread: %p curthread: %p\n",
+ vfptd, curthread);
+ }
+ fpexc |= VFPEXC_EN;
+ fmxr(cr8, fpexc); /* enable the vfp and repeat command */
+ curpcb = PCPU_GET(curpcb);
+ /* If we were the last process to use the VFP and the process did
+ * not use the VFP on another processor, then the registers in the
+ * VFP unit are still ours and are current. Eventually, we will
+ * make the restore smarter.
+ */
+ vfp_restore(&curpcb->pcb_vfpstate);
+#ifdef SMP
+ curpcb->pcb_cpu = PCPU_GET(cpu);
+#endif
+ PCPU_SET(vfpcthread, PCPU_GET(curthread));
+ return 0;
+}
+
+/* vfp_restore is called when a VFP instruction traps, to restore the
+ * registers and turn on the VFP hardware.
+ * Eventually we will use the fact that this process was the last one
+ * to use the VFP hardware and bypass the restore, just turn on the hardware.
+ */
+void
+vfp_restore(struct vfp_state *vfpsave)
+{
+ u_int vfpscr = 0;
+
+ if (vfpsave) {
+ __asm __volatile("ldc p10, c0, [%0], #128\n" /* d0-d31 */
+#ifndef VFPv2
+ "ldcl p11, c0, [%0], #128\n" /* d16-d31 */
+#else
+ "add %0, %0, #128\n" /* slip missing regs */
+#endif
+ "ldr %1, [%0]\n" /* set old vfpscr */
+ "mcr p10, 7, %1, cr1, c0, 0\n"
+ :: "r" (vfpsave), "r" (vfpscr));
+ PCPU_SET(vfpcthread, PCPU_GET(curthread));
+ }
+}
+
+/* vfp_store is called from the context-switch code to save the VFP hardware
+ * registers into the pcb before switching to another process.
+ * We already know that the new process is different from this old
+ * process and that this process last used the VFP registers.
+ * Below we check to see if the VFP has been enabled since the last
+ * register save.
+ * This routine will exit with the VFP turned off. The next VFP user
+ * will trap to restore its registers and turn on the VFP hardware.
+ */
+void
+vfp_store(struct vfp_state *vfpsave)
+{
+ u_int tmp, vfpscr = 0;
+
+ tmp = fmrx(cr8); /* Is the vfp enabled? */
+ if (vfpsave && tmp & VFPEXC_EN) {
+ __asm __volatile("stc p11, c0, [%1], #128\n" /* d0-d31 */
+#ifndef VFPv2
+ "stcl p11, c0, [%1], #128\n"
+#else
+ "add %1, %1, #128\n"
+#endif
+ "mrc p10, 7, %0, cr1, c0, 0\n"
+ "str %0, [%1]\n"
+ : "=&r" (vfpscr) : "r" (vfpsave));
+ }
+#ifndef SMP
+ /* eventually we will use this information for UP also */
+ PCPU_SET(vfpcthread, 0);
+#endif
+ tmp &= ~VFPEXC_EN; /* disable the vfp hardware */
+ fmxr(cr8 , tmp);
+}
+
+/* discard the registers at cpu_thread_free() when fpcurthread == td.
+ * Turn off the VFP hardware.
+ */
+void
+vfp_discard()
+{
+ u_int tmp = 0;
+
+ PCPU_SET(vfpcthread, 0); /* permanently forget about regs */
+ tmp = fmrx(cr8);
+ tmp &= ~VFPEXC_EN; /* turn off VFP hardware */
+ fmxr(cr8, tmp);
+}
+
+/* Enable the VFP hardware without restoring registers.
+ * Called when the registers are still in the VFP unit
+ */
+void
+vfp_enable()
+{
+ u_int tmp = 0;
+
+ tmp = fmrx(cr8);
+ tmp |= VFPEXC_EN;
+ fmxr(cr8 , tmp);
+}
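Read together with the cpu_switch()/savectx() assembly earlier in this patch, the lazy-save policy amounts to roughly the following C restatement (illustrative only; it reuses the pcb/pcpu field names that appear elsewhere in this commit and is not code from the patch):

static void
vfp_save_if_owner(struct thread *oldtd, struct thread *newtd)
{
	struct pcb *pcb = oldtd->td_pcb;

	if (PCPU_GET(vfpcthread) != oldtd)
		return;		/* Not the VFP owner; nothing to write back. */
	if (oldtd == newtd)
		return;		/* Same thread; the registers stay live. */
#ifdef SMP
	if (pcb->pcb_vfpcpu != PCPU_GET(cpu))
		return;		/* Registers were last used on another CPU. */
#endif
	/* Write the register file into the PCB and disable the unit. */
	vfp_store(&pcb->pcb_vfpstate);
}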
diff --git a/sys/arm/arm/vm_machdep.c b/sys/arm/arm/vm_machdep.c
index 114c683..d5504f6 100644
--- a/sys/arm/arm/vm_machdep.c
+++ b/sys/arm/arm/vm_machdep.c
@@ -146,7 +146,11 @@ cpu_fork(register struct thread *td1, register struct proc *p2,
/* Setup to release spin count in fork_exit(). */
td2->td_md.md_spinlock_count = 1;
td2->td_md.md_saved_cspr = 0;
+#ifdef ARM_TP_ADDRESS
td2->td_md.md_tp = *(register_t *)ARM_TP_ADDRESS;
+#else
+ td2->td_md.md_tp = (register_t) get_tls();
+#endif
}
void
@@ -369,11 +373,14 @@ int
cpu_set_user_tls(struct thread *td, void *tls_base)
{
- if (td != curthread)
- td->td_md.md_tp = (register_t)tls_base;
- else {
+ td->td_md.md_tp = (register_t)tls_base;
+ if (td == curthread) {
critical_enter();
+#ifdef ARM_TP_ADDRESS
*(register_t *)ARM_TP_ADDRESS = (register_t)tls_base;
+#else
+ set_tls((void *)tls_base);
+#endif
critical_exit();
}
return (0);
@@ -485,7 +492,11 @@ arm_remap_nocache(void *addr, vm_size_t size)
for (; tomap < (vm_offset_t)ret + size; tomap += PAGE_SIZE,
vaddr += PAGE_SIZE, physaddr += PAGE_SIZE, i++) {
cpu_idcache_wbinv_range(vaddr, PAGE_SIZE);
+#ifdef ARM_L2_PIPT
+ cpu_l2cache_wbinv_range(physaddr, PAGE_SIZE);
+#else
cpu_l2cache_wbinv_range(vaddr, PAGE_SIZE);
+#endif
pmap_kenter_nocache(tomap, physaddr);
cpu_tlb_flushID_SE(vaddr);
arm_nocache_allocated[i / BITS_PER_INT] |= 1 << (i %
diff --git a/sys/arm/at91/at91_machdep.c b/sys/arm/at91/at91_machdep.c
index 2dc95e3..c6be9e5 100644
--- a/sys/arm/at91/at91_machdep.c
+++ b/sys/arm/at91/at91_machdep.c
@@ -118,9 +118,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
@@ -471,8 +468,7 @@ initarm(struct arm_boot_params *abp)
lastaddr = parse_boot_param(abp);
set_cpufuncs();
- pcpu_init(pcpup, 0, sizeof(struct pcpu));
- PCPU_SET(curthread, &thread0);
+ pcpu0_init();
/* Do basic tuning, hz etc */
init_param1();
diff --git a/sys/arm/at91/std.at91 b/sys/arm/at91/std.at91
index 0c950b0..3f91330 100644
--- a/sys/arm/at91/std.at91
+++ b/sys/arm/at91/std.at91
@@ -4,6 +4,7 @@ files "../at91/files.at91"
cpu CPU_ARM9
makeoptions CONF_CFLAGS=-mcpu=arm9
options PHYSADDR=0x20000000
+options NO_EVENTTIMERS
# For now, just do the AT91RM9200
device at91rm9200
diff --git a/sys/arm/econa/econa_machdep.c b/sys/arm/econa/econa_machdep.c
index de18fd1..9779b6a 100644
--- a/sys/arm/econa/econa_machdep.c
+++ b/sys/arm/econa/econa_machdep.c
@@ -103,9 +103,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
@@ -191,8 +188,7 @@ initarm(struct arm_boot_params *abp)
boothowto = RB_VERBOSE;
lastaddr = parse_boot_param(abp);
set_cpufuncs();
- pcpu_init(pcpup, 0, sizeof(struct pcpu));
- PCPU_SET(curthread, &thread0);
+ pcpu0_init();
/* Do basic tuning, hz etc */
init_param1();
diff --git a/sys/arm/econa/std.econa b/sys/arm/econa/std.econa
index 660648d..46d8194 100644
--- a/sys/arm/econa/std.econa
+++ b/sys/arm/econa/std.econa
@@ -12,3 +12,5 @@ options KERNVIRTADDR=0xc1000000 # Used in ldscript.arm
options FLASHADDR=0xD0000000
options LOADERRAMADDR=0x00000000
options STARTUP_PAGETABLE_ADDR=0x00100000
+
+options NO_EVENTTIMERS
diff --git a/sys/arm/include/armreg.h b/sys/arm/include/armreg.h
index 0057298..a5638d0 100644
--- a/sys/arm/include/armreg.h
+++ b/sys/arm/include/armreg.h
@@ -93,6 +93,7 @@
#define CPU_ID_ARCH_V5TE 0x00050000
#define CPU_ID_ARCH_V5TEJ 0x00060000
#define CPU_ID_ARCH_V6 0x00070000
+#define CPU_ID_CPUID_SCHEME 0x000f0000
#define CPU_ID_VARIANT_MASK 0x00f00000
/* Next three nybbles are part number */
@@ -145,12 +146,36 @@
#define CPU_ID_ARM1026EJS 0x4106a260
#define CPU_ID_ARM1136JS 0x4107b360
#define CPU_ID_ARM1136JSR1 0x4117b360
+#define CPU_ID_CORTEXA8R1 0x411fc080
+#define CPU_ID_CORTEXA8R2 0x412fc080
+#define CPU_ID_CORTEXA8R3 0x413fc080
+#define CPU_ID_CORTEXA9R1 0x411fc090
+#define CPU_ID_CORTEXA9R2 0x412fc090
#define CPU_ID_SA110 0x4401a100
#define CPU_ID_SA1100 0x4401a110
#define CPU_ID_TI925T 0x54029250
#define CPU_ID_MV88FR131 0x56251310 /* Marvell Feroceon 88FR131 Core */
+#define CPU_ID_MV88FR331 0x56153310 /* Marvell Feroceon 88FR331 Core */
#define CPU_ID_MV88FR571_VD 0x56155710 /* Marvell Feroceon 88FR571-VD Core (ID from datasheet) */
-#define CPU_ID_MV88FR571_41 0x41159260 /* Marvell Feroceon 88FR571-VD Core (actual ID from CPU reg) */
+
+/*
+ * The LokiPlus core also has its ID set to 0x41159260, and this define causes
+ * execution of unsupported L2-cache instructions, so it needs to be disabled.
+ * 0x41159260 is a generic ARM926E-S ID.
+ */
+#ifdef SOC_MV_LOKIPLUS
+#define CPU_ID_MV88FR571_41 0x00000000
+#else
+#define CPU_ID_MV88FR571_41 0x41159260 /* Marvell Feroceon 88FR571-VD Core (actual ID from CPU reg) */
+#endif
+
+#define CPU_ID_MV88SV581X_V6 0x560F5810 /* Marvell Sheeva 88SV581x v6 Core */
+#define CPU_ID_MV88SV581X_V7 0x561F5810 /* Marvell Sheeva 88SV581x v7 Core */
+#define CPU_ID_MV88SV584X 0x561F5840 /* Marvell Sheeva 88SV584x v6 Core */
+/* Marvell's CPUIDs with ARM ID in implementor field */
+#define CPU_ID_ARM_88SV581X_V6 0x410fb760 /* Marvell Sheeva 88SV581x v6 Core */
+#define CPU_ID_ARM_88SV581X_V7 0x413FC080 /* Marvell Sheeva 88SV581x v7 Core */
+#define CPU_ID_ARM_88SV584X 0x410FB024 /* Marvell Sheeva 88SV584x v6 Core */
+
#define CPU_ID_FA526 0x66015260
#define CPU_ID_FA626TE 0x66056260
#define CPU_ID_SA1110 0x6901b110
@@ -191,6 +216,20 @@
#define ARM3_CTL_SHARED 0x00000002
#define ARM3_CTL_MONITOR 0x00000004
+/* CPUID registers */
+#define ARM_PFR0_ARM_ISA_MASK 0x0000000f
+
+#define ARM_PFR0_THUMB_MASK 0x000000f0
+#define ARM_PFR0_THUMB 0x10
+#define ARM_PFR0_THUMB2 0x30
+
+#define ARM_PFR0_JAZELLE_MASK 0x00000f00
+#define ARM_PFR0_THUMBEE_MASK 0x0000f000
+
+#define ARM_PFR1_ARMV4_MASK 0x0000000f
+#define ARM_PFR1_SEC_EXT_MASK 0x000000f0
+#define ARM_PFR1_MICROCTRL_MASK 0x00000f00
+
/*
* Post-ARM3 CP15 registers:
*
@@ -244,6 +283,7 @@
#define CPU_CONTROL_VECRELOC 0x00002000 /* V: Vector relocation */
#define CPU_CONTROL_ROUNDROBIN 0x00004000 /* RR: Predictable replacement */
#define CPU_CONTROL_V4COMPAT 0x00008000 /* L4: ARMv4 compat LDR R15 etc */
+#define CPU_CONTROL_V6_EXTPAGE 0x00800000 /* XP: ARMv6 extended page tables */
#define CPU_CONTROL_L2_ENABLE 0x04000000 /* L2 Cache enabled */
#define CPU_CONTROL_IDC_ENABLE CPU_CONTROL_DC_ENABLE
@@ -260,23 +300,24 @@
/* Xscale Core 3 only */
#define XSCALE_AUXCTL_LLR 0x00000400 /* Enable L2 for LLR Cache */
-/* Marvell Feroceon Extra Features Register (CP15 register 1, opcode2 0) */
-#define FC_DCACHE_REPL_LOCK 0x80000000 /* Replace DCache Lock */
-#define FC_DCACHE_STREAM_EN 0x20000000 /* DCache Streaming Switch */
-#define FC_WR_ALLOC_EN 0x10000000 /* Enable Write Allocate */
-#define FC_L2_PREF_DIS 0x01000000 /* L2 Cache Prefetch Disable */
-#define FC_L2_INV_EVICT_LINE 0x00800000 /* L2 Invalidates Uncorrectable Error Line Eviction */
-#define FC_L2CACHE_EN 0x00400000 /* L2 enable */
-#define FC_ICACHE_REPL_LOCK 0x00080000 /* Replace ICache Lock */
-#define FC_GLOB_HIST_REG_EN 0x00040000 /* Branch Global History Register Enable */
-#define FC_BRANCH_TARG_BUF_DIS 0x00020000 /* Branch Target Buffer Disable */
-#define FC_L1_PAR_ERR_EN 0x00010000 /* L1 Parity Error Enable */
+/* Marvell Extra Features Register (CP15 register 1, opcode2 0) */
+#define MV_DC_REPLACE_LOCK 0x80000000 /* Replace DCache Lock */
+#define MV_DC_STREAM_ENABLE 0x20000000 /* DCache Streaming Switch */
+#define MV_WA_ENABLE 0x10000000 /* Enable Write Allocate */
+#define MV_L2_PREFETCH_DISABLE 0x01000000 /* L2 Cache Prefetch Disable */
+#define MV_L2_INV_EVICT_ERR 0x00800000 /* L2 Invalidates Uncorrectable Error Line Eviction */
+#define MV_L2_ENABLE 0x00400000 /* L2 Cache enable */
+#define MV_IC_REPLACE_LOCK 0x00080000 /* Replace ICache Lock */
+#define MV_BGH_ENABLE 0x00040000 /* Branch Global History Register Enable */
+#define MV_BTB_DISABLE 0x00020000 /* Branch Target Buffer Disable */
+#define MV_L1_PARERR_ENABLE 0x00010000 /* L1 Parity Error Enable */
/* Cache type register definitions */
#define CPU_CT_ISIZE(x) ((x) & 0xfff) /* I$ info */
#define CPU_CT_DSIZE(x) (((x) >> 12) & 0xfff) /* D$ info */
#define CPU_CT_S (1U << 24) /* split cache */
#define CPU_CT_CTYPE(x) (((x) >> 25) & 0xf) /* cache type */
+#define CPU_CT_FORMAT(x) ((x) >> 29)
#define CPU_CT_CTYPE_WT 0 /* write-through */
#define CPU_CT_CTYPE_WB1 1 /* write-back, clean w/ read */
@@ -289,6 +330,27 @@
#define CPU_CT_xSIZE_ASSOC(x) (((x) >> 3) & 0x7) /* associativity */
#define CPU_CT_xSIZE_SIZE(x) (((x) >> 6) & 0x7) /* size */
+#define CPU_CT_ARMV7 0x4
+/* ARM v7 Cache type definitions */
+#define CPUV7_CT_CTYPE_WT (1 << 31)
+#define CPUV7_CT_CTYPE_WB (1 << 30)
+#define CPUV7_CT_CTYPE_RA (1 << 29)
+#define CPUV7_CT_CTYPE_WA (1 << 28)
+
+#define CPUV7_CT_xSIZE_LEN(x) ((x) & 0x7) /* line size */
+#define CPUV7_CT_xSIZE_ASSOC(x) (((x) >> 3) & 0x3ff) /* associativity */
+#define CPUV7_CT_xSIZE_SET(x) (((x) >> 13) & 0x7fff) /* num sets */
+
+#define CPU_CLIDR_CTYPE(reg,x) (((reg) >> ((x) * 3)) & 0x7)
+#define CPU_CLIDR_LOUIS(reg) (((reg) >> 21) & 0x7)
+#define CPU_CLIDR_LOC(reg) (((reg) >> 24) & 0x7)
+#define CPU_CLIDR_LOUU(reg) (((reg) >> 27) & 0x7)
+
+#define CACHE_ICACHE 1
+#define CACHE_DCACHE 2
+#define CACHE_SEP_CACHE 3
+#define CACHE_UNI_CACHE 4
+
/* Fault status register definitions */
#define FAULT_TYPE_MASK 0x0f
diff --git a/sys/arm/include/asm.h b/sys/arm/include/asm.h
index 00f8265..3ae25b8 100644
--- a/sys/arm/include/asm.h
+++ b/sys/arm/include/asm.h
@@ -130,45 +130,52 @@
.stabs __STRING(_/**/sym),1,0,0,0
#endif /* __STDC__ */
+/* Exactly one of the __ARM_ARCH_*__ macros will be defined by the compiler. */
+/* The _ARM_ARCH_* macros are deprecated and will be removed soon. */
+/* This should be moved into another header so it can be used in
+ * both asm and C code. machine/asm.h cannot be included in C code. */
+#if defined (__ARM_ARCH_7__) || defined (__ARM_ARCH_7A__)
+#define _ARM_ARCH_7
+#define _HAVE_ARMv7_INSTRUCTIONS 1
+#endif
-#if defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__)
+#if defined (_HAVE_ARMv7_INSTRUCTIONS) || defined (__ARM_ARCH_6__) || \
+ defined (__ARM_ARCH_6J__) || defined (__ARM_ARCH_6K__) || \
+ defined (__ARM_ARCH_6Z__) || defined (__ARM_ARCH_6ZK__)
#define _ARM_ARCH_6
+#define _HAVE_ARMv6_INSTRUCTIONS 1
#endif
-#if defined (_ARM_ARCH_6) || defined (__ARM_ARCH_5__) || \
- defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5TE__) || \
+#if defined (_HAVE_ARMv6_INSTRUCTIONS) || defined (__ARM_ARCH_5TE__) || \
defined (__ARM_ARCH_5TEJ__) || defined (__ARM_ARCH_5E__)
-#define _ARM_ARCH_5
+#define _ARM_ARCH_5E
+#define _HAVE_ARMv5E_INSTRUCTIONS 1
#endif
-#if defined (_ARM_ARCH_6) || defined(__ARM_ARCH_5TE__) || \
- defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_5E__)
-#define _ARM_ARCH_5E
+#if defined (_HAVE_ARMv5E_INSTRUCTIONS) || defined (__ARM_ARCH_5__) || \
+ defined (__ARM_ARCH_5T__)
+#define _ARM_ARCH_5
+#define _HAVE_ARMv5_INSTRUCTIONS 1
#endif
-#if defined (_ARM_ARCH_5) || defined (__ARM_ARCH_4T__)
+#if defined (_HAVE_ARMv5_INSTRUCTIONS) || defined (__ARM_ARCH_4T__)
#define _ARM_ARCH_4T
+#define _HAVE_ARMv4T_INSTRUCTIONS 1
#endif
+/* FreeBSD requires ARMv4, so this is always set. */
+#define _HAVE_ARMv4_INSTRUCTIONS 1
-#if defined (_ARM_ARCH_4T)
+#if defined (_HAVE_ARMv4T_INSTRUCTIONS)
# define RET bx lr
# define RETeq bxeq lr
# define RETne bxne lr
-# ifdef __STDC__
-# define RETc(c) bx##c lr
-# else
-# define RETc(c) bx/**/c lr
-# endif
+# define RETc(c) bx##c lr
#else
# define RET mov pc, lr
# define RETeq moveq pc, lr
# define RETne movne pc, lr
-# ifdef __STDC__
-# define RETc(c) mov##c pc, lr
-# else
-# define RETc(c) mov/**/c pc, lr
-# endif
+# define RETc(c) mov##c pc, lr
#endif
#endif /* !_MACHINE_ASM_H_ */
diff --git a/sys/arm/include/asmacros.h b/sys/arm/include/asmacros.h
index da612df..f83c14f 100644
--- a/sys/arm/include/asmacros.h
+++ b/sys/arm/include/asmacros.h
@@ -40,9 +40,12 @@
#ifndef _MACHINE_ASMACROS_H_
#define _MACHINE_ASMACROS_H_
+#include <machine/asm.h>
+
#ifdef _KERNEL
#ifdef LOCORE
+#include "opt_global.h"
/*
* ASM macros for pushing and pulling trapframes from the stack
@@ -58,7 +61,7 @@
* NOTE: r13 and r14 are stored separately as a work around for the
* SA110 rev 2 STM^ bug
*/
-
+#ifdef ARM_TP_ADDRESS
#define PUSHFRAME \
str lr, [sp, #-4]!; /* Push the return address */ \
sub sp, sp, #(4*17); /* Adjust the stack pointer */ \
@@ -73,12 +76,24 @@
str r1, [r0]; \
mov r1, #0xffffffff; \
str r1, [r0, #4];
+#else
+#define PUSHFRAME \
+ str lr, [sp, #-4]!; /* Push the return address */ \
+ sub sp, sp, #(4*17); /* Adjust the stack pointer */ \
+ stmia sp, {r0-r12}; /* Push the user mode registers */ \
+ add r0, sp, #(4*13); /* Adjust the stack pointer */ \
+ stmia r0, {r13-r14}^; /* Push the user mode registers */ \
+ mov r0, r0; /* NOP for previous instruction */ \
+ mrs r0, spsr_all; /* Put the SPSR on the stack */ \
+ str r0, [sp, #-4]!;
+#endif
/*
* PULLFRAME - macro to pull a trap frame from the stack in the current mode
* Since the current mode is used, the SVC lr field is ignored.
*/
+#ifdef ARM_TP_ADDRESS
#define PULLFRAME \
ldr r0, [sp], #0x0004; /* Get the SPSR from stack */ \
msr spsr_all, r0; \
@@ -86,6 +101,16 @@
mov r0, r0; /* NOP for previous instruction */ \
add sp, sp, #(4*17); /* Adjust the stack pointer */ \
ldr lr, [sp], #0x0004; /* Pull the return address */
+#else
+#define PULLFRAME \
+ ldr r0, [sp], #0x0004; /* Get the SPSR from stack */ \
+ msr spsr_all, r0; \
+ clrex; \
+ ldmia sp, {r0-r14}^; /* Restore registers (usr mode) */ \
+ mov r0, r0; /* NOP for previous instruction */ \
+ add sp, sp, #(4*17); /* Adjust the stack pointer */ \
+ ldr lr, [sp], #0x0004; /* Pull the return address */
+#endif
/*
* PUSHFRAMEINSVC - macro to push a trap frame on the stack in SVC32 mode
@@ -97,7 +122,7 @@
* NOTE: r13 and r14 are stored separately as a work around for the
* SA110 rev 2 STM^ bug
*/
-
+#ifdef ARM_TP_ADDRESS
#define PUSHFRAMEINSVC \
stmdb sp, {r0-r3}; /* Save 4 registers */ \
mov r0, lr; /* Save xxx32 r14 */ \
@@ -132,6 +157,30 @@
strhi r3, [r0, #16]; /* the RAS_START location. */ \
mrs r0, spsr_all; \
str r0, [sp, #-4]!
+#else
+#define PUSHFRAMEINSVC \
+ stmdb sp, {r0-r3}; /* Save 4 registers */ \
+ mov r0, lr; /* Save xxx32 r14 */ \
+ mov r1, sp; /* Save xxx32 sp */ \
+ mrs r3, spsr; /* Save xxx32 spsr */ \
+ mrs r2, cpsr; /* Get the CPSR */ \
+ bic r2, r2, #(PSR_MODE); /* Fix for SVC mode */ \
+ orr r2, r2, #(PSR_SVC32_MODE); \
+ msr cpsr_c, r2; /* Punch into SVC mode */ \
+ mov r2, sp; /* Save SVC sp */ \
+ str r0, [sp, #-4]!; /* Push return address */ \
+ str lr, [sp, #-4]!; /* Push SVC lr */ \
+ str r2, [sp, #-4]!; /* Push SVC sp */ \
+ msr spsr_all, r3; /* Restore correct spsr */ \
+ ldmdb r1, {r0-r3}; /* Restore 4 regs from xxx mode */ \
+ sub sp, sp, #(4*15); /* Adjust the stack pointer */ \
+ stmia sp, {r0-r12}; /* Push the user mode registers */ \
+ add r0, sp, #(4*13); /* Adjust the stack pointer */ \
+ stmia r0, {r13-r14}^; /* Push the user mode registers */ \
+ mov r0, r0; /* NOP for previous instruction */ \
+ mrs r0, spsr_all; /* Put the SPSR on the stack */ \
+ str r0, [sp, #-4]!
+#endif
/*
* PULLFRAMEFROMSVCANDEXIT - macro to pull a trap frame from the stack
@@ -140,6 +189,7 @@
* exit.
*/
+#ifdef ARM_TP_ADDRESS
#define PULLFRAMEFROMSVCANDEXIT \
ldr r0, [sp], #0x0004; /* Get the SPSR from stack */ \
msr spsr_all, r0; /* restore SPSR */ \
@@ -147,6 +197,16 @@
mov r0, r0; /* NOP for previous instruction */ \
add sp, sp, #(4*15); /* Adjust the stack pointer */ \
ldmia sp, {sp, lr, pc}^ /* Restore lr and exit */
+#else
+#define PULLFRAMEFROMSVCANDEXIT \
+ ldr r0, [sp], #0x0004; /* Get the SPSR from stack */ \
+ msr spsr_all, r0; /* restore SPSR */ \
+ clrex; \
+ ldmia sp, {r0-r14}^; /* Restore registers (usr mode) */ \
+ mov r0, r0; /* NOP for previous instruction */ \
+ add sp, sp, #(4*15); /* Adjust the stack pointer */ \
+ ldmia sp, {sp, lr, pc}^ /* Restore lr and exit */
+#endif
#define DATA(name) \
.data ; \
@@ -155,9 +215,20 @@
.type name, %object ; \
name:
-#define EMPTY
+#ifdef _ARM_ARCH_6
+#define AST_LOCALS
+#define GET_CURTHREAD_PTR(tmp) \
+ mrc p15, 0, tmp, c13, c0, 4; \
+ add tmp, tmp, #(PC_CURTHREAD)
+#else
+#define AST_LOCALS ;\
+.Lcurthread: ;\
+ .word _C_LABEL(__pcpu) + PC_CURTHREAD
+
+#define GET_CURTHREAD_PTR(tmp) \
+ ldr tmp, .Lcurthread
+#endif
-
#define DO_AST \
ldr r0, [sp] /* Get the SPSR from stack */ ;\
mrs r4, cpsr /* save CPSR */ ;\
@@ -167,7 +238,7 @@ name:
teq r0, #(PSR_USR32_MODE) ;\
bne 2f /* Nope, get out now */ ;\
bic r4, r4, #(I32_bit|F32_bit) ;\
-1: ldr r5, .Lcurthread ;\
+1: GET_CURTHREAD_PTR(r5) ;\
ldr r5, [r5] ;\
ldr r1, [r5, #(TD_FLAGS)] ;\
and r1, r1, #(TDF_ASTPENDING|TDF_NEEDRESCHED) ;\
@@ -181,11 +252,6 @@ name:
b 1b ;\
2:
-
-#define AST_LOCALS ;\
-.Lcurthread: ;\
- .word _C_LABEL(__pcpu) + PC_CURTHREAD
-
#endif /* LOCORE */
#endif /* _KERNEL */
diff --git a/sys/arm/include/atomic.h b/sys/arm/include/atomic.h
index a64fc4a..72d91e2 100644
--- a/sys/arm/include/atomic.h
+++ b/sys/arm/include/atomic.h
@@ -39,17 +39,17 @@
#ifndef _MACHINE_ATOMIC_H_
#define _MACHINE_ATOMIC_H_
-#ifndef _LOCORE
-
#include <sys/types.h>
#ifndef _KERNEL
#include <machine/sysarch.h>
+#else
+#include <machine/cpuconf.h>
#endif
-#define mb()
-#define wmb()
-#define rmb()
+#define mb()
+#define wmb()
+#define rmb()
#ifndef I32_bit
#define I32_bit (1 << 7) /* IRQ disable */
@@ -58,6 +58,356 @@
#define F32_bit (1 << 6) /* FIQ disable */
#endif
+/*
+ * It would be nice to use _HAVE_ARMv6_INSTRUCTIONS from machine/asm.h
+ * here, but that header can't be included here because this is C
+ * code. I would like to move the _HAVE_ARMv6_INSTRUCTIONS definition
+ * out of asm.h so it can be used in both asm and C code. - kientzle@
+ */
+#if defined (__ARM_ARCH_7__) || \
+ defined (__ARM_ARCH_7A__) || \
+ defined (__ARM_ARCH_6__) || \
+ defined (__ARM_ARCH_6J__) || \
+ defined (__ARM_ARCH_6K__) || \
+ defined (__ARM_ARCH_6Z__) || \
+ defined (__ARM_ARCH_6ZK__)
+static __inline void
+__do_dmb(void)
+{
+
+#if defined (__ARM_ARCH_7__) || defined (__ARM_ARCH_7A__)
+ __asm __volatile("dmb" : : : "memory");
+#else
+ __asm __volatile("mcr p15, 0, r0, c7, c10, 5" : : : "memory");
+#endif
+}
+
+#define ATOMIC_ACQ_REL_LONG(NAME) \
+static __inline void \
+atomic_##NAME##_acq_long(__volatile u_long *p, u_long v) \
+{ \
+ atomic_##NAME##_long(p, v); \
+ __do_dmb(); \
+} \
+ \
+static __inline void \
+atomic_##NAME##_rel_long(__volatile u_long *p, u_long v) \
+{ \
+ __do_dmb(); \
+ atomic_##NAME##_long(p, v); \
+}
+
+#define ATOMIC_ACQ_REL(NAME, WIDTH) \
+static __inline void \
+atomic_##NAME##_acq_##WIDTH(__volatile uint##WIDTH##_t *p, uint##WIDTH##_t v)\
+{ \
+ atomic_##NAME##_##WIDTH(p, v); \
+ __do_dmb(); \
+} \
+ \
+static __inline void \
+atomic_##NAME##_rel_##WIDTH(__volatile uint##WIDTH##_t *p, uint##WIDTH##_t v)\
+{ \
+ __do_dmb(); \
+ atomic_##NAME##_##WIDTH(p, v); \
+}
+
+static __inline void
+atomic_set_32(volatile uint32_t *address, uint32_t setmask)
+{
+ uint32_t tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "orr %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ , "+r" (address), "+r" (setmask) : : "memory");
+
+}
+
+static __inline void
+atomic_set_long(volatile u_long *address, u_long setmask)
+{
+ u_long tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "orr %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ , "+r" (address), "+r" (setmask) : : "memory");
+
+}
+
+static __inline void
+atomic_clear_32(volatile uint32_t *address, uint32_t setmask)
+{
+ uint32_t tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "bic %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (address), "+r" (setmask) : : "memory");
+}
+
+static __inline void
+atomic_clear_long(volatile u_long *address, u_long setmask)
+{
+ u_long tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "bic %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (address), "+r" (setmask) : : "memory");
+}
+
+static __inline u_int32_t
+atomic_cmpset_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_int32_t newval)
+{
+ uint32_t ret;
+
+ __asm __volatile("1: ldrex %0, [%1]\n"
+ "cmp %0, %2\n"
+ "movne %0, #0\n"
+ "bne 2f\n"
+ "strex %0, %3, [%1]\n"
+ "cmp %0, #0\n"
+ "bne 1b\n"
+ "moveq %0, #1\n"
+ "2:"
+ : "=&r" (ret)
+ ,"+r" (p), "+r" (cmpval), "+r" (newval) : : "memory");
+ return (ret);
+}
+
+static __inline u_long
+atomic_cmpset_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
+{
+ u_long ret;
+
+ __asm __volatile("1: ldrex %0, [%1]\n"
+ "cmp %0, %2\n"
+ "movne %0, #0\n"
+ "bne 2f\n"
+ "strex %0, %3, [%1]\n"
+ "cmp %0, #0\n"
+ "bne 1b\n"
+ "moveq %0, #1\n"
+ "2:"
+ : "=&r" (ret)
+ ,"+r" (p), "+r" (cmpval), "+r" (newval) : : "memory");
+ return (ret);
+}
+
+static __inline u_int32_t
+atomic_cmpset_acq_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_int32_t newval)
+{
+ u_int32_t ret = atomic_cmpset_32(p, cmpval, newval);
+
+ __do_dmb();
+ return (ret);
+}
+
+static __inline u_long
+atomic_cmpset_acq_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
+{
+ u_long ret = atomic_cmpset_long(p, cmpval, newval);
+
+ __do_dmb();
+ return (ret);
+}
+
+static __inline u_int32_t
+atomic_cmpset_rel_32(volatile u_int32_t *p, volatile u_int32_t cmpval, volatile u_int32_t newval)
+{
+
+ __do_dmb();
+ return (atomic_cmpset_32(p, cmpval, newval));
+}
+
+static __inline u_long
+atomic_cmpset_rel_long(volatile u_long *p, volatile u_long cmpval, volatile u_long newval)
+{
+
+ __do_dmb();
+ return (atomic_cmpset_long(p, cmpval, newval));
+}
+
+
+static __inline void
+atomic_add_32(volatile u_int32_t *p, u_int32_t val)
+{
+ uint32_t tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "add %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+}
+
+static __inline void
+atomic_add_long(volatile u_long *p, u_long val)
+{
+ u_long tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "add %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+}
+
+static __inline void
+atomic_subtract_32(volatile u_int32_t *p, u_int32_t val)
+{
+ uint32_t tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "sub %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+}
+
+static __inline void
+atomic_subtract_long(volatile u_long *p, u_long val)
+{
+ u_long tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%2]\n"
+ "sub %0, %0, %3\n"
+ "strex %1, %0, [%2]\n"
+ "cmp %1, #0\n"
+ "bne 1b\n"
+ : "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+}
+
+ATOMIC_ACQ_REL(clear, 32)
+ATOMIC_ACQ_REL(add, 32)
+ATOMIC_ACQ_REL(subtract, 32)
+ATOMIC_ACQ_REL(set, 32)
+ATOMIC_ACQ_REL_LONG(clear)
+ATOMIC_ACQ_REL_LONG(add)
+ATOMIC_ACQ_REL_LONG(subtract)
+ATOMIC_ACQ_REL_LONG(set)
+
+#undef ATOMIC_ACQ_REL
+#undef ATOMIC_ACQ_REL_LONG
+
+static __inline uint32_t
+atomic_fetchadd_32(volatile uint32_t *p, uint32_t val)
+{
+ uint32_t tmp = 0, tmp2 = 0, ret = 0;
+
+ __asm __volatile("1: ldrex %0, [%3]\n"
+ "add %1, %0, %4\n"
+ "strex %2, %1, [%3]\n"
+ "cmp %2, #0\n"
+ "bne 1b\n"
+ : "+r" (ret), "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+ return (ret);
+}
+
+static __inline uint32_t
+atomic_readandclear_32(volatile u_int32_t *p)
+{
+ uint32_t ret, tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%3]\n"
+ "mov %1, #0\n"
+ "strex %2, %1, [%3]\n"
+ "cmp %2, #0\n"
+ "bne 1b\n"
+ : "=r" (ret), "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p) : : "memory");
+ return (ret);
+}
+
+static __inline uint32_t
+atomic_load_acq_32(volatile uint32_t *p)
+{
+ uint32_t v;
+
+ v = *p;
+ __do_dmb();
+ return (v);
+}
+
+static __inline void
+atomic_store_rel_32(volatile uint32_t *p, uint32_t v)
+{
+
+ __do_dmb();
+ *p = v;
+}
+
+static __inline u_long
+atomic_fetchadd_long(volatile u_long *p, u_long val)
+{
+ u_long tmp = 0, tmp2 = 0, ret = 0;
+
+ __asm __volatile("1: ldrex %0, [%3]\n"
+ "add %1, %0, %4\n"
+ "strex %2, %1, [%3]\n"
+ "cmp %2, #0\n"
+ "bne 1b\n"
+ : "+r" (ret), "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p), "+r" (val) : : "memory");
+ return (ret);
+}
+
+static __inline u_long
+atomic_readandclear_long(volatile u_long *p)
+{
+ u_long ret, tmp = 0, tmp2 = 0;
+
+ __asm __volatile("1: ldrex %0, [%3]\n"
+ "mov %1, #0\n"
+ "strex %2, %1, [%3]\n"
+ "cmp %2, #0\n"
+ "bne 1b\n"
+ : "=r" (ret), "=&r" (tmp), "+r" (tmp2)
+ ,"+r" (p) : : "memory");
+ return (ret);
+}
+
+static __inline u_long
+atomic_load_acq_long(volatile u_long *p)
+{
+ u_long v;
+
+ v = *p;
+ __do_dmb();
+ return (v);
+}
+
+static __inline void
+atomic_store_rel_long(volatile u_long *p, u_long v)
+{
+
+ __do_dmb();
+ *p = v;
+}
+#else /* < armv6 */
+
#define __with_interrupts_disabled(expr) \
do { \
u_int cpsr_save, tmp; \
@@ -287,6 +637,83 @@ atomic_fetchadd_32(volatile uint32_t *p, uint32_t v)
#endif /* _KERNEL */
+
+static __inline uint32_t
+atomic_readandclear_32(volatile u_int32_t *p)
+{
+
+ return (__swp(0, p));
+}
+
+#define atomic_cmpset_rel_32 atomic_cmpset_32
+#define atomic_cmpset_acq_32 atomic_cmpset_32
+#define atomic_set_rel_32 atomic_set_32
+#define atomic_set_acq_32 atomic_set_32
+#define atomic_clear_rel_32 atomic_clear_32
+#define atomic_clear_acq_32 atomic_clear_32
+#define atomic_add_rel_32 atomic_add_32
+#define atomic_add_acq_32 atomic_add_32
+#define atomic_subtract_rel_32 atomic_subtract_32
+#define atomic_subtract_acq_32 atomic_subtract_32
+#define atomic_store_rel_32 atomic_store_32
+#define atomic_store_rel_long atomic_store_long
+#define atomic_load_acq_32 atomic_load_32
+#define atomic_load_acq_long atomic_load_long
+#undef __with_interrupts_disabled
+
+static __inline void
+atomic_add_long(volatile u_long *p, u_long v)
+{
+
+ atomic_add_32((volatile uint32_t *)p, v);
+}
+
+static __inline void
+atomic_clear_long(volatile u_long *p, u_long v)
+{
+
+ atomic_clear_32((volatile uint32_t *)p, v);
+}
+
+static __inline int
+atomic_cmpset_long(volatile u_long *dst, u_long old, u_long newe)
+{
+
+ return (atomic_cmpset_32((volatile uint32_t *)dst, old, newe));
+}
+
+static __inline u_long
+atomic_fetchadd_long(volatile u_long *p, u_long v)
+{
+
+ return (atomic_fetchadd_32((volatile uint32_t *)p, v));
+}
+
+static __inline void
+atomic_readandclear_long(volatile u_long *p)
+{
+
+ atomic_readandclear_32((volatile uint32_t *)p);
+}
+
+static __inline void
+atomic_set_long(volatile u_long *p, u_long v)
+{
+
+ atomic_set_32((volatile uint32_t *)p, v);
+}
+
+static __inline void
+atomic_subtract_long(volatile u_long *p, u_long v)
+{
+
+ atomic_subtract_32((volatile uint32_t *)p, v);
+}
+
+
+
+#endif /* Arch >= v6 */
+
static __inline int
atomic_load_32(volatile uint32_t *v)
{
@@ -300,88 +727,57 @@ atomic_store_32(volatile uint32_t *dst, uint32_t src)
*dst = src;
}
-static __inline uint32_t
-atomic_readandclear_32(volatile u_int32_t *p)
+static __inline int
+atomic_load_long(volatile u_long *v)
{
- return (__swp(0, p));
+ return (*v);
}
-#undef __with_interrupts_disabled
-
-#endif /* _LOCORE */
+static __inline void
+atomic_store_long(volatile u_long *dst, u_long src)
+{
+ *dst = src;
+}
-#define atomic_add_long(p, v) \
- atomic_add_32((volatile u_int *)(p), (u_int)(v))
#define atomic_add_acq_long atomic_add_long
#define atomic_add_rel_long atomic_add_long
-#define atomic_subtract_long(p, v) \
- atomic_subtract_32((volatile u_int *)(p), (u_int)(v))
#define atomic_subtract_acq_long atomic_subtract_long
#define atomic_subtract_rel_long atomic_subtract_long
-#define atomic_clear_long(p, v) \
- atomic_clear_32((volatile u_int *)(p), (u_int)(v))
#define atomic_clear_acq_long atomic_clear_long
#define atomic_clear_rel_long atomic_clear_long
-#define atomic_set_long(p, v) \
- atomic_set_32((volatile u_int *)(p), (u_int)(v))
#define atomic_set_acq_long atomic_set_long
#define atomic_set_rel_long atomic_set_long
-#define atomic_cmpset_long(dst, old, new) \
- atomic_cmpset_32((volatile u_int *)(dst), (u_int)(old), (u_int)(new))
#define atomic_cmpset_acq_long atomic_cmpset_long
#define atomic_cmpset_rel_long atomic_cmpset_long
-#define atomic_fetchadd_long(p, v) \
- atomic_fetchadd_32((volatile u_int *)(p), (u_int)(v))
-#define atomic_readandclear_long(p) \
- atomic_readandclear_long((volatile u_int *)(p))
-#define atomic_load_long(p) \
- atomic_load_32((volatile u_int *)(p))
#define atomic_load_acq_long atomic_load_long
-#define atomic_store_rel_long(p, v) \
- atomic_store_rel_32((volatile u_int *)(p), (u_int)(v))
-
#define atomic_clear_ptr atomic_clear_32
#define atomic_set_ptr atomic_set_32
-#define atomic_cmpset_ptr(dst, old, new) \
- atomic_cmpset_32((volatile u_int *)(dst), (u_int)(old), (u_int)(new))
-#define atomic_cmpset_rel_ptr atomic_cmpset_ptr
-#define atomic_cmpset_acq_ptr atomic_cmpset_ptr
+#define atomic_cmpset_ptr atomic_cmpset_32
+#define atomic_cmpset_rel_ptr atomic_cmpset_rel_32
+#define atomic_cmpset_acq_ptr atomic_cmpset_acq_32
#define atomic_store_ptr atomic_store_32
#define atomic_store_rel_ptr atomic_store_ptr
#define atomic_add_int atomic_add_32
-#define atomic_add_acq_int atomic_add_int
-#define atomic_add_rel_int atomic_add_int
+#define atomic_add_acq_int atomic_add_acq_32
+#define atomic_add_rel_int atomic_add_rel_32
#define atomic_subtract_int atomic_subtract_32
-#define atomic_subtract_acq_int atomic_subtract_int
-#define atomic_subtract_rel_int atomic_subtract_int
+#define atomic_subtract_acq_int atomic_subtract_acq_32
+#define atomic_subtract_rel_int atomic_subtract_rel_32
#define atomic_clear_int atomic_clear_32
-#define atomic_clear_acq_int atomic_clear_int
-#define atomic_clear_rel_int atomic_clear_int
+#define atomic_clear_acq_int atomic_clear_acq_32
+#define atomic_clear_rel_int atomic_clear_rel_32
#define atomic_set_int atomic_set_32
-#define atomic_set_acq_int atomic_set_int
-#define atomic_set_rel_int atomic_set_int
+#define atomic_set_acq_int atomic_set_acq_32
+#define atomic_set_rel_int atomic_set_rel_32
#define atomic_cmpset_int atomic_cmpset_32
-#define atomic_cmpset_acq_int atomic_cmpset_int
-#define atomic_cmpset_rel_int atomic_cmpset_int
+#define atomic_cmpset_acq_int atomic_cmpset_acq_32
+#define atomic_cmpset_rel_int atomic_cmpset_rel_32
#define atomic_fetchadd_int atomic_fetchadd_32
#define atomic_readandclear_int atomic_readandclear_32
-#define atomic_load_acq_int atomic_load_32
-#define atomic_store_rel_int atomic_store_32
-
-#define atomic_add_acq_32 atomic_add_32
-#define atomic_add_rel_32 atomic_add_32
-#define atomic_subtract_acq_32 atomic_subtract_32
-#define atomic_subtract_rel_32 atomic_subtract_32
-#define atomic_clear_acq_32 atomic_clear_32
-#define atomic_clear_rel_32 atomic_clear_32
-#define atomic_set_acq_32 atomic_set_32
-#define atomic_set_rel_32 atomic_set_32
-#define atomic_cmpset_acq_32 atomic_cmpset_32
-#define atomic_cmpset_rel_32 atomic_cmpset_32
-#define atomic_load_acq_32 atomic_load_32
-#define atomic_store_rel_32 atomic_store_32
+#define atomic_load_acq_int atomic_load_acq_32
+#define atomic_store_rel_int atomic_store_rel_32
#endif /* _MACHINE_ATOMIC_H_ */
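All of the ldrex/strex primitives added above share the same load/modify/store-exclusive retry shape, and atomic_cmpset_32() is the piece the rest of the kernel composes. The sketch below is an illustration only (the helper name atomic_or_fetch_32 is hypothetical, not part of this commit): it layers an or-and-return-new-value operation on top of atomic_cmpset_32(), relying only on its documented return value of non-zero on success and zero when another CPU won the race.

/*
 * Illustration only: a hypothetical or-and-fetch built from atomic_cmpset_32()
 * as declared in the patched <machine/atomic.h> above.
 */
#include <sys/types.h>
#include <machine/atomic.h>

static __inline uint32_t
atomic_or_fetch_32(volatile uint32_t *p, uint32_t bits)
{
	uint32_t oldval, newval;

	do {
		oldval = *p;			/* snapshot the current value */
		newval = oldval | bits;		/* value we want to install */
		/* retry if another CPU updated *p between the load and the CAS */
	} while (atomic_cmpset_32(p, oldval, newval) == 0);

	return (newval);
}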
diff --git a/sys/arm/include/cpuconf.h b/sys/arm/include/cpuconf.h
index 7fa6157..95d4b91 100644
--- a/sys/arm/include/cpuconf.h
+++ b/sys/arm/include/cpuconf.h
@@ -63,7 +63,9 @@
defined(CPU_XSCALE_PXA2X0) + \
defined(CPU_FA526) + \
defined(CPU_FA626TE) + \
- defined(CPU_XSCALE_IXP425))
+ defined(CPU_XSCALE_IXP425)) + \
+ defined(CPU_CORTEXA) + \
+ defined(CPU_MV_PJ4B)
/*
* Step 2: Determine which ARM architecture versions are configured.
@@ -86,18 +88,26 @@
#define ARM_ARCH_5 0
#endif
-#if defined(CPU_ARM11)
+#if !defined(ARM_ARCH_6)
+#if defined(CPU_ARM11) || defined(CPU_MV_PJ4B)
#define ARM_ARCH_6 1
#else
#define ARM_ARCH_6 0
#endif
+#endif
+
+#if defined(CPU_CORTEXA)
+#define ARM_ARCH_7A 1
+#else
+#define ARM_ARCH_7A 0
+#endif
-#define ARM_NARCH (ARM_ARCH_4 + ARM_ARCH_5 + ARM_ARCH_6)
+#define ARM_NARCH (ARM_ARCH_4 + ARM_ARCH_5 + ARM_ARCH_6 | ARM_ARCH_7A)
#if ARM_NARCH == 0 && !defined(KLD_MODULE) && defined(_KERNEL)
#error ARM_NARCH is 0
#endif
-#if ARM_ARCH_5 || ARM_ARCH_6
+#if ARM_ARCH_5 || ARM_ARCH_6 || ARM_ARCH_7A
/*
* We could support Thumb code on v4T, but the lack of clean interworking
* makes that hard.
@@ -113,6 +123,10 @@
*
* ARM_MMU_GENERIC Generic ARM MMU, compatible with ARM6.
*
+ * ARM_MMU_V6 ARMv6 MMU.
+ *
+ * ARM_MMU_V7 ARMv7 MMU.
+ *
* ARM_MMU_SA1 StrongARM SA-1 MMU. Compatible with generic
* ARM MMU, but has no write-through cache mode.
*
@@ -128,13 +142,25 @@
#if (defined(CPU_ARM6) || defined(CPU_ARM7) || defined(CPU_ARM7TDMI) || \
defined(CPU_ARM8) || defined(CPU_ARM9) || defined(CPU_ARM9E) || \
- defined(CPU_ARM10) || defined(CPU_ARM11) || defined(CPU_FA526) || \
+ defined(CPU_ARM10) || defined(CPU_FA526) || \
defined(CPU_FA626TE))
#define ARM_MMU_GENERIC 1
#else
#define ARM_MMU_GENERIC 0
#endif
+#if defined(CPU_ARM11) || defined(CPU_MV_PJ4B)
+#define ARM_MMU_V6 1
+#else
+#define ARM_MMU_V6 0
+#endif
+
+#if defined(CPU_CORTEXA)
+#define ARM_MMU_V7 1
+#else
+#define ARM_MMU_V7 0
+#endif
+
#if (defined(CPU_SA110) || defined(CPU_SA1100) || defined(CPU_SA1110) ||\
defined(CPU_IXP12X0))
#define ARM_MMU_SA1 1
@@ -150,8 +176,8 @@
#define ARM_MMU_XSCALE 0
#endif
-#define ARM_NMMUS (ARM_MMU_MEMC + ARM_MMU_GENERIC + \
- ARM_MMU_SA1 + ARM_MMU_XSCALE)
+#define ARM_NMMUS (ARM_MMU_MEMC + ARM_MMU_GENERIC + ARM_MMU_V6 + \
+ ARM_MMU_V7 + ARM_MMU_SA1 + ARM_MMU_XSCALE)
#if ARM_NMMUS == 0 && !defined(KLD_MODULE) && defined(_KERNEL)
#error ARM_NMMUS is 0
#endif
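The counting scheme above relies on defined(FOO) evaluating to 1 or 0 in preprocessor arithmetic, so a kernel config with, say, "cpu CPU_CORTEXA" resolves to ARM_ARCH_7A == 1 and contributes one architecture to ARM_NARCH. A minimal sketch of how dependent code can key off those values (illustration only; the variable name is made up):

#include <machine/cpuconf.h>

#if ARM_ARCH_7A
static const char *demo_arch = "ARMv7-A";	/* e.g. cpu CPU_CORTEXA */
#elif ARM_ARCH_6
static const char *demo_arch = "ARMv6";		/* e.g. cpu CPU_ARM11 or CPU_MV_PJ4B */
#else
static const char *demo_arch = "ARMv4/ARMv5";
#endif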
diff --git a/sys/arm/include/cpufunc.h b/sys/arm/include/cpufunc.h
index dbba8b1..d774fbe 100644
--- a/sys/arm/include/cpufunc.h
+++ b/sys/arm/include/cpufunc.h
@@ -176,6 +176,8 @@ extern u_int cputype;
#define cpu_faultstatus() cpufuncs.cf_faultstatus()
#define cpu_faultaddress() cpufuncs.cf_faultaddress()
+#ifndef SMP
+
#define cpu_tlb_flushID() cpufuncs.cf_tlb_flushID()
#define cpu_tlb_flushID_SE(e) cpufuncs.cf_tlb_flushID_SE(e)
#define cpu_tlb_flushI() cpufuncs.cf_tlb_flushI()
@@ -183,6 +185,51 @@ extern u_int cputype;
#define cpu_tlb_flushD() cpufuncs.cf_tlb_flushD()
#define cpu_tlb_flushD_SE(e) cpufuncs.cf_tlb_flushD_SE(e)
+#else
+void tlb_broadcast(int);
+
+#ifdef CPU_CORTEXA
+#define TLB_BROADCAST /* No need to explicitly send an IPI */
+#else
+#define TLB_BROADCAST tlb_broadcast(7)
+#endif
+
+#define cpu_tlb_flushID() do { \
+ cpufuncs.cf_tlb_flushID(); \
+ TLB_BROADCAST; \
+} while(0)
+
+#define cpu_tlb_flushID_SE(e) do { \
+ cpufuncs.cf_tlb_flushID_SE(e); \
+ TLB_BROADCAST; \
+} while(0)
+
+
+#define cpu_tlb_flushI() do { \
+ cpufuncs.cf_tlb_flushI(); \
+ TLB_BROADCAST; \
+} while(0)
+
+
+#define cpu_tlb_flushI_SE(e) do { \
+ cpufuncs.cf_tlb_flushI_SE(e); \
+ TLB_BROADCAST; \
+} while(0)
+
+
+#define cpu_tlb_flushD() do { \
+ cpufuncs.cf_tlb_flushD(); \
+ TLB_BROADCAST; \
+} while(0)
+
+
+#define cpu_tlb_flushD_SE(e) do { \
+ cpufuncs.cf_tlb_flushD_SE(e); \
+ TLB_BROADCAST; \
+} while(0)
+
+#endif
+
#define cpu_icache_sync_all() cpufuncs.cf_icache_sync_all()
#define cpu_icache_sync_range(a, s) cpufuncs.cf_icache_sync_range((a), (s))
@@ -222,10 +269,12 @@ int cpufunc_null_fixup (void *);
int early_abort_fixup (void *);
int late_abort_fixup (void *);
u_int cpufunc_id (void);
+u_int cpufunc_cpuid (void);
u_int cpufunc_control (u_int clear, u_int bic);
void cpufunc_domains (u_int domains);
u_int cpufunc_faultstatus (void);
u_int cpufunc_faultaddress (void);
+u_int cpu_pfr (int);
#ifdef CPU_ARM3
u_int arm3_control (u_int clear, u_int bic);
@@ -413,8 +462,9 @@ void sheeva_l2cache_wb_range (vm_offset_t, vm_size_t);
void sheeva_l2cache_wbinv_all (void);
#endif
-#ifdef CPU_ARM11
+#if defined(CPU_ARM11) || defined(CPU_MV_PJ4B) || defined(CPU_CORTEXA)
void arm11_setttb (u_int);
+void arm11_sleep (int);
void arm11_tlb_flushID_SE (u_int);
void arm11_tlb_flushI_SE (u_int);
@@ -428,6 +478,51 @@ void arm11_tlb_flushD (void);
void arm11_tlb_flushD_SE (u_int va);
void arm11_drain_writebuf (void);
+
+void pj4b_setttb (u_int);
+
+void pj4b_icache_sync_range (vm_offset_t, vm_size_t);
+
+void pj4b_dcache_wbinv_range (vm_offset_t, vm_size_t);
+void pj4b_dcache_inv_range (vm_offset_t, vm_size_t);
+void pj4b_dcache_wb_range (vm_offset_t, vm_size_t);
+
+void pj4b_idcache_wbinv_range (vm_offset_t, vm_size_t);
+
+void pj4b_drain_readbuf (void);
+void pj4b_flush_brnchtgt_all (void);
+void pj4b_flush_brnchtgt_va (u_int);
+void pj4b_sleep (int);
+
+void armv6_icache_sync_all (void);
+void armv6_dcache_wbinv_all (void);
+void armv6_idcache_wbinv_all (void);
+
+void armv7_setttb (u_int);
+void armv7_tlb_flushID (void);
+void armv7_tlb_flushID_SE (u_int);
+void armv7_icache_sync_range (vm_offset_t, vm_size_t);
+void armv7_idcache_wbinv_range (vm_offset_t, vm_size_t);
+void armv7_dcache_wbinv_all (void);
+void armv7_idcache_wbinv_all (void);
+void armv7_dcache_wbinv_range (vm_offset_t, vm_size_t);
+void armv7_dcache_inv_range (vm_offset_t, vm_size_t);
+void armv7_dcache_wb_range (vm_offset_t, vm_size_t);
+void armv7_cpu_sleep (int);
+void armv7_setup (char *string);
+void armv7_context_switch (void);
+void armv7_drain_writebuf (void);
+void armv7_sev (void);
+u_int armv7_auxctrl (u_int, u_int);
+void pj4bv7_setup (char *string);
+void pj4bv6_setup (char *string);
+void pj4b_config (void);
+
+int get_core_id (void);
+
+void armadaxp_idcache_wbinv_all (void);
+
+void cortexa_setup (char *);
#endif
#if defined(CPU_ARM9E) || defined (CPU_ARM10)
@@ -445,7 +540,7 @@ void armv5_ec_idcache_wbinv_all(void);
void armv5_ec_idcache_wbinv_range(vm_offset_t, vm_size_t);
#endif
-#if defined (CPU_ARM10) || defined (CPU_ARM11)
+#if defined (CPU_ARM10)
void armv5_setttb(u_int);
void armv5_icache_sync_all(void);
@@ -636,6 +731,10 @@ extern int arm_pcache_unified;
extern int arm_dcache_align;
extern int arm_dcache_align_mask;
+extern u_int arm_cache_level;
+extern u_int arm_cache_loc;
+extern u_int arm_cache_type[14];
+
#endif /* _KERNEL */
#endif /* _MACHINE_CPUFUNC_H_ */
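In the SMP case each cpu_tlb_flush* macro above expands to a local flush followed by TLB_BROADCAST, and the do { ... } while (0) wrapper keeps the pair usable as a single statement. A minimal sketch of the same idiom (DEMO_TLB_FLUSH_VA is a made-up name mirroring cpu_tlb_flushID_SE(); tlb_broadcast() is the helper declared above for SMP kernels):

#include <sys/types.h>
#include <machine/cpufunc.h>

void	tlb_broadcast(int);		/* from cpufunc.h when SMP is defined */

#define	DEMO_TLB_FLUSH_VA(va) do {					\
	cpufuncs.cf_tlb_flushID_SE(va);	/* flush the local TLB entry */	\
	tlb_broadcast(7);		/* IPI the other cores */	\
} while (0)

static void
demo_flush(u_int va, int need_flush)
{

	if (need_flush)
		DEMO_TLB_FLUSH_VA(va);	/* expands to exactly one statement */
}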
diff --git a/sys/arm/include/fp.h b/sys/arm/include/fp.h
index 25effd6..fee313e 100644
--- a/sys/arm/include/fp.h
+++ b/sys/arm/include/fp.h
@@ -66,12 +66,19 @@ typedef struct fp_extended_precision fp_reg_t;
* This needs to move and be hidden from userland.
*/
+#ifdef ARM_VFP_SUPPORT
+struct vfp_state {
+ u_int64_t reg[32];
+ u_int32_t fpscr;
+};
+#else
struct fpe_sp_state {
unsigned int fp_flags;
unsigned int fp_sr;
unsigned int fp_cr;
fp_reg_t fp_registers[16];
};
+#endif
/*
* Type for a saved FP context, if we want to translate the context to a
diff --git a/sys/arm/include/intr.h b/sys/arm/include/intr.h
index bf6a15a..0aee703 100644
--- a/sys/arm/include/intr.h
+++ b/sys/arm/include/intr.h
@@ -50,6 +50,8 @@
#elif defined(CPU_ARM9) || defined(SOC_MV_KIRKWOOD) || \
defined(CPU_XSCALE_IXP435)
#define NIRQ 64
+#elif defined(CPU_CORTEXA)
+#define NIRQ 128
#else
#define NIRQ 32
#endif
@@ -63,4 +65,7 @@ void arm_setup_irqhandler(const char *, int (*)(void*), void (*)(void*),
void *, int, int, void **);
int arm_remove_irqhandler(int, void *);
extern void (*arm_post_filter)(void *);
+
+void gic_init_secondary(void);
+
#endif /* _MACHINE_INTR_H */
diff --git a/sys/arm/include/md_var.h b/sys/arm/include/md_var.h
index 1f622e2..efb973d 100644
--- a/sys/arm/include/md_var.h
+++ b/sys/arm/include/md_var.h
@@ -62,6 +62,7 @@ enum cpu_class {
CPU_CLASS_ARM9EJS,
CPU_CLASS_ARM10E,
CPU_CLASS_ARM10EJ,
+ CPU_CLASS_CORTEXA,
CPU_CLASS_SA1,
CPU_CLASS_XSCALE,
CPU_CLASS_ARM11J,
diff --git a/sys/arm/include/param.h b/sys/arm/include/param.h
index bb76c5f..4a7ebbd 100644
--- a/sys/arm/include/param.h
+++ b/sys/arm/include/param.h
@@ -56,17 +56,25 @@
#define MACHINE "arm"
#endif
#ifndef MACHINE_ARCH
+#ifdef __FreeBSD_ARCH_armv6__
+#ifdef __ARMEB__
+#define MACHINE_ARCH "armv6eb"
+#else
+#define MACHINE_ARCH "armv6"
+#endif
+#else
#ifdef __ARMEB__
#define MACHINE_ARCH "armeb"
#else
#define MACHINE_ARCH "arm"
#endif
#endif
+#endif
#define MID_MACHINE MID_ARM6
#if defined(SMP) || defined(KLD_MODULE)
#ifndef MAXCPU
-#define MAXCPU 2
+#define MAXCPU 4
#endif
#else
#define MAXCPU 1
diff --git a/sys/arm/include/pcb.h b/sys/arm/include/pcb.h
index ce9ab97..ec15c9b 100644
--- a/sys/arm/include/pcb.h
+++ b/sys/arm/include/pcb.h
@@ -80,7 +80,12 @@ struct pcb {
#define PCB_NOALIGNFLT 0x00000002
caddr_t pcb_onfault; /* On fault handler */
struct pcb_arm32 un_32;
+#ifdef ARM_VFP_SUPPORT
+ struct vfp_state pcb_vfpstate; /* VFP/NEON state */
+ u_int pcb_vfpcpu; /* VFP/NEON last cpu */
+#else
struct fpe_sp_state pcb_fpstate; /* Floating Point state */
+#endif
};
/*
diff --git a/sys/arm/include/pcpu.h b/sys/arm/include/pcpu.h
index 184db4c..f12f903 100644
--- a/sys/arm/include/pcpu.h
+++ b/sys/arm/include/pcpu.h
@@ -32,6 +32,7 @@
#ifdef _KERNEL
+#include <machine/cpuconf.h>
#include <machine/frame.h>
#define ALT_STACK_SIZE 128
@@ -40,7 +41,18 @@ struct vmspace;
#endif /* _KERNEL */
-#define PCPU_MD_FIELDS
+#ifdef ARM_VFP_SUPPORT
+#define PCPU_MD_FIELDS \
+ unsigned int pc_cpu; \
+ unsigned int pc_vfpsid; \
+ unsigned int pc_vfpmvfr0; \
+ unsigned int pc_vfpmvfr1; \
+ struct thread *pc_vfpcthread; \
+ struct pmap *pc_curpmap;
+#else
+#define PCPU_MD_FIELDS
+#endif
+
#ifdef _KERNEL
@@ -48,19 +60,50 @@ struct pcb;
struct pcpu;
extern struct pcpu *pcpup;
-extern struct pcpu __pcpu;
+#if ARM_ARCH_6 || ARM_ARCH_7A
+/* or ARM_TP_ADDRESS mark REMOVE ME NOTE */
+static inline struct pcpu *
+get_pcpu(void)
+{
+ void *pcpu;
-#define PCPU_GET(member) (__pcpu.pc_ ## member)
+ __asm __volatile("mrc p15, 0, %0, c13, c0, 4" : "=r" (pcpu));
+ return (pcpu);
+}
-/*
- * XXX The implementation of this operation should be made atomic
- * with respect to preemption.
- */
-#define PCPU_ADD(member, value) (__pcpu.pc_ ## member += (value))
+static inline void
+set_pcpu(void *pcpu)
+{
+
+ __asm __volatile("mcr p15, 0, %0, c13, c0, 4" : : "r" (pcpu));
+}
+
+static inline void *
+get_tls(void)
+{
+ void *tls;
+
+ __asm __volatile("mrc p15, 0, %0, c13, c0, 3" : "=r" (tls));
+ return (tls);
+}
+
+static inline void
+set_tls(void *tls)
+{
+
+ __asm __volatile("mcr p15, 0, %0, c13, c0, 3" : : "r" (tls));
+}
+#else
+#define get_pcpu() pcpup
+#endif
+
+#define PCPU_GET(member) (get_pcpu()->pc_ ## member)
+#define PCPU_ADD(member, value) (get_pcpu()->pc_ ## member += (value))
#define PCPU_INC(member) PCPU_ADD(member, 1)
-#define PCPU_PTR(member) (&__pcpu.pc_ ## member)
-#define PCPU_SET(member,value) (__pcpu.pc_ ## member = (value))
+#define PCPU_PTR(member) (&pcpup->pc_ ## member)
+#define PCPU_SET(member,value) (pcpup->pc_ ## member = (value))
+void pcpu0_init(void);
#endif /* _KERNEL */
#endif /* !_MACHINE_PCPU_H_ */
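With the accessors above, the per-CPU pointer lives in a CP15 c13 register (and the TLS pointer in its neighbour) instead of a single global __pcpu, which is what lets PCPU_GET() work unchanged on every core. A minimal usage sketch (demo_curcpu is a hypothetical function; the includes follow the usual kernel conventions):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>

static u_int
demo_curcpu(void)
{
	u_int cpu;

	critical_enter();		/* don't migrate while reading pcpu */
	cpu = PCPU_GET(cpuid);		/* get_pcpu()->pc_cpuid underneath */
	critical_exit();

	return (cpu);
}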
diff --git a/sys/arm/include/pl310.h b/sys/arm/include/pl310.h
new file mode 100644
index 0000000..98df53c
--- /dev/null
+++ b/sys/arm/include/pl310.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2012 Olivier Houchard. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/*
+ * $FreeBSD$
+ */
+
+#ifndef PL310_H_
+#define PL310_H_
+struct pl310_softc {
+ struct resource *sc_mem_res;
+};
+
+void platform_init_pl310(struct pl310_softc *sc);
+
+#endif /* PL310_H_ */
diff --git a/sys/arm/include/pmap.h b/sys/arm/include/pmap.h
index 85ea31a..7a3c8ee 100644
--- a/sys/arm/include/pmap.h
+++ b/sys/arm/include/pmap.h
@@ -55,9 +55,30 @@
/*
* Pte related macros
*/
-#define PTE_NOCACHE 0
-#define PTE_CACHE 1
-#define PTE_PAGETABLE 2
+#if ARM_ARCH_6 || ARM_ARCH_7A
+#ifdef SMP
+#define PTE_NOCACHE 2
+#else
+#define PTE_NOCACHE 1
+#endif
+#define PTE_CACHE 4
+#define PTE_DEVICE 2
+#define PTE_PAGETABLE 4
+#else
+#define PTE_NOCACHE 1
+#define PTE_CACHE 2
+#define PTE_PAGETABLE 3
+#endif
+
+enum mem_type {
+ STRONG_ORD = 0,
+ DEVICE_NOSHARE,
+ DEVICE_SHARE,
+ NRML_NOCACHE,
+ NRML_IWT_OWT,
+ NRML_IWB_OWB,
+ NRML_IWBA_OWBA
+};
#ifndef LOCORE
@@ -209,6 +230,7 @@ extern vm_offset_t virtual_avail;
extern vm_offset_t virtual_end;
void pmap_bootstrap(vm_offset_t, vm_offset_t, struct pv_addr *);
+int pmap_change_attr(vm_offset_t, vm_size_t, int);
void pmap_kenter(vm_offset_t va, vm_paddr_t pa);
void pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa);
void *pmap_kenter_temp(vm_paddr_t pa, int i);
@@ -225,6 +247,7 @@ void
pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
int cache);
int pmap_fault_fixup(pmap_t, vm_offset_t, vm_prot_t, int);
+int pmap_dmap_iscurrent(pmap_t pmap);
/*
* Definitions for MMU domains
@@ -251,18 +274,11 @@ extern int pmap_needs_pte_sync;
* We use these macros since we use different bits on different processor
* models.
*/
-#define L1_S_PROT_U (L1_S_AP(AP_U))
-#define L1_S_PROT_W (L1_S_AP(AP_W))
-#define L1_S_PROT_MASK (L1_S_PROT_U|L1_S_PROT_W)
#define L1_S_CACHE_MASK_generic (L1_S_B|L1_S_C)
#define L1_S_CACHE_MASK_xscale (L1_S_B|L1_S_C|L1_S_XSCALE_TEX(TEX_XSCALE_X)|\
L1_S_XSCALE_TEX(TEX_XSCALE_T))
-#define L2_L_PROT_U (L2_AP(AP_U))
-#define L2_L_PROT_W (L2_AP(AP_W))
-#define L2_L_PROT_MASK (L2_L_PROT_U|L2_L_PROT_W)
-
#define L2_L_CACHE_MASK_generic (L2_B|L2_C)
#define L2_L_CACHE_MASK_xscale (L2_B|L2_C|L2_XSCALE_L_TEX(TEX_XSCALE_X) | \
L2_XSCALE_L_TEX(TEX_XSCALE_T))
@@ -293,6 +309,11 @@ extern int pmap_needs_pte_sync;
/*
* User-visible names for the ones that vary with MMU class.
*/
+#if (ARM_MMU_V6 + ARM_MMU_V7) != 0
+#define L2_AP(x) (L2_AP0(x))
+#else
+#define L2_AP(x) (L2_AP0(x) | L2_AP1(x) | L2_AP2(x) | L2_AP3(x))
+#endif
#if ARM_NMMUS > 1
/* More than one MMU class configured; use variables. */
@@ -334,6 +355,77 @@ extern int pmap_needs_pte_sync;
#define L1_C_PROTO L1_C_PROTO_xscale
#define L2_S_PROTO L2_S_PROTO_xscale
+#elif (ARM_MMU_V6 + ARM_MMU_V7) != 0
+
+#define L2_S_PROT_U (L2_AP0(2)) /* user access */
+#define L2_S_PROT_R (L2_APX|L2_AP0(1)) /* read access */
+
+#define L2_S_PROT_MASK (L2_S_PROT_U|L2_S_PROT_R)
+#define L2_S_WRITABLE(pte) (!(pte & L2_APX))
+
+#ifndef SMP
+#define L1_S_CACHE_MASK (L1_S_TEX_MASK|L1_S_B|L1_S_C)
+#define L2_L_CACHE_MASK (L2_L_TEX_MASK|L2_B|L2_C)
+#define L2_S_CACHE_MASK (L2_S_TEX_MASK|L2_B|L2_C)
+#else
+#define L1_S_CACHE_MASK (L1_S_TEX_MASK|L1_S_B|L1_S_C|L1_SHARED)
+#define L2_L_CACHE_MASK (L2_L_TEX_MASK|L2_B|L2_C|L2_SHARED)
+#define L2_S_CACHE_MASK (L2_S_TEX_MASK|L2_B|L2_C|L2_SHARED)
+#endif /* SMP */
+
+#define L1_S_PROTO (L1_TYPE_S)
+#define L1_C_PROTO (L1_TYPE_C)
+#define L2_S_PROTO (L2_TYPE_S)
+
+#ifndef SMP
+#define ARM_L1S_STRONG_ORD (0)
+#define ARM_L1S_DEVICE_NOSHARE (L1_S_TEX(2))
+#define ARM_L1S_DEVICE_SHARE (L1_S_B)
+#define ARM_L1S_NRML_NOCACHE (L1_S_TEX(1))
+#define ARM_L1S_NRML_IWT_OWT (L1_S_C)
+#define ARM_L1S_NRML_IWB_OWB (L1_S_C|L1_S_B)
+#define ARM_L1S_NRML_IWBA_OWBA (L1_S_TEX(1)|L1_S_C|L1_S_B)
+
+#define ARM_L2L_STRONG_ORD (0)
+#define ARM_L2L_DEVICE_NOSHARE (L2_L_TEX(2))
+#define ARM_L2L_DEVICE_SHARE (L2_B)
+#define ARM_L2L_NRML_NOCACHE (L2_L_TEX(1))
+#define ARM_L2L_NRML_IWT_OWT (L2_C)
+#define ARM_L2L_NRML_IWB_OWB (L2_C|L2_B)
+#define ARM_L2L_NRML_IWBA_OWBA (L2_L_TEX(1)|L2_C|L2_B)
+
+#define ARM_L2S_STRONG_ORD (0)
+#define ARM_L2S_DEVICE_NOSHARE (L2_S_TEX(2))
+#define ARM_L2S_DEVICE_SHARE (L2_B)
+#define ARM_L2S_NRML_NOCACHE (L2_S_TEX(1))
+#define ARM_L2S_NRML_IWT_OWT (L2_C)
+#define ARM_L2S_NRML_IWB_OWB (L2_C|L2_B)
+#define ARM_L2S_NRML_IWBA_OWBA (L2_S_TEX(1)|L2_C|L2_B)
+#else
+#define ARM_L1S_STRONG_ORD (0)
+#define ARM_L1S_DEVICE_NOSHARE (L1_S_TEX(2))
+#define ARM_L1S_DEVICE_SHARE (L1_S_B)
+#define ARM_L1S_NRML_NOCACHE (L1_S_TEX(1)|L1_SHARED)
+#define ARM_L1S_NRML_IWT_OWT (L1_S_C|L1_SHARED)
+#define ARM_L1S_NRML_IWB_OWB (L1_S_C|L1_S_B|L1_SHARED)
+#define ARM_L1S_NRML_IWBA_OWBA (L1_S_TEX(1)|L1_S_C|L1_S_B|L1_SHARED)
+
+#define ARM_L2L_STRONG_ORD (0)
+#define ARM_L2L_DEVICE_NOSHARE (L2_L_TEX(2))
+#define ARM_L2L_DEVICE_SHARE (L2_B)
+#define ARM_L2L_NRML_NOCACHE (L2_L_TEX(1)|L2_SHARED)
+#define ARM_L2L_NRML_IWT_OWT (L2_C|L2_SHARED)
+#define ARM_L2L_NRML_IWB_OWB (L2_C|L2_B|L2_SHARED)
+#define ARM_L2L_NRML_IWBA_OWBA (L2_L_TEX(1)|L2_C|L2_B|L2_SHARED)
+
+#define ARM_L2S_STRONG_ORD (0)
+#define ARM_L2S_DEVICE_NOSHARE (L2_S_TEX(2))
+#define ARM_L2S_DEVICE_SHARE (L2_B)
+#define ARM_L2S_NRML_NOCACHE (L2_S_TEX(1)|L2_SHARED)
+#define ARM_L2S_NRML_IWT_OWT (L2_C|L2_SHARED)
+#define ARM_L2S_NRML_IWB_OWB (L2_C|L2_B|L2_SHARED)
+#define ARM_L2S_NRML_IWBA_OWBA (L2_S_TEX(1)|L2_C|L2_B|L2_SHARED)
+#endif /* SMP */
#endif /* ARM_NMMUS > 1 */
#if (ARM_MMU_SA1 == 1) && (ARM_NMMUS == 1)
@@ -350,14 +442,41 @@ extern int pmap_needs_pte_sync;
* These macros return various bits based on kernel/user and protection.
* Note that the compiler will usually fold these at compile time.
*/
+#if (ARM_MMU_V6 + ARM_MMU_V7) == 0
+
+#define L1_S_PROT_U (L1_S_AP(AP_U))
+#define L1_S_PROT_W (L1_S_AP(AP_W))
+#define L1_S_PROT_MASK (L1_S_PROT_U|L1_S_PROT_W)
+#define L1_S_WRITABLE(pd) ((pd) & L1_S_PROT_W)
+
#define L1_S_PROT(ku, pr) ((((ku) == PTE_USER) ? L1_S_PROT_U : 0) | \
(((pr) & VM_PROT_WRITE) ? L1_S_PROT_W : 0))
+#define L2_L_PROT_U (L2_AP(AP_U))
+#define L2_L_PROT_W (L2_AP(AP_W))
+#define L2_L_PROT_MASK (L2_L_PROT_U|L2_L_PROT_W)
+
#define L2_L_PROT(ku, pr) ((((ku) == PTE_USER) ? L2_L_PROT_U : 0) | \
(((pr) & VM_PROT_WRITE) ? L2_L_PROT_W : 0))
#define L2_S_PROT(ku, pr) ((((ku) == PTE_USER) ? L2_S_PROT_U : 0) | \
(((pr) & VM_PROT_WRITE) ? L2_S_PROT_W : 0))
+#else
+#define L1_S_PROT_U (L1_S_AP(AP_U))
+#define L1_S_PROT_MASK (L1_S_APX|L1_S_AP(0x3))
+#define L1_S_WRITABLE(pd) (!((pd) & L1_S_APX))
+
+#define L1_S_PROT(ku, pr) (L1_S_PROT_MASK & ~((((ku) == PTE_KERNEL) ? L1_S_PROT_U : 0) | \
+ (((pr) & VM_PROT_WRITE) ? L1_S_APX : 0)))
+
+#define L2_L_PROT_MASK (L2_APX|L2_AP0(0x3))
+#define L2_L_PROT(ku, pr) (L2_L_PROT_MASK & ~((((ku) == PTE_KERNEL) ? L2_S_PROT_U : 0) | \
+ (((pr) & VM_PROT_WRITE) ? L2_APX : 0)))
+
+#define L2_S_PROT(ku, pr) (L2_S_PROT_MASK & ~((((ku) == PTE_KERNEL) ? L2_S_PROT_U : 0) | \
+ (((pr) & VM_PROT_WRITE) ? L2_APX : 0)))
+
+#endif
/*
* Macros to test if a mapping is mappable with an L1 Section mapping
@@ -422,7 +541,7 @@ extern pt_entry_t pte_l2_s_proto;
extern void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t);
extern void (*pmap_zero_page_func)(vm_paddr_t, int, int);
-#if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_81342)
+#if (ARM_MMU_GENERIC + ARM_MMU_V6 + ARM_MMU_V7 + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_81342)
void pmap_copy_page_generic(vm_paddr_t, vm_paddr_t);
void pmap_zero_page_generic(vm_paddr_t, int, int);
@@ -436,6 +555,9 @@ void pmap_pte_init_arm9(void);
#if defined(CPU_ARM10)
void pmap_pte_init_arm10(void);
#endif /* CPU_ARM10 */
+#if (ARM_MMU_V6 + ARM_MMU_V7) != 0
+void pmap_pte_init_mmu_v6(void);
+#endif /* (ARM_MMU_V6 + ARM_MMU_V7) != 0 */
#endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
#if /* ARM_MMU_SA1 == */1
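The new mem_type enum and the ARM_L1S_/ARM_L2L_/ARM_L2S_ attribute macros above are parallel: the enum names the seven ARMv6/v7 memory types and the macros give the TEX/C/B (and, for SMP, shared) encodings for each page-table level. A pmap implementation can therefore translate one into the other with a simple lookup table; the sketch below is an illustration only (the table and function names are hypothetical, and it assumes an ARMv6/v7 configuration so the macros are defined):

#include <sys/types.h>
#include <machine/pmap.h>

static const uint32_t demo_l1s_attr[] = {
	[STRONG_ORD]	 = ARM_L1S_STRONG_ORD,
	[DEVICE_NOSHARE] = ARM_L1S_DEVICE_NOSHARE,
	[DEVICE_SHARE]	 = ARM_L1S_DEVICE_SHARE,
	[NRML_NOCACHE]	 = ARM_L1S_NRML_NOCACHE,
	[NRML_IWT_OWT]	 = ARM_L1S_NRML_IWT_OWT,
	[NRML_IWB_OWB]	 = ARM_L1S_NRML_IWB_OWB,
	[NRML_IWBA_OWBA] = ARM_L1S_NRML_IWBA_OWBA,
};

/* L1 section attribute bits for a given memory type. */
static uint32_t
demo_l1s_bits(enum mem_type type)
{

	return (demo_l1s_attr[type]);
}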
diff --git a/sys/arm/include/pte.h b/sys/arm/include/pte.h
index 5921d20..1a91786 100644
--- a/sys/arm/include/pte.h
+++ b/sys/arm/include/pte.h
@@ -37,18 +37,11 @@
#ifndef _MACHINE_PTE_H_
#define _MACHINE_PTE_H_
-#define PDSHIFT 20 /* LOG2(NBPDR) */
-#define NBPD (1 << PDSHIFT) /* bytes/page dir */
-#define NPTEPD (NBPD / PAGE_SIZE)
-
#ifndef LOCORE
typedef uint32_t pd_entry_t; /* page directory entry */
typedef uint32_t pt_entry_t; /* page table entry */
#endif
-#define PD_MASK 0xfff00000 /* page directory address bits */
-#define PT_MASK 0x000ff000 /* page table address bits */
-
#define PG_FRAME 0xfffff000
/* The PT_SIZE definition is misleading... A page table is only 0x400
@@ -73,27 +66,6 @@ typedef uint32_t pt_entry_t; /* page table entry */
#define L2_MASK 0x03 /* Mask for L2 entry type */
#define L2_INVAL 0x00 /* L2 invalid type */
-/* PTE construction macros */
-#define L2_LPTE(p, a, f) ((p) | PT_AP(a) | L2_LPAGE | (f))
-#define L2_SPTE(p, a, f) ((p) | PT_AP(a) | L2_SPAGE | (f))
-#define L2_PTE(p, a) L2_SPTE((p), (a), PT_CACHEABLE)
-#define L2_PTE_NC(p, a) L2_SPTE((p), (a), PT_B)
-#define L2_PTE_NC_NB(p, a) L2_SPTE((p), (a), 0)
-#define L1_SECPTE(p, a, f) ((p) | ((a) << AP_SECTION_SHIFT) | (f) \
- | L1_SECTION | PT_U)
-
-#define L1_PTE(p) ((p) | 0x00 | L1_PAGE | PT_U)
-#define L1_SEC(p, c) L1_SECPTE((p), AP_KRW, (c))
-
-#define L1_SEC_SIZE (1 << PDSHIFT)
-#define L2_LPAGE_SIZE (NBPG * 16)
-
-/* Domain types */
-#define DOMAIN_FAULT 0x00
-#define DOMAIN_CLIENT 0x01
-#define DOMAIN_RESERVED 0x02
-#define DOMAIN_MANAGER 0x03
-
/* L1 and L2 address masks */
#define L1_ADDR_MASK 0xfffffc00
#define L2_ADDR_MASK 0xfffff000
@@ -205,7 +177,10 @@ typedef uint32_t pt_entry_t; /* page table entry */
#define L1_S_DOM_MASK L1_S_DOM(0xf)
#define L1_S_AP(x) ((x) << 10) /* access permissions */
#define L1_S_ADDR_MASK 0xfff00000 /* phys address of section */
-#define L1_SHARED (1 << 16)
+#define L1_S_TEX(x) (((x) & 0x7) << 12) /* Type Extension */
+#define L1_S_TEX_MASK (0x7 << 12) /* Type Extension */
+#define L1_S_APX (1 << 15)
+#define L1_SHARED (1 << 16)
#define L1_S_XSCALE_P 0x00000200 /* ECC enable for this section */
#define L1_S_XSCALE_TEX(x) ((x) << 12) /* Type Extension */
@@ -256,7 +231,14 @@ typedef uint32_t pt_entry_t; /* page table entry */
#define L2_AP1(x) ((x) << 6) /* access permissions (sp 1) */
#define L2_AP2(x) ((x) << 8) /* access permissions (sp 2) */
#define L2_AP3(x) ((x) << 10) /* access permissions (sp 3) */
-#define L2_AP(x) (L2_AP0(x) | L2_AP1(x) | L2_AP2(x) | L2_AP3(x))
+
+#define L2_SHARED (1 << 10)
+#define L2_APX (1 << 9)
+#define L2_XN (1 << 0)
+#define L2_L_TEX_MASK (0x7 << 12) /* Type Extension */
+#define L2_L_TEX(x) (((x) & 0x7) << 12)
+#define L2_S_TEX_MASK (0x7 << 6) /* Type Extension */
+#define L2_S_TEX(x) (((x) & 0x7) << 6)
#define L2_XSCALE_L_TEX(x) ((x) << 12) /* Type Extension */
#define L2_XSCALE_L_S(x) (1 << 15) /* Shared */
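Under the new encoding, APX plus the AP bits replace the old four-copy AP field: APX set makes a mapping read-only, and the AP value selects kernel-only versus user access, which is what the L2_S_PROT()/L2_S_WRITABLE() macros in pmap.h compute. The standalone program below just re-derives two common cases; it copies the constants from the patched headers (the (x) << 4 value for L2_AP0() is an assumption, since that line is outside the hunk shown):

#include <stdio.h>

#define	L2_AP0(x)	((x) << 4)		/* assumed; not in this hunk */
#define	L2_APX		(1 << 9)
#define	L2_S_PROT_U	(L2_AP0(2))		/* user access */
#define	L2_S_PROT_R	(L2_APX | L2_AP0(1))	/* read-only */
#define	L2_S_PROT_MASK	(L2_S_PROT_U | L2_S_PROT_R)
#define	L2_S_WRITABLE(pte)	(!((pte) & L2_APX))

int
main(void)
{
	/* kernel read/write: clear the user bit and APX -> AP == 01 */
	unsigned int krw = L2_S_PROT_MASK & ~(L2_S_PROT_U | L2_APX);
	/* user read/write: clear only APX -> AP == 11 */
	unsigned int urw = L2_S_PROT_MASK & ~L2_APX;

	printf("kernel rw: 0x%03x writable=%d\n", krw, L2_S_WRITABLE(krw));
	printf("user   rw: 0x%03x writable=%d\n", urw, L2_S_WRITABLE(urw));
	return (0);
}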
diff --git a/sys/arm/include/smp.h b/sys/arm/include/smp.h
index ca707e3..3cf3cc1 100644
--- a/sys/arm/include/smp.h
+++ b/sys/arm/include/smp.h
@@ -3,4 +3,33 @@
#ifndef _MACHINE_SMP_H_
#define _MACHINE_SMP_H_
+#include <sys/_cpuset.h>
+
+#define IPI_AST 0
+#define IPI_PREEMPT 2
+#define IPI_RENDEZVOUS 3
+#define IPI_STOP 4
+#define IPI_STOP_HARD 5
+#define IPI_HARDCLOCK 6
+#define IPI_TLB 7
+
+void init_secondary(int cpu);
+
+void ipi_all_but_self(u_int ipi);
+void ipi_cpu(int cpu, u_int ipi);
+void ipi_selected(cpuset_t cpus, u_int ipi);
+
+/* PIC interface */
+void pic_ipi_send(cpuset_t cpus, u_int ipi);
+void pic_ipi_clear(int ipi);
+int pic_ipi_get(int arg);
+
+/* Platform interface */
+void platform_mp_setmaxid(void);
+int platform_mp_probe(void);
+void platform_mp_start_ap(void);
+void platform_mp_init_secondary(void);
+
+void platform_ipi_send(cpuset_t cpus, u_int ipi);
+
#endif /* !_MACHINE_SMP_H_ */
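The split above is deliberate: the MI entry points (ipi_cpu() and friends) take a cpuset_t, and the PIC and platform layers only have to know how to deliver it. A minimal sketch of how a single-CPU IPI can be funnelled into pic_ipi_send() (demo_ipi_cpu is a hypothetical name; the real code lives in mp_machdep.c and the GIC driver):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <machine/smp.h>

static void
demo_ipi_cpu(int cpu, u_int ipi)
{
	cpuset_t cpus;

	CPU_ZERO(&cpus);
	CPU_SET(cpu, &cpus);		/* target exactly one core */
	pic_ipi_send(cpus, ipi);	/* e.g. ipi == IPI_RENDEZVOUS */
}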
diff --git a/sys/arm/include/sysarch.h b/sys/arm/include/sysarch.h
index 138e91f..b0de6db 100644
--- a/sys/arm/include/sysarch.h
+++ b/sys/arm/include/sysarch.h
@@ -50,9 +50,18 @@
 * if ARM_RAS_END moves in relation to ARM_RAS_START (look for occurrences
* of ldr/str rm,[rn, #4]).
*/
+
+/* ARM_TP_ADDRESS is needed for processors that don't support
+ * the exclusive-access opcodes introduced with ARMv6K. */
+/* TODO: #if !defined(_HAVE_ARMv6K_INSTRUCTIONS) */
+#if !defined (__ARM_ARCH_7__) && \
+ !defined (__ARM_ARCH_7A__) && \
+ !defined (__ARM_ARCH_6K__) && \
+ !defined (__ARM_ARCH_6ZK__)
#define ARM_TP_ADDRESS (ARM_VECTORS_HIGH + 0x1000)
#define ARM_RAS_START (ARM_TP_ADDRESS + 4)
#define ARM_RAS_END (ARM_TP_ADDRESS + 8)
+#endif
#ifndef LOCORE
#ifndef __ASSEMBLER__
diff --git a/sys/arm/include/vfp.h b/sys/arm/include/vfp.h
new file mode 100644
index 0000000..618f491
--- /dev/null
+++ b/sys/arm/include/vfp.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2012 Mark Tinguely
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * $FreeBSD$
+ */
+
+
+#ifndef _MACHINE__VFP_H_
+#define _MACHINE__VFP_H_
+
+/* fpsid, fpscr, fpexc are defined in the newer gas */
+#define VFPSID cr0
+#define VFPSCR cr1
+#define VMVFR1 cr6
+#define VMVFR0 cr7
+#define VFPEXC cr8
+#define VFPINST cr9 /* vfp 1 and 2 exception instruction */
+#define VFPINST2 cr10 /* vfp 2? */
+
+/* VFPSID */
+#define VFPSID_IMPLEMENTOR_OFF 24
+#define VFPSID_IMPLEMENTOR_MASK (0xff000000)
+#define VFPSID_HARDSOFT_IMP (0x00800000)
+#define VFPSID_SINGLE_PREC 20 /* version 1 and 2 */
+#define VFPSID_SUBVERSION_OFF 16
+#define VFPSID_SUBVERSION2_MASK (0x000f0000) /* version 1 and 2 */
+#define VFPSID_SUBVERSION3_MASK (0x007f0000) /* version 3 */
+#define VFP_ARCH3 (0x00030000)
+#define VFPSID_PARTNUMBER_OFF 8
+#define VFPSID_PARTNUMBER_MASK (0x0000ff00)
+#define VFPSID_VARIANT_OFF 4
+#define VFPSID_VARIANT_MASK (0x000000f0)
+#define VFPSID_REVISION_MASK 0x0f
+
+/* VFPSCR */
+#define VFPSCR_CC_N (0x80000000) /* comparison less than */
+#define VFPSCR_CC_Z (0x40000000) /* comparison equal */
+#define VFPSCR_CC_C (0x20000000) /* comparison =, >, or unordered */
+#define VFPSCR_CC_V (0x10000000) /* comparison unordered */
+#define VFPSCR_QC (0x08000000) /* saturation cumulative */
+#define VFPSCR_DN (0x02000000) /* default NaN enable */
+#define VFPSCR_FZ (0x01000000) /* flush to zero enabled */
+
+#define VFPSCR_RMODE_OFF 22 /* rounding mode offset */
+#define VFPSCR_RMODE_MASK (0x00c00000) /* rounding mode mask */
+#define VFPSCR_RMODE_RN (0x00000000) /* round nearest */
+#define VFPSCR_RMODE_RPI (0x00400000) /* round to plus infinity */
+#define VFPSCR_RMODE_RNI (0x00800000) /* round to neg infinity */
+#define VFPSCR_RMODE_RM (0x00c00000) /* round to zero */
+
+#define VFPSCR_STRIDE_OFF 20 /* vector stride -1 */
+#define VFPSCR_STRIDE_MASK (0x00300000)
+#define VFPSCR_LEN_OFF 16 /* vector length -1 */
+#define VFPSCR_LEN_MASK (0x00070000)
+#define VFPSCR_IDE (0x00008000) /* input subnormal exc enable */
+#define VFPSCR_IXE (0x00001000) /* inexact exception enable */
+#define VFPSCR_UFE (0x00000800) /* underflow exception enable */
+#define VFPSCR_OFE (0x00000400) /* overflow exception enable */
+#define VFPSCR_DNZ (0x00000200) /* div by zero exception en */
+#define VFPSCR_IOE (0x00000100) /* invalid op exc enable */
+#define VFPSCR_IDC (0x00000080) /* input subnormal cumul */
+#define VFPSCR_IXC (0x00000010) /* Inexact cumulative flag */
+#define VFPSCR_UFC (0x00000008) /* underflow cumulative flag */
+#define VFPSCR_OFC (0x00000004) /* overflow cumulative flag */
+#define VFPSCR_DZC (0x00000002) /* division by zero flag */
+#define VFPSCR_IOC (0x00000001) /* invalid operation cumul */
+
+/* VFPEXC */
+#define VFPEXC_EX (0x80000000) /* exception v1 v2 */
+#define VFPEXC_EN (0x40000000) /* vfp enable */
+
+/* version 3 registers */
+/* VMVFR0 */
+#define VMVFR0_RM_OFF 28
+#define VMVFR0_RM_MASK (0xf0000000) /* VFP rounding modes */
+
+#define VMVFR0_SV_OFF 24
+#define VMVFR0_SV_MASK (0x0f000000) /* VFP short vector supp */
+#define VMVFR0_SR_OFF 20
+#define VMVFR0_SR (0x00f00000) /* VFP hw sqrt supp */
+#define VMVFR0_D_OFF 16
+#define VMVFR0_D_MASK (0x000f0000) /* VFP divide supp */
+#define VMVFR0_TE_OFF 12
+#define VMVFR0_TE_MASK (0x0000f000) /* VFP trap exception supp */
+#define VMVFR0_DP_OFF 8
+#define VMVFR0_DP_MASK (0x00000f00) /* VFP double prec support */
+#define VMVFR0_SP_OFF 4
+#define VMVFR0_SP_MASK (0x000000f0) /* VFP single prec support */
+#define VMVFR0_RB_MASK (0x0000000f) /* VFP 64 bit media support */
+
+/* VMVFR1 */
+#define VMVFR1_SP_OFF 16
+#define VMVFR1_SP_MASK (0x000f0000) /* Neon single prec support */
+#define VMVFR1_I_OFF 12
+#define VMVFR1_I_MASK (0x0000f000) /* Neon integer support */
+#define VMVFR1_LS_OFF 8
+#define VMVFR1_LS_MASK (0x00000f00) /* Neon ld/st instr support */
+#define VMVFR1_DN_OFF 4
+#define VMVFR1_DN_MASK (0x000000f0) /* Neon prop NaN support */
+#define VMVFR1_FZ_MASK (0x0000000f) /* Neon denormal arith supp */
+
+#define COPROC10 (0x3 << 20)
+#define COPROC11 (0x3 << 22)
+
+
+#endif
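Reading the VFP system registers named above requires that access to coprocessors 10 and 11 has already been enabled (the COPROC10/COPROC11 bits), after which FPSID can be fetched and decoded with the masks in this header. A minimal sketch, illustration only; the raw "mrc p10" spelling of FMRX is assumed here because, as the header notes, only newer assemblers know the fpsid name:

#include <sys/types.h>
#include <machine/vfp.h>

static u_int
demo_vfp_implementor(void)
{
	u_int fpsid;

	/* FMRX rX, fpsid spelled as a raw coprocessor read for old gas. */
	__asm __volatile("mrc p10, 7, %0, c0, c0, 0" : "=r" (fpsid));

	return ((fpsid & VFPSID_IMPLEMENTOR_MASK) >> VFPSID_IMPLEMENTOR_OFF);
}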
diff --git a/sys/arm/include/vmparam.h b/sys/arm/include/vmparam.h
index edff00b..eac46cc 100644
--- a/sys/arm/include/vmparam.h
+++ b/sys/arm/include/vmparam.h
@@ -116,7 +116,9 @@
#endif
#define VM_MAXUSER_ADDRESS KERNBASE - ARM_KERN_DIRECTMAP
#else /* ARM_USE_SMALL_ALLOC */
+#ifndef VM_MAXUSER_ADDRESS
#define VM_MAXUSER_ADDRESS KERNBASE
+#endif /* VM_MAXUSER_ADDRESS */
#endif /* ARM_USE_SMALL_ALLOC */
#define VM_MAX_ADDRESS VM_MAXUSER_ADDRESS
diff --git a/sys/arm/s3c2xx0/s3c24x0_machdep.c b/sys/arm/s3c2xx0/s3c24x0_machdep.c
index 4f14c5c..f531bd0 100644
--- a/sys/arm/s3c2xx0/s3c24x0_machdep.c
+++ b/sys/arm/s3c2xx0/s3c24x0_machdep.c
@@ -118,9 +118,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
@@ -241,8 +238,7 @@ initarm(struct arm_boot_params *abp)
set_cpufuncs();
cpufuncs.cf_sleep = s3c24x0_sleep;
- pcpu_init(pcpup, 0, sizeof(struct pcpu));
- PCPU_SET(curthread, &thread0);
+ pcpu0_init();
/* Do basic tuning, hz etc */
init_param1();
diff --git a/sys/arm/s3c2xx0/std.ln2410sbc b/sys/arm/s3c2xx0/std.ln2410sbc
index 2b53144..d73d62d 100644
--- a/sys/arm/s3c2xx0/std.ln2410sbc
+++ b/sys/arm/s3c2xx0/std.ln2410sbc
@@ -7,4 +7,5 @@ options KERNPHYSADDR=0x30000000
options KERNVIRTADDR=0xc0000000
options PHYSADDR=0x30000000
options STARTUP_PAGETABLE_ADDR=0x30800000
+options NO_EVENTTIMERS
diff --git a/sys/arm/s3c2xx0/std.s3c2410 b/sys/arm/s3c2xx0/std.s3c2410
index 31ec680..8d30f4e 100644
--- a/sys/arm/s3c2xx0/std.s3c2410
+++ b/sys/arm/s3c2xx0/std.s3c2410
@@ -4,3 +4,4 @@ files "../s3c2xx0/files.s3c2xx0"
cpu CPU_ARM9
makeoptions CONF_CFLAGS=-mcpu=arm920t
+options NO_EVENTTIMERS
diff --git a/sys/arm/sa11x0/assabet_machdep.c b/sys/arm/sa11x0/assabet_machdep.c
index 3a3e0f9..96f9888 100644
--- a/sys/arm/sa11x0/assabet_machdep.c
+++ b/sys/arm/sa11x0/assabet_machdep.c
@@ -123,9 +123,6 @@ extern vm_offset_t sa1110_uart_vaddr;
extern vm_offset_t sa1_cache_clean_addr;
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
#ifndef MD_ROOT_SIZE
#define MD_ROOT_SIZE 65535
#endif
@@ -197,7 +194,6 @@ cpu_reset()
void *
initarm(struct arm_boot_params *abp)
{
- struct pcpu *pc;
struct pv_addr kernel_l1pt;
struct pv_addr md_addr;
struct pv_addr md_bla;
@@ -215,9 +211,7 @@ initarm(struct arm_boot_params *abp)
cninit();
set_cpufuncs();
physmem = memsize / PAGE_SIZE;
- pc = &__pcpu;
- pcpu_init(pc, 0, sizeof(struct pcpu));
- PCPU_SET(curthread, &thread0);
+ pcpu0_init();
/* Do basic tuning, hz etc */
init_param1();
diff --git a/sys/arm/sa11x0/std.sa11x0 b/sys/arm/sa11x0/std.sa11x0
index 3cf8465..9cc5289 100644
--- a/sys/arm/sa11x0/std.sa11x0
+++ b/sys/arm/sa11x0/std.sa11x0
@@ -5,3 +5,4 @@ cpu CPU_SA1100
cpu CPU_SA1110
makeoptions KERNPHYSADDR=0xc0000000
makeoptions KERNVIRTADDR=0xc0000000
+options NO_EVENTTIMERS
diff --git a/sys/arm/xscale/i80321/ep80219_machdep.c b/sys/arm/xscale/i80321/ep80219_machdep.c
index fe781cc..ce1a949 100644
--- a/sys/arm/xscale/i80321/ep80219_machdep.c
+++ b/sys/arm/xscale/i80321/ep80219_machdep.c
@@ -115,9 +115,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
diff --git a/sys/arm/xscale/i80321/iq31244_machdep.c b/sys/arm/xscale/i80321/iq31244_machdep.c
index e9f3fae..bd79655 100644
--- a/sys/arm/xscale/i80321/iq31244_machdep.c
+++ b/sys/arm/xscale/i80321/iq31244_machdep.c
@@ -115,9 +115,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
diff --git a/sys/arm/xscale/i8134x/crb_machdep.c b/sys/arm/xscale/i8134x/crb_machdep.c
index 2fa7ca1..831ed5d 100644
--- a/sys/arm/xscale/i8134x/crb_machdep.c
+++ b/sys/arm/xscale/i8134x/crb_machdep.c
@@ -118,9 +118,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
diff --git a/sys/arm/xscale/ixp425/avila_machdep.c b/sys/arm/xscale/ixp425/avila_machdep.c
index 6c87510..b99c3fa 100644
--- a/sys/arm/xscale/ixp425/avila_machdep.c
+++ b/sys/arm/xscale/ixp425/avila_machdep.c
@@ -119,9 +119,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[10];
diff --git a/sys/arm/xscale/pxa/pxa_machdep.c b/sys/arm/xscale/pxa/pxa_machdep.c
index d0e3f0c..a0bba2f 100644
--- a/sys/arm/xscale/pxa/pxa_machdep.c
+++ b/sys/arm/xscale/pxa/pxa_machdep.c
@@ -115,9 +115,6 @@ extern u_int undefined_handler_address;
struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
-struct pcpu __pcpu;
-struct pcpu *pcpup = &__pcpu;
-
/* Physical and virtual addresses for some global pages */
vm_paddr_t phys_avail[PXA2X0_SDRAM_BANKS * 2 + 4];
diff --git a/sys/arm/xscale/std.xscale b/sys/arm/xscale/std.xscale
index 336000b..06e9dc8 100644
--- a/sys/arm/xscale/std.xscale
+++ b/sys/arm/xscale/std.xscale
@@ -1,3 +1,4 @@
# $FreeBSD$
# machine arm armeb
options ARM_CACHE_LOCK_ENABLE
+options NO_EVENTTIMERS
diff --git a/sys/conf/Makefile.arm b/sys/conf/Makefile.arm
index 57d7fb6..32a7311 100644
--- a/sys/conf/Makefile.arm
+++ b/sys/conf/Makefile.arm
@@ -75,7 +75,9 @@ FILES_CPU_FUNC = $S/$M/$M/cpufunc_asm_arm7tdmi.S \
$S/$M/$M/cpufunc_asm_sa1.S $S/$M/$M/cpufunc_asm_arm10.S \
$S/$M/$M/cpufunc_asm_xscale.S $S/$M/$M/cpufunc_asm.S \
$S/$M/$M/cpufunc_asm_xscale_c3.S $S/$M/$M/cpufunc_asm_armv5_ec.S \
- $S/$M/$M/cpufunc_asm_sheeva.S $S/$M/$M/cpufunc_asm_fa526.S
+ $S/$M/$M/cpufunc_asm_fa526.S $S/$M/$M/cpufunc_asm_sheeva.S \
+ $S/$M/$M/cpufunc_asm_pj4b.S $S/$M/$M/cpufunc_asm_armv7.S
+
KERNEL_EXTRA=trampoline
KERNEL_EXTRA_INSTALL=kernel.gz.tramp
trampoline: ${KERNEL_KO}.tramp
diff --git a/sys/conf/files.arm b/sys/conf/files.arm
index d597296..9213f53 100644
--- a/sys/conf/files.arm
+++ b/sys/conf/files.arm
@@ -7,7 +7,8 @@ arm/arm/bcopyinout.S standard
arm/arm/blockio.S standard
arm/arm/bootconfig.c standard
arm/arm/bus_space_asm_generic.S standard
-arm/arm/busdma_machdep.c standard
+arm/arm/busdma_machdep.c optional cpu_arm9 | cpu_arm9e | cpu_fa526 | cpu_sa1100 | cpu_sa1110 | cpu_xscale_80219 | cpu_xscale_80321 | cpu_xscale_81342 | cpu_xscale_ixp425 | cpu_xscale_ixp435 | cpu_xscale_pxa2x0
+arm/arm/busdma_machdep-v6.c optional cpu_arm11 | cpu_cortexa | cpu_mv_pj4b
arm/arm/copystr.S standard
arm/arm/cpufunc.c standard
arm/arm/cpufunc_asm.S standard
@@ -31,8 +32,11 @@ arm/arm/locore.S standard no-obj
arm/arm/machdep.c standard
arm/arm/mem.c optional mem
arm/arm/minidump_machdep.c optional mem
+arm/arm/mp_machdep.c optional smp
arm/arm/nexus.c standard
-arm/arm/pmap.c standard
+arm/arm/pl310.c optional pl310
+arm/arm/pmap.c optional cpu_arm9 | cpu_arm9e | cpu_fa526 | cpu_sa1100 | cpu_sa1110 | cpu_xscale_80219 | cpu_xscale_80321 | cpu_xscale_81342 | cpu_xscale_ixp425 | cpu_xscale_ixp435 | cpu_xscale_pxa2x0
+arm/arm/pmap-v6.c optional cpu_arm11 | cpu_cortexa | cpu_mv_pj4b
arm/arm/setcpsr.S standard
arm/arm/setstack.s standard
arm/arm/stack_machdep.c optional ddb | stack
@@ -44,6 +48,7 @@ arm/arm/uio_machdep.c standard
arm/arm/undefined.c standard
arm/arm/vectors.S standard
arm/arm/vm_machdep.c standard
+arm/arm/vfp.c optional vfp
arm/fpe-arm/armfpe_glue.S optional armfpe
arm/fpe-arm/armfpe_init.c optional armfpe
arm/fpe-arm/armfpe.S optional armfpe
diff --git a/sys/conf/options.arm b/sys/conf/options.arm
index a1cca86..46b10b6 100644
--- a/sys/conf/options.arm
+++ b/sys/conf/options.arm
@@ -3,13 +3,21 @@ ARM9_CACHE_WRITE_THROUGH opt_global.h
ARM_CACHE_LOCK_ENABLE opt_global.h
ARMFPE opt_global.h
ARM_KERN_DIRECTMAP opt_vm.h
+ARM_L2_PIPT opt_global.h
ARM_MANY_BOARD opt_global.h
ARM_USE_SMALL_ALLOC opt_global.h
+ARM_VFP_SUPPORT opt_global.h
+ARM_WANT_TP_ADDRESS opt_global.h
COUNTS_PER_SEC opt_timer.h
-CPU_SA1100 opt_global.h
-CPU_SA1110 opt_global.h
CPU_ARM9 opt_global.h
CPU_ARM9E opt_global.h
+CPU_ARM11 opt_global.h
+CPU_CORTEXA opt_global.h
+CPU_FA526 opt_global.h
+CPU_FA626TE opt_global.h
+CPU_MV_PJ4B opt_global.h
+CPU_SA1100 opt_global.h
+CPU_SA1110 opt_global.h
CPU_XSCALE_80219 opt_global.h
CPU_XSCALE_80321 opt_global.h
CPU_XSCALE_81342 opt_global.h
@@ -17,24 +25,34 @@ CPU_XSCALE_IXP425 opt_global.h
CPU_XSCALE_IXP435 opt_global.h
CPU_XSCALE_PXA2X0 opt_global.h
FLASHADDR opt_global.h
+IPI_IRQ_START opt_smp.h
+IPI_IRQ_END opt_smp.h
FREEBSD_BOOT_LOADER opt_global.h
IXP4XX_FLASH_SIZE opt_global.h
KERNPHYSADDR opt_global.h
KERNVIRTADDR opt_global.h
LINUX_BOOT_ABI opt_global.h
LOADERRAMADDR opt_global.h
+NO_EVENTTIMERS opt_timer.h
PHYSADDR opt_global.h
QEMU_WORKAROUNDS opt_global.h
+SOC_MV_ARMADAXP opt_global.h
SOC_MV_DISCOVERY opt_global.h
+SOC_MV_DOVE opt_global.h
+SOC_MV_FREY opt_global.h
SOC_MV_KIRKWOOD opt_global.h
+SOC_MV_LOKIPLUS opt_global.h
SOC_MV_ORION opt_global.h
+SOC_OMAP3 opt_global.h
+SOC_OMAP4 opt_global.h
+SOC_TI_AM335X opt_global.h
+SOC_TEGRA2 opt_global.h
STARTUP_PAGETABLE_ADDR opt_global.h
XSCALE_CACHE_READ_WRITE_ALLOCATE opt_global.h
XSACLE_DISABLE_CCNT opt_timer.h
VERBOSE_INIT_ARM opt_global.h
+VM_MAXUSER_ADDRESS opt_global.h
AT91_ATE_USE_RMII opt_at91.h
AT91_MCI_HAS_4WIRE opt_at91.h
AT91_MCI_SLOT_B opt_at91.h
AT91C_MAIN_CLOCK opt_at91.h
-CPU_FA526 opt_global.h
-CPU_FA626TE opt_global.h