MFC

author: attilio <attilio@FreeBSD.org> 2013-02-27 18:17:34 +0000
committer: attilio <attilio@FreeBSD.org> 2013-02-27 18:17:34 +0000
commit: 52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf (patch)
tree: d0908474209a17865e044675940a2f62f9ff2493 /sys
parent: c74a3afc6a5d7d1ced989c36d4ba0a7d2bbc43b9 (diff)
download: FreeBSD-src-52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf.zip
FreeBSD-src-52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf.tar.gz
25 files changed, 1064 insertions, 571 deletions
diff --git a/sys/arm/ti/ti_gpio.c b/sys/arm/ti/ti_gpio.c
index 58de516..4edb10e 100644
--- a/sys/arm/ti/ti_gpio.c
+++ b/sys/arm/ti/ti_gpio.c
@@ -653,6 +653,9 @@ ti_gpio_attach(device_t dev)
 	struct ti_gpio_softc *sc = device_get_softc(dev);
 	unsigned int i;
 	int err = 0;
+	int pin;
+	uint32_t flags;
+	uint32_t reg_oe;
 
 	sc->sc_dev = dev;
 
@@ -720,6 +723,17 @@ ti_gpio_attach(device_t dev)
 			/* Disable interrupts for all pins */
 			ti_gpio_write_4(sc, i, TI_GPIO_CLEARIRQENABLE1, 0xffffffff);
 			ti_gpio_write_4(sc, i, TI_GPIO_CLEARIRQENABLE2, 0xffffffff);
+
+			/* Init OE register based on pads configuration */
+			reg_oe = 0xffffffff;
+			for (pin = 0; pin < 32; pin++) {
+				ti_scm_padconf_get_gpioflags(
+				    PINS_PER_BANK*i + pin, &flags);
+				if (flags & GPIO_PIN_OUTPUT)
+					reg_oe &= ~(1U << pin);
+			}
+
+			ti_gpio_write_4(sc, i, TI_GPIO_OE, reg_oe);
 		}
 	}
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index e81dc02..d6651f9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -48,6 +48,14 @@ uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space_map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+
+/*
  * This value defines the number of allowed allocation failures per vdev.
  * If a device reaches this threshold in a given txg then we consider skipping
  * allocations on that device.
@@ -215,9 +223,9 @@ metaslab_compare(const void *x1, const void *x2)
 	/*
 	 * If the weights are identical, use the offset to force uniqueness.
 	 */
-	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 		return (-1);
-	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 		return (1);
 
 	ASSERT3P(m1, ==, m2);
@@ -732,14 +740,15 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	space_map_create(&msp->ms_map, start, size,
+	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
+	space_map_create(msp->ms_map, start, size,
 	    vd->vdev_ashift, &msp->ms_lock);
 
 	metaslab_group_add(mg, msp);
 
 	if (metaslab_debug && smo->smo_object != 0) {
 		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
 		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
 		mutex_exit(&msp->ms_lock);
 	}
@@ -767,22 +776,27 @@ metaslab_fini(metaslab_t *msp)
 	metaslab_group_t *mg = msp->ms_group;
 
 	vdev_space_update(mg->mg_vd,
-	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
+	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_unload(&msp->ms_map);
-	space_map_destroy(&msp->ms_map);
+	space_map_unload(msp->ms_map);
+	space_map_destroy(msp->ms_map);
+	kmem_free(msp->ms_map, sizeof (*msp->ms_map));
 
 	for (int t = 0; t < TXG_SIZE; t++) {
-		space_map_destroy(&msp->ms_allocmap[t]);
-		space_map_destroy(&msp->ms_freemap[t]);
+		space_map_destroy(msp->ms_allocmap[t]);
+		space_map_destroy(msp->ms_freemap[t]);
+		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
+		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
 	}
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
-		space_map_destroy(&msp->ms_defermap[t]);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		space_map_destroy(msp->ms_defermap[t]);
+		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+	}
 
 	ASSERT0(msp->ms_deferspace);
 
@@ -801,7 +815,7 @@ static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
@@ -809,6 +823,16 @@ metaslab_weight(metaslab_t *msp)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
+	 * This vdev is in the process of being removed so there is nothing
+	 * for us to do here.
+	 */
+	if (vd->vdev_removing) {
+		ASSERT0(smo->smo_alloc);
+		ASSERT0(vd->vdev_ms_shift);
+		return (0);
+	}
+
+	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = sm->sm_size - smo->smo_alloc;
@@ -861,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 	 * Prefetch the next potential metaslabs
 	 */
 	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
-		space_map_t *sm = &msp->ms_map;
+		space_map_t *sm = msp->ms_map;
 		space_map_obj_t *smo = &msp->ms_smo;
 
 		/* If we have reached our prefetch limit then we're done */
@@ -882,7 +906,7 @@ static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *sm = msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -899,7 +923,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 				return (error);
 			}
 			for (int t = 0; t < TXG_DEFER_SIZE; t++)
-				space_map_walk(&msp->ms_defermap[t],
+				space_map_walk(msp->ms_defermap[t],
 				    space_map_claim, sm);
 
 		}
@@ -930,12 +954,158 @@ metaslab_passivate(metaslab_t *msp, uint64_t size)
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
-	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
+ * Determine if the in-core space map representation can be condensed on-disk.
+ * We would like to use the following criteria to make our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out our in-core free map.
+ *
+ * 2. The minimal on-disk space map representation is zfs_condense_pct/100
+ * times the size of the in-core representation (e.g. zfs_condense_pct = 110
+ * and in-core = 1MB, minimal = 1.1MB).
+ *
+ * Checking the first condition is tricky since we don't want to walk
+ * the entire AVL tree calculating the estimated on-disk size. Instead we
+ * use the size-ordered AVL tree in the space map and calculate the
+ * size required for the largest segment in our in-core free map. If the
+ * size required to represent that segment on disk is larger than the space
+ * map object then we avoid condensing this map.
+ *
+ * To determine the second criterion we use a best-case estimate and assume
+ * each segment can be represented on-disk as a single 64-bit entry. We refer
+ * to this best-case estimate as the space map's minimal form.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+	space_map_t *sm = msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo_syncing;
+	space_seg_t *ss;
+	uint64_t size, entries, segsz;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(sm->sm_loaded);
+
+	/*
+	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
+	 * the largest segment in the in-core free map. If the tree is
+	 * empty then we should condense the map.
+	 */
+	ss = avl_last(sm->sm_pp_root);
+	if (ss == NULL)
+		return (B_TRUE);
+
+	/*
+	 * Calculate the number of 64-bit entries this segment would
+	 * require when written to disk. If this single segment would be
+	 * larger on-disk than the entire current on-disk structure, then
+	 * clearly condensing will increase the on-disk structure size.
+	 */
+	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
+	entries = size / (MIN(size, SM_RUN_MAX));
+	segsz = entries * sizeof (uint64_t);
+
+	return (segsz <= smo->smo_objsize &&
+	    smo->smo_objsize >= (zfs_condense_pct *
+	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
+}
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed by
+ * the in-core free map.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
+	space_map_t condense_map;
+	space_map_t *sm = msp->ms_map;
+	objset_t *mos = spa_meta_objset(spa);
+	space_map_obj_t *smo = &msp->ms_smo_syncing;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(spa_sync_pass(spa), ==, 1);
+	ASSERT(sm->sm_loaded);
+
+	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
+	    "smo size %llu, segments %lu", txg,
+	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+	    smo->smo_objsize, avl_numnodes(&sm->sm_root));
+
+	/*
+	 * Create a map that is a 100% allocated map. We remove segments
+	 * that have been freed in this txg, any deferred frees that exist,
+	 * and any allocation in the future. Removing segments should be
+	 * a relatively inexpensive operation since we expect these maps to
+	 * have a small number of nodes.
+	 */
+	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
+	    sm->sm_shift, sm->sm_lock);
+	space_map_add(&condense_map, condense_map.sm_start,
+	    condense_map.sm_size);
+
+	/*
+	 * Remove what's been freed in this txg from the condense_map.
+	 * Since we're in sync_pass 1, we know that all the frees from
+	 * this txg are in the freemap.
+	 */
+	space_map_walk(freemap, space_map_remove, &condense_map);
+
+	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		space_map_walk(msp->ms_defermap[t],
+		    space_map_remove, &condense_map);
+
+	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
+		    space_map_remove, &condense_map);
+
+	/*
+	 * We're about to drop the metaslab's lock thus allowing
+	 * other consumers to change its content. Set the
+	 * space_map's sm_condensing flag to ensure that
+	 * allocations on this metaslab do not occur while we're
+	 * in the middle of committing it to disk. This is only critical
+	 * for the ms_map as all other space_maps use per txg
+	 * views of their content.
+	 */
+	sm->sm_condensing = B_TRUE;
+
+	mutex_exit(&msp->ms_lock);
+	space_map_truncate(smo, mos, tx);
+	mutex_enter(&msp->ms_lock);
+
+	/*
+	 * While we would ideally like to create a space_map representation
+	 * that consists only of allocation records, doing so can be
+	 * prohibitively expensive because the in-core free map can be
+	 * large, and therefore computationally expensive to subtract
+	 * from the condense_map. Instead we sync out two maps, a cheap
+	 * allocation only map followed by the in-core free map. While not
+	 * optimal, this is typically close to optimal, and much cheaper to
+	 * compute.
+	 */
+	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
+	space_map_vacate(&condense_map, NULL, NULL);
+	space_map_destroy(&condense_map);
+
+	space_map_sync(sm, SM_FREE, smo, mos, tx);
+	sm->sm_condensing = B_FALSE;
+
+	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
+	    "smo size %llu", txg,
+	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
+	    smo->smo_objsize);
+}
+
+/*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
@@ -944,17 +1114,29 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
-	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
-	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *sm = &msp->ms_map;
+	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
+	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
+	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t *sm = msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo_syncing;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
-	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+	/*
+	 * This metaslab has just been added so there's no work to do now.
+	 */
+	if (*freemap == NULL) {
+		ASSERT3P(allocmap, ==, NULL);
+		return;
+	}
+
+	ASSERT3P(allocmap, !=, NULL);
+	ASSERT3P(*freemap, !=, NULL);
+	ASSERT3P(*freed_map, !=, NULL);
+
+	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
 		return;
 
 	/*
@@ -982,49 +1164,36 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_walk(freemap, space_map_add, freed_map);
-
-	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
-	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
-		/*
-		 * The in-core space map representation is twice as compact
-		 * as the on-disk one, so it's time to condense the latter
-		 * by generating a pure allocmap from first principles.
-		 *
-		 * This metaslab is 100% allocated,
-		 * minus the content of the in-core map (sm),
-		 * minus what's been freed this txg (freed_map),
-		 * minus deferred frees (ms_defermap[]),
-		 * minus allocations from txgs in the future
-		 * (because they haven't been committed yet).
-		 */
-		space_map_vacate(allocmap, NULL, NULL);
-		space_map_vacate(freemap, NULL, NULL);
-
-		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
-
-		space_map_walk(sm, space_map_remove, allocmap);
-		space_map_walk(freed_map, space_map_remove, allocmap);
+	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
+	    metaslab_should_condense(msp)) {
+		metaslab_condense(msp, txg, tx);
+	} else {
+		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
+	}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_walk(&msp->ms_defermap[t],
-			    space_map_remove, allocmap);
+	space_map_vacate(allocmap, NULL, NULL);
 
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
-			    space_map_remove, allocmap);
-
-		mutex_exit(&msp->ms_lock);
-		space_map_truncate(smo, mos, tx);
-		mutex_enter(&msp->ms_lock);
+	/*
+	 * For sync pass 1, we avoid walking the entire space map and
+	 * instead will just swap the pointers for freemap and
+	 * freed_map. We can safely do this since the freed_map is
+	 * guaranteed to be empty on the initial pass.
+	 */
+	if (spa_sync_pass(spa) == 1) {
+		ASSERT0((*freed_map)->sm_space);
+		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
+		space_map_swap(freemap, freed_map);
+	} else {
+		space_map_vacate(*freemap, space_map_add, *freed_map);
 	}
 
-	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
-	space_map_sync(freemap, SM_FREE, smo, mos, tx);
+	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
+	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
 
 	mutex_exit(&msp->ms_lock);
 
-	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	ASSERT3U(db->db_size, >=, sizeof (*smo));
 	bcopy(smo, db->db_data, sizeof (*smo));
@@ -1042,9 +1211,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	space_map_obj_t *smo = &msp->ms_smo;
 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
-	space_map_t *sm = &msp->ms_map;
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
+	space_map_t *sm = msp->ms_map;
+	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	int64_t alloc_delta, defer_delta;
@@ -1055,40 +1224,57 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
-	 * allocmaps and freemaps and add its capacity to the vdev.
+	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
 	 */
-	if (freed_map->sm_size == 0) {
+	if (*freed_map == NULL) {
+		ASSERT(*defer_map == NULL);
 		for (int t = 0; t < TXG_SIZE; t++) {
-			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_allocmap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
-			space_map_create(&msp->ms_freemap[t], sm->sm_start,
+			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_freemap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
 		}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_create(&msp->ms_defermap[t], sm->sm_start,
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
+			    KM_SLEEP);
+			space_map_create(msp->ms_defermap[t], sm->sm_start,
 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+		}
+
+		freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+		defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 
 		vdev_space_update(vd, 0, 0, sm->sm_size);
 	}
 
 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
-	defer_delta = freed_map->sm_space - defer_map->sm_space;
+	defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
 
 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
-	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
-	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
+	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
 
 	/*
 	 * If there's a space_map_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
-	 * Then, add defer_map (oldest deferred frees) to this map and
-	 * transfer freed_map (this txg's frees) to defer_map.
 	 */
 	space_map_load_wait(sm);
-	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
-	space_map_vacate(freed_map, space_map_add, defer_map);
+
+	/*
+	 * Move the frees from the defer_map to this map (if it's loaded).
+	 * Swap the freed_map and the defer_map -- this is safe to do
+	 * because we've just emptied out the defer_map.
+	 */
+	space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
+	ASSERT0((*defer_map)->sm_space);
+	ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
+	space_map_swap(freed_map, defer_map);
 
 	*smo = *smosync;
 
@@ -1112,7 +1298,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 		int evictable = 1;
 
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
 				evictable = 0;
 
 		if (evictable && !metaslab_debug)
@@ -1137,7 +1323,7 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
-		if (msp->ms_map.sm_start > mg->mg_bonus_area)
+		if (msp->ms_map->sm_start > mg->mg_bonus_area)
 			break;
 
 		mutex_enter(&msp->ms_lock);
@@ -1158,7 +1344,7 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
-	uint64_t start = msp->ms_map.sm_start >> ms_shift;
+	uint64_t start = msp->ms_map->sm_start >> ms_shift;
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (1ULL << 63);
@@ -1206,6 +1392,13 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
+
+			/*
+			 * If the selected metaslab is condensing, skip it.
+			 */
+			if (msp->ms_map->sm_condensing)
+				continue;
+
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@@ -1271,20 +1464,30 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
+		/*
+		 * If this metaslab is currently condensing then pick again as
+		 * we can't manipulate this metaslab until it's committed
+		 * to disk.
+		 */
+		if (msp->ms_map->sm_condensing) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
 			break;
 
 		atomic_inc_64(&mg->mg_alloc_failures);
 
-		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
+		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
 	}
 
-	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
+	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1516,13 +1719,13 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
 	mutex_enter(&msp->ms_lock);
 
 	if (now) {
-		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
 		    offset, size);
-		space_map_free(&msp->ms_map, offset, size);
+		space_map_free(msp->ms_map, offset, size);
 	} else {
-		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1557,10 +1760,10 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 
 	mutex_enter(&msp->ms_lock);
 
-	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
 		error = ENOENT;
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
@@ -1568,12 +1771,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 		return (error);
 	}
 
-	space_map_claim(&msp->ms_map, offset, size);
+	space_map_claim(msp->ms_map, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
-		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index bebb0f3..190fefe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -114,6 +114,7 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
 	int merge_before, merge_after;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(!sm->sm_condensing);
 	VERIFY(size != 0);
 	VERIFY3U(start, >=, sm->sm_start);
 	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
@@ -198,6 +199,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 	int left_over, right_over;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(!sm->sm_condensing);
 	VERIFY(size != 0);
 	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
@@ -267,6 +269,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 }
 
 void
+space_map_swap(space_map_t **msrc, space_map_t **mdst)
+{
+	space_map_t *sm;
+
+	ASSERT(MUTEX_HELD((*msrc)->sm_lock));
+	ASSERT0((*mdst)->sm_space);
+	ASSERT0(avl_numnodes(&(*mdst)->sm_root));
+
+	sm = *msrc;
+	*msrc = *mdst;
+	*mdst = sm;
+}
+
+void
 space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
 	space_seg_t *ss;
@@ -447,9 +463,9 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
 	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
-	void *cookie = NULL;
+	avl_tree_t *t = &sm->sm_root;
 	space_seg_t *ss;
-	uint64_t bufsize, start, size, run_len, delta, sm_space;
+	uint64_t bufsize, start, size, run_len, total, sm_space, nodes;
 	uint64_t *entry, *entry_map, *entry_map_end;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -478,13 +494,14 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
 	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
-	delta = 0;
+	total = 0;
+	nodes = avl_numnodes(&sm->sm_root);
 	sm_space = sm->sm_space;
-	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
 		size = ss->ss_end - ss->ss_start;
 		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 
-		delta += size;
+		total += size;
 		size >>= sm->sm_shift;
 
 		while (size) {
@@ -506,7 +523,6 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
 			start += run_len;
 			size -= run_len;
 		}
-		kmem_cache_free(space_seg_cache, ss);
 	}
 
 	if (entry != entry_map) {
@@ -522,12 +538,11 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
 	 * Ensure that the space_map's accounting wasn't changed
 	 * while we were in the middle of writing it out.
 	 */
+	VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root));
 	VERIFY3U(sm->sm_space, ==, sm_space);
+	VERIFY3U(sm->sm_space, ==, total);
 
 	zio_buf_free(entry_map, bufsize);
-
-	sm->sm_space -= delta;
-	VERIFY0(sm->sm_space);
 }
 
 void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index f1f1b38..138e14e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -66,20 +66,38 @@ struct metaslab_group {
 };
 
 /*
- * Each metaslab's free space is tracked in space map object in the MOS,
- * which is only updated in syncing context.  Each time we sync a txg,
+ * Each metaslab maintains an in-core free map (ms_map) that contains the
+ * current list of free segments. As blocks are allocated, the allocated
+ * segment is removed from the ms_map and added to a per txg allocation map.
+ * As blocks are freed, they are added to the per txg free map. These per
+ * txg maps allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps.
+ *
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
  * we append the allocs and frees from that txg to the space map object.
  * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing.  Everything in ms_smo is always safe to allocate.
+ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ *
+ * To load the in-core free map we read the space map object from disk.
+ * This object contains a series of alloc and free records that are
+ * combined to make up the list of all free segments in this metaslab. These
+ * segments are represented in-core by the ms_map and are stored in an
+ * AVL tree.
+ *
+ * As the space map object grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the space map object is
+ * zfs_condense_pct/100 times the size of the minimal on-disk representation,
+ * we rewrite it in its minimized form.
  */
 struct metaslab {
 	kmutex_t	ms_lock;	/* metaslab lock		*/
 	space_map_obj_t	ms_smo;		/* synced space map object	*/
 	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
-	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
-	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
-	space_map_t	ms_defermap[TXG_DEFER_SIZE]; /* deferred frees	*/
-	space_map_t	ms_map;		/* in-core free space map	*/
+	space_map_t	*ms_allocmap[TXG_SIZE];	/* allocated this txg	*/
+	space_map_t	*ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	space_map_t	*ms_defermap[TXG_DEFER_SIZE];	/* deferred frees */
+	space_map_t	*ms_map;	/* in-core free space map	*/
 	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
 	metaslab_group_t *ms_group;	/* metaslab group		*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
index 463b6bb..2da50fb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -40,17 +40,17 @@ extern "C" {
 typedef struct space_map_ops space_map_ops_t;
 
 typedef struct space_map {
-	avl_tree_t	sm_root;	/* AVL tree of map segments */
+	avl_tree_t	sm_root;	/* offset-ordered segment AVL tree */
 	uint64_t	sm_space;	/* sum of all segments in the map */
 	uint64_t	sm_start;	/* start of map */
 	uint64_t	sm_size;	/* size of map */
 	uint8_t		sm_shift;	/* unit shift */
-	uint8_t		sm_pad[3];	/* unused */
 	uint8_t		sm_loaded;	/* map loaded? */
 	uint8_t		sm_loading;	/* map loading? */
+	uint8_t		sm_condensing;	/* map condensing? */
 	kcondvar_t	sm_load_cv;	/* map load completion */
 	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
-	avl_tree_t	*sm_pp_root;	/* picker-private AVL tree */
+	avl_tree_t	*sm_pp_root;	/* size-ordered, picker-private tree */
 	void		*sm_ppd;	/* picker-private data */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
@@ -149,6 +149,7 @@ extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
 extern boolean_t space_map_contains(space_map_t *sm,
     uint64_t start, uint64_t size);
+extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
 extern void space_map_walk(space_map_t *sm,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index 59b461b..be5b0bf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -1847,6 +1847,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 
 	space_map_truncate(smo, mos, tx);
 	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+	space_map_vacate(&smsync, NULL, NULL);
 
 	space_map_destroy(&smsync);
 
diff --git a/sys/dev/ath/ath_hal/ah.c b/sys/dev/ath/ath_hal/ah.c
index d1ce7a8..551c225 100644
--- a/sys/dev/ath/ath_hal/ah.c
+++ b/sys/dev/ath/ath_hal/ah.c
@@ -692,6 +692,10 @@ ath_hal_getcapability(struct ath_hal *ah, HAL_CAPABILITY_TYPE type,
 		return pCap->hal4AddrAggrSupport ? HAL_OK : HAL_ENOTSUPP;
 	case HAL_CAP_EXT_CHAN_DFS:
 		return pCap->halExtChanDfsSupport ? HAL_OK : HAL_ENOTSUPP;
+	case HAL_CAP_RX_STBC:
+		return pCap->halRxStbcSupport ? HAL_OK : HAL_ENOTSUPP;
+	case HAL_CAP_TX_STBC:
+		return pCap->halTxStbcSupport ? HAL_OK : HAL_ENOTSUPP;
 	case HAL_CAP_COMBINED_RADAR_RSSI:
 		return pCap->halUseCombinedRadarRssi ? HAL_OK : HAL_ENOTSUPP;
 	case HAL_CAP_AUTO_SLEEP:
diff --git a/sys/dev/ath/ath_hal/ah.h b/sys/dev/ath/ath_hal/ah.h
index 0e3d5ab..ca2e7ca 100644
--- a/sys/dev/ath/ath_hal/ah.h
+++ b/sys/dev/ath/ath_hal/ah.h
@@ -137,6 +137,9 @@ typedef enum {
 	HAL_CAP_RIFS_RX_ENABLED	= 53,
 	HAL_CAP_BB_DFS_HANG	= 54,
 
+	HAL_CAP_RX_STBC		= 58,
+	HAL_CAP_TX_STBC		= 59,
+
 	HAL_CAP_BT_COEX		= 60,	/* hardware is capable of bluetooth coexistence */
 	HAL_CAP_DYNAMIC_SMPS	= 61,	/* Dynamic MIMO Power Save hardware support */
 
diff --git a/sys/dev/ath/ath_rate/sample/sample.c b/sys/dev/ath/ath_rate/sample/sample.c
index a7d6af6..b3f82fa 100644
--- a/sys/dev/ath/ath_rate/sample/sample.c
+++ b/sys/dev/ath/ath_rate/sample/sample.c
@@ -708,71 +708,6 @@ ath_rate_setupxtxdesc(struct ath_softc *sc, struct ath_node *an,
 	    s3code, sched->t3);		/* series 3 */
 }
 
-/*
- * Update the EWMA percentage.
- *
- * This is a simple hack to track an EWMA based on the current
- * rate scenario. For the rate codes which failed, this will
- * record a 0% against it. For the rate code which succeeded,
- * EWMA will record the nbad*100/nframes percentage against it.
- */
-static void
-update_ewma_stats(struct ath_softc *sc, struct ath_node *an,
-    int frame_size,
-    int rix0, int tries0,
-    int rix1, int tries1,
-    int rix2, int tries2,
-    int rix3, int tries3,
-    int short_tries, int tries, int status,
-    int nframes, int nbad)
-{
-	struct sample_node *sn = ATH_NODE_SAMPLE(an);
-	struct sample_softc *ssc = ATH_SOFTC_SAMPLE(sc);
-	const int size_bin = size_to_bin(frame_size);
-	int tries_so_far;
-	int pct;
-	int rix = rix0;
-
-	/* Calculate percentage based on current rate */
-	if (nframes == 0)
-		nframes = nbad = 1;
-	pct = ((nframes - nbad) * 1000) / nframes;
-
-	/* Figure out which rate index succeeded */
-	tries_so_far = tries0;
-
-	if (tries1 && tries_so_far < tries) {
-		tries_so_far += tries1;
-		rix = rix1;
-		/* XXX bump ewma pct */
-	}
-
-	if (tries2 && tries_so_far < tries) {
-		tries_so_far += tries2;
-		rix = rix2;
-		/* XXX bump ewma pct */
-	}
-
-	if (tries3 && tries_so_far < tries) {
-		rix = rix3;
-		/* XXX bump ewma pct */
-	}
-
-	/* rix is the successful rate, update EWMA for final rix */
-	if (sn->stats[size_bin][rix].total_packets <
-	    ssc->smoothing_minpackets) {
-		/* just average the first few packets */
-		int a_pct = (sn->stats[size_bin][rix].packets_acked * 1000) /
-		    (sn->stats[size_bin][rix].total_packets);
-		sn->stats[size_bin][rix].ewma_pct = a_pct;
-	} else {
-		/* use a ewma */
-		sn->stats[size_bin][rix].ewma_pct =
-			((sn->stats[size_bin][rix].ewma_pct * ssc->smoothing_rate) +
-			 (pct * (100 - ssc->smoothing_rate))) / 100;
-	}
-}
-
 static void
 update_stats(struct ath_softc *sc, struct ath_node *an, 
 		  int frame_size,
@@ -792,6 +727,7 @@ update_stats(struct ath_softc *sc, struct ath_node *an,
 	const int size = bin_to_size(size_bin);
 	int tt, tries_so_far;
 	int is_ht40 = (an->an_node.ni_chw == 40);
+	int pct;
 
 	if (!IS_RATE_DEFINED(sn, rix0))
 		return;
@@ -865,6 +801,27 @@ update_stats(struct ath_softc *sc, struct ath_node *an,
 	sn->stats[size_bin][rix0].last_tx = ticks;
 	sn->stats[size_bin][rix0].total_packets += nframes;
 
+	/* update EWMA for this rix */
+
+	/* Calculate percentage based on current rate */
+	if (nframes == 0)
+		nframes = nbad = 1;
+	pct = ((nframes - nbad) * 1000) / nframes;
+
+	if (sn->stats[size_bin][rix0].total_packets <
+	    ssc->smoothing_minpackets) {
+		/* just average the first few packets */
+		int a_pct = (sn->stats[size_bin][rix0].packets_acked * 1000) /
+		    (sn->stats[size_bin][rix0].total_packets);
+		sn->stats[size_bin][rix0].ewma_pct = a_pct;
+	} else {
+		/* use a ewma */
+		sn->stats[size_bin][rix0].ewma_pct =
+			((sn->stats[size_bin][rix0].ewma_pct * ssc->smoothing_rate) +
+			 (pct * (100 - ssc->smoothing_rate))) / 100;
+	}
+
+
 	if (rix0 == sn->current_sample_rix[size_bin]) {
 		IEEE80211_NOTE(an->an_node.ni_vap, IEEE80211_MSG_RATECTL,
 		   &an->an_node,
@@ -907,6 +864,11 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 	short_tries = ts->ts_shortretry;
 	long_tries = ts->ts_longretry + 1;
 
+	if (nframes == 0) {
+		device_printf(sc->sc_dev, "%s: nframes=0?\n", __func__);
+		return;
+	}
+
 	if (frame_size == 0)		    /* NB: should not happen */
 		frame_size = 1500;
 
@@ -950,13 +912,6 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 			     0, 0,
 			     short_tries, long_tries, status,
 			     nframes, nbad);
-		update_ewma_stats(sc, an, frame_size, 
-			     final_rix, long_tries,
-			     0, 0,
-			     0, 0,
-			     0, 0,
-			     short_tries, long_tries, status,
-			     nframes, nbad);
 
 	} else {
 		int finalTSIdx = ts->ts_finaltsi;
@@ -1008,15 +963,6 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 				     short_tries, long_tries,
 				     long_tries > rc[0].tries,
 				     nframes, nbad);
-			update_ewma_stats(sc, an, frame_size,
-				     rc[0].rix, rc[0].tries,
-				     rc[1].rix, rc[1].tries,
-				     rc[2].rix, rc[2].tries,
-				     rc[3].rix, rc[3].tries,
-				     short_tries, long_tries,
-				     long_tries > rc[0].tries,
-				     nframes, nbad);
-
 			long_tries -= rc[0].tries;
 		}
 		
@@ -1029,14 +975,6 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 				     short_tries, long_tries,
 				     status,
 				     nframes, nbad);
-			update_ewma_stats(sc, an, frame_size,
-				     rc[1].rix, rc[1].tries,
-				     rc[2].rix, rc[2].tries,
-				     rc[3].rix, rc[3].tries,
-				     0, 0,
-				     short_tries, long_tries,
-				     status,
-				     nframes, nbad);
 			long_tries -= rc[1].tries;
 		}
 
@@ -1049,14 +987,6 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 				     short_tries, long_tries,
 				     status,
 				     nframes, nbad);
-			update_ewma_stats(sc, an, frame_size,
-				     rc[2].rix, rc[2].tries,
-				     rc[3].rix, rc[3].tries,
-				     0, 0,
-				     0, 0,
-				     short_tries, long_tries,
-				     status,
-				     nframes, nbad);
 			long_tries -= rc[2].tries;
 		}
 
@@ -1069,14 +999,6 @@ ath_rate_tx_complete(struct ath_softc *sc, struct ath_node *an,
 				     short_tries, long_tries,
 				     status,
 				     nframes, nbad);
-			update_ewma_stats(sc, an, frame_size,
-				     rc[3].rix, rc[3].tries,
-				     0, 0,
-				     0, 0,
-				     0, 0,
-				     short_tries, long_tries,
-				     status,
-				     nframes, nbad);
 		}
 	}
 }
diff --git a/sys/dev/ath/if_ath.c b/sys/dev/ath/if_ath.c
index a614d6f..fd1a7c3 100644
--- a/sys/dev/ath/if_ath.c
+++ b/sys/dev/ath/if_ath.c
@@ -781,6 +781,28 @@ ath_attach(u_int16_t devid, struct ath_softc *sc)
 		ic->ic_txstream = txs;
 		ic->ic_rxstream = rxs;
 
+		/*
+		 * Setup TX and RX STBC based on what the HAL allows and
+		 * the currently configured chainmask set.
+		 * Ie - don't enable STBC TX if only one chain is enabled.
+		 * STBC RX is fine on a single RX chain; it just won't
+		 * provide any real benefit.
+		 */
+		if (ath_hal_getcapability(ah, HAL_CAP_RX_STBC, 0,
+		    NULL) == HAL_OK) {
+			sc->sc_rx_stbc = 1;
+			device_printf(sc->sc_dev,
+			    "[HT] 1 stream STBC receive enabled\n");
+			ic->ic_htcaps |= IEEE80211_HTCAP_RXSTBC_1STREAM;
+		}
+		if (txs > 1 && ath_hal_getcapability(ah, HAL_CAP_TX_STBC, 0,
+		    NULL) == HAL_OK) {
+			sc->sc_tx_stbc = 1;
+			device_printf(sc->sc_dev,
+			    "[HT] 1 stream STBC transmit enabled\n");
+			ic->ic_htcaps |= IEEE80211_HTCAP_TXSTBC;
+		}
+
 		(void) ath_hal_getcapability(ah, HAL_CAP_RTS_AGGR_LIMIT, 1,
 		    &sc->sc_rts_aggr_limit);
 		if (sc->sc_rts_aggr_limit != (64 * 1024))
diff --git a/sys/dev/ath/if_ath_tx_ht.c b/sys/dev/ath/if_ath_tx_ht.c
index c0e72ac..d382f8f 100644
--- a/sys/dev/ath/if_ath_tx_ht.c
+++ b/sys/dev/ath/if_ath_tx_ht.c
@@ -536,16 +536,29 @@ ath_rateseries_setup(struct ath_softc *sc, struct ieee80211_node *ni,
 			series[i].RateFlags |= HAL_RATESERIES_HALFGI;
 
 		/*
-		 * XXX TODO: STBC if it's possible
+		 * Setup rate and TX power cap for this series.
 		 */
+		series[i].Rate = rt->info[rc[i].rix].rateCode;
+		series[i].RateIndex = rc[i].rix;
+		series[i].tx_power_cap = 0x3f;	/* XXX for now */
+
 
 		/*
-		 * XXX TODO: LDPC if it's possible
+		 * If we have STBC TX enabled and the receiver
+		 * can receive (at least) 1 stream STBC, AND it's
+		 * MCS 0-7, AND we have at least two chains enabled,
+		 * enable STBC.
 		 */
+		if (ic->ic_htcaps & IEEE80211_HTCAP_TXSTBC &&
+		    ni->ni_htcap & IEEE80211_HTCAP_RXSTBC_1STREAM &&
+		    (sc->sc_cur_txchainmask > 1) &&
+		    HT_RC_2_STREAMS(series[i].Rate) == 1) {
+			series[i].RateFlags |= HAL_RATESERIES_STBC;
+		}
 
-		series[i].Rate = rt->info[rc[i].rix].rateCode;
-		series[i].RateIndex = rc[i].rix;
-		series[i].tx_power_cap = 0x3f;	/* XXX for now */
+		/*
+		 * XXX TODO: LDPC if it's possible
+		 */
 
 		/*
 		 * PktDuration doesn't include slot, ACK, RTS, etc timing -
diff --git a/sys/dev/ath/if_athvar.h b/sys/dev/ath/if_athvar.h
index e8fdeff..42442de 100644
--- a/sys/dev/ath/if_athvar.h
+++ b/sys/dev/ath/if_athvar.h
@@ -567,7 +567,9 @@ struct ath_softc {
 	/*
 	 * Second set of flags.
 	 */
-	u_int32_t		sc_use_ent  : 1;
+	u_int32_t		sc_use_ent  : 1,
+				sc_rx_stbc  : 1,
+				sc_tx_stbc  : 1;
 
 	/*
 	 * Enterprise mode configuration for AR9380 and later chipsets.
diff --git a/sys/dev/mfi/mfi.c b/sys/dev/mfi/mfi.c
index ed759fc..e799b9d 100644
--- a/sys/dev/mfi/mfi.c
+++ b/sys/dev/mfi/mfi.c
@@ -108,6 +108,7 @@ static void	mfi_bio_complete(struct mfi_command *);
 static struct mfi_command *mfi_build_ldio(struct mfi_softc *,struct bio*);
 static struct mfi_command *mfi_build_syspdio(struct mfi_softc *,struct bio*);
 static int	mfi_send_frame(struct mfi_softc *, struct mfi_command *);
+static int	mfi_std_send_frame(struct mfi_softc *, struct mfi_command *);
 static int	mfi_abort(struct mfi_softc *, struct mfi_command **);
 static int	mfi_linux_ioctl_int(struct cdev *, u_long, caddr_t, int, struct thread *);
 static void	mfi_timeout(void *);
@@ -132,24 +133,30 @@ static int mfi_check_for_sscd(struct mfi_softc *sc, struct mfi_command *cm);
 SYSCTL_NODE(_hw, OID_AUTO, mfi, CTLFLAG_RD, 0, "MFI driver parameters");
 static int	mfi_event_locale = MFI_EVT_LOCALE_ALL;
 TUNABLE_INT("hw.mfi.event_locale", &mfi_event_locale);
-SYSCTL_INT(_hw_mfi, OID_AUTO, event_locale, CTLFLAG_RW, &mfi_event_locale,
-            0, "event message locale");
+SYSCTL_INT(_hw_mfi, OID_AUTO, event_locale, CTLFLAG_RWTUN, &mfi_event_locale,
+           0, "event message locale");
 
 static int	mfi_event_class = MFI_EVT_CLASS_INFO;
 TUNABLE_INT("hw.mfi.event_class", &mfi_event_class);
-SYSCTL_INT(_hw_mfi, OID_AUTO, event_class, CTLFLAG_RW, &mfi_event_class,
-          0, "event message class");
+SYSCTL_INT(_hw_mfi, OID_AUTO, event_class, CTLFLAG_RWTUN, &mfi_event_class,
+           0, "event message class");
 
 static int	mfi_max_cmds = 128;
 TUNABLE_INT("hw.mfi.max_cmds", &mfi_max_cmds);
-SYSCTL_INT(_hw_mfi, OID_AUTO, max_cmds, CTLFLAG_RD, &mfi_max_cmds,
-	   0, "Max commands");
+SYSCTL_INT(_hw_mfi, OID_AUTO, max_cmds, CTLFLAG_RDTUN, &mfi_max_cmds,
+	   0, "Max commands limit (-1 = controller limit)");
 
 static int	mfi_detect_jbod_change = 1;
 TUNABLE_INT("hw.mfi.detect_jbod_change", &mfi_detect_jbod_change);
-SYSCTL_INT(_hw_mfi, OID_AUTO, detect_jbod_change, CTLFLAG_RW,
+SYSCTL_INT(_hw_mfi, OID_AUTO, detect_jbod_change, CTLFLAG_RWTUN,
 	   &mfi_detect_jbod_change, 0, "Detect a change to a JBOD");
 
+int		mfi_polled_cmd_timeout = MFI_POLL_TIMEOUT_SECS;
+TUNABLE_INT("hw.mfi.polled_cmd_timeout", &mfi_polled_cmd_timeout);
+SYSCTL_INT(_hw_mfi, OID_AUTO, polled_cmd_timeout, CTLFLAG_RWTUN,
+	   &mfi_polled_cmd_timeout, 0,
+	   "Polled command timeout - used for firmware flash etc (in seconds)");
+
 /* Management interface */
 static d_open_t		mfi_open;
 static d_close_t	mfi_close;
@@ -361,7 +368,7 @@ mfi_attach(struct mfi_softc *sc)
 {
 	uint32_t status;
 	int error, commsz, framessz, sensesz;
-	int frames, unit, max_fw_sge;
+	int frames, unit, max_fw_sge, max_fw_cmds;
 	uint32_t tb_mem_size = 0;
 
 	if (sc == NULL)
@@ -456,7 +463,14 @@ mfi_attach(struct mfi_softc *sc)
 	 * instead of compile time.
 	 */
 	status = sc->mfi_read_fw_status(sc);
-	sc->mfi_max_fw_cmds = status & MFI_FWSTATE_MAXCMD_MASK;
+	max_fw_cmds = status & MFI_FWSTATE_MAXCMD_MASK;
+	if (mfi_max_cmds > 0 && mfi_max_cmds < max_fw_cmds) {
+		device_printf(sc->mfi_dev, "FW MaxCmds = %d, limiting to %d\n",
+		    max_fw_cmds, mfi_max_cmds);
+		sc->mfi_max_fw_cmds = mfi_max_cmds;
+	} else {
+		sc->mfi_max_fw_cmds = max_fw_cmds;
+	}
 	max_fw_sge = (status & MFI_FWSTATE_MAXSGL_MASK) >> 16;
 	sc->mfi_max_sge = min(max_fw_sge, ((MFI_MAXPHYS / PAGE_SIZE) + 1));
 
@@ -464,7 +478,8 @@ mfi_attach(struct mfi_softc *sc)
 
 	if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
 		mfi_tbolt_init_globals(sc);
-		device_printf(sc->mfi_dev, "MaxCmd = %x MaxSgl = %x state = %x \n",
+		device_printf(sc->mfi_dev, "MaxCmd = %d, Drv MaxCmd = %d, "
+		    "MaxSgl = %d, state = %#x\n", max_fw_cmds,
 		    sc->mfi_max_fw_cmds, sc->mfi_max_sge, status);
 		tb_mem_size = mfi_tbolt_get_memory_requirement(sc);
 
@@ -503,8 +518,8 @@ mfi_attach(struct mfi_softc *sc)
 				0,			/* flags */
 				NULL, NULL,		/* lockfunc, lockarg */
 				&sc->mfi_tb_init_dmat)) {
-		device_printf(sc->mfi_dev, "Cannot allocate init DMA tag\n");
-		return (ENOMEM);
+			device_printf(sc->mfi_dev, "Cannot allocate init DMA tag\n");
+			return (ENOMEM);
 		}
 		if (bus_dmamem_alloc(sc->mfi_tb_init_dmat, (void **)&sc->mfi_tb_init,
 		    BUS_DMA_NOWAIT, &sc->mfi_tb_init_dmamap)) {
@@ -683,11 +698,14 @@ mfi_attach(struct mfi_softc *sc)
 	/* ThunderBolt MFI_IOC2 INIT */
 	if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
 		sc->mfi_disable_intr(sc);
+		mtx_lock(&sc->mfi_io_lock);
 		if ((error = mfi_tbolt_init_MFI_queue(sc)) != 0) {
 			device_printf(sc->mfi_dev,
 			    "TB Init has failed with error %d\n",error);
+			mtx_unlock(&sc->mfi_io_lock);
 			return error;
 		}
+		mtx_unlock(&sc->mfi_io_lock);
 
 		if ((error = mfi_tbolt_alloc_cmd(sc)) != 0)
 			return error;
@@ -723,10 +741,12 @@ mfi_attach(struct mfi_softc *sc)
 		    "hook\n");
 		return (EINVAL);
 	}
+	mtx_lock(&sc->mfi_io_lock);
 	if ((error = mfi_aen_setup(sc, 0), 0) != 0) {
 		mtx_unlock(&sc->mfi_io_lock);
 		return (error);
 	}
+	mtx_unlock(&sc->mfi_io_lock);
 
 	/*
 	 * Register a shutdown handler.
@@ -766,7 +786,9 @@ mfi_attach(struct mfi_softc *sc)
 	    mfi_timeout, sc);
 
 	if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
+		mtx_lock(&sc->mfi_io_lock);
 		mfi_tbolt_sync_map_info(sc);
+		mtx_unlock(&sc->mfi_io_lock);
 	}
 
 	return (0);
@@ -776,21 +798,16 @@ static int
 mfi_alloc_commands(struct mfi_softc *sc)
 {
 	struct mfi_command *cm;
-	int i, ncmds;
+	int i, j;
 
 	/*
 	 * XXX Should we allocate all the commands up front, or allocate on
 	 * demand later like 'aac' does?
 	 */
-	ncmds = MIN(mfi_max_cmds, sc->mfi_max_fw_cmds);
-	if (bootverbose)
-		device_printf(sc->mfi_dev, "Max fw cmds= %d, sizing driver "
-		   "pool to %d\n", sc->mfi_max_fw_cmds, ncmds);
-
-	sc->mfi_commands = malloc(sizeof(struct mfi_command) * ncmds, M_MFIBUF,
-	    M_WAITOK | M_ZERO);
+	sc->mfi_commands = malloc(sizeof(sc->mfi_commands[0]) *
+	    sc->mfi_max_fw_cmds, M_MFIBUF, M_WAITOK | M_ZERO);
 
-	for (i = 0; i < ncmds; i++) {
+	for (i = 0; i < sc->mfi_max_fw_cmds; i++) {
 		cm = &sc->mfi_commands[i];
 		cm->cm_frame = (union mfi_frame *)((uintptr_t)sc->mfi_frames +
 		    sc->mfi_cmd_size * i);
@@ -806,10 +823,20 @@ mfi_alloc_commands(struct mfi_softc *sc)
 			mtx_lock(&sc->mfi_io_lock);
 			mfi_release_command(cm);
 			mtx_unlock(&sc->mfi_io_lock);
+		} else {
+			device_printf(sc->mfi_dev, "Failed to allocate %d "
+			   "command blocks, only allocated %d\n",
+			    sc->mfi_max_fw_cmds, i);
+			/* Undo earlier iterations: destroy maps 0..i-1. */
+			for (j = 0; j < i; j++) {
+				cm = &sc->mfi_commands[j];
+				bus_dmamap_destroy(sc->mfi_buffer_dmat,
+				    cm->cm_dmamap);
+			}
+			free(sc->mfi_commands, M_MFIBUF);
+			sc->mfi_commands = NULL;
+			return (ENOMEM);
 		}
-		else
-			break;
-		sc->mfi_total_cmds++;
 	}
 
 	return (0);
@@ -834,6 +861,29 @@ mfi_release_command(struct mfi_command *cm)
 		cm->cm_sg->sg32[0].addr = 0;
 	}
 
+	/*
+	 * Command may be on other queues e.g. busy queue depending on the
+	 * flow of a previous call to mfi_mapcmd, so ensure it's dequeued
+	 * properly.
+	 */
+	if ((cm->cm_flags & MFI_ON_MFIQ_BUSY) != 0)
+		mfi_remove_busy(cm);
+	if ((cm->cm_flags & MFI_ON_MFIQ_READY) != 0)
+		mfi_remove_ready(cm);
+
+	/* We're not expecting it to be on any other queue but check */
+	if ((cm->cm_flags & MFI_ON_MFIQ_MASK) != 0) {
+		panic("Command %p is still on another queue, flags = %#x",
+		    cm, cm->cm_flags);
+	}
+
+	/* tbolt cleanup */
+	if ((cm->cm_flags & MFI_CMD_TBOLT) != 0) {
+		mfi_tbolt_return_cmd(cm->cm_sc,
+		    cm->cm_sc->mfi_cmd_pool_tbolt[cm->cm_extra_frames - 1],
+		    cm);
+	}
+
 	hdr_data = (uint32_t *)cm->cm_frame;
 	hdr_data[0] = 0;	/* cmd, sense_len, cmd_status, scsi_status */
 	hdr_data[1] = 0;	/* target_id, lun_id, cdb_len, sg_count */
@@ -916,8 +966,10 @@ mfi_comms_init(struct mfi_softc *sc)
 	uint32_t context = 0;
 
 	mtx_lock(&sc->mfi_io_lock);
-	if ((cm = mfi_dequeue_free(sc)) == NULL)
+	if ((cm = mfi_dequeue_free(sc)) == NULL) {
+		mtx_unlock(&sc->mfi_io_lock);
 		return (EBUSY);
+	}
 
 	/* Zero out the MFI frame */
 	context = cm->cm_frame->header.context;
@@ -946,15 +998,12 @@ mfi_comms_init(struct mfi_softc *sc)
 	cm->cm_data = NULL;
 	cm->cm_flags = MFI_CMD_POLLED;
 
-	if ((error = mfi_mapcmd(sc, cm)) != 0) {
+	if ((error = mfi_mapcmd(sc, cm)) != 0)
 		device_printf(sc->mfi_dev, "failed to send init command\n");
-		mtx_unlock(&sc->mfi_io_lock);
-		return (error);
-	}
 	mfi_release_command(cm);
 	mtx_unlock(&sc->mfi_io_lock);
 
-	return (0);
+	return (error);
 }
 
 static int
@@ -1005,7 +1054,7 @@ mfi_get_log_state(struct mfi_softc *sc, struct mfi_evt_log_state **log_state)
 	struct mfi_command *cm = NULL;
 	int error;
 
-	mtx_lock(&sc->mfi_io_lock);
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 	error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_GETINFO,
 	    (void **)log_state, sizeof(**log_state));
 	if (error)
@@ -1024,7 +1073,6 @@ mfi_get_log_state(struct mfi_softc *sc, struct mfi_evt_log_state **log_state)
 out:
 	if (cm)
 		mfi_release_command(cm);
-	mtx_unlock(&sc->mfi_io_lock);
 
 	return (error);
 }
@@ -1037,32 +1085,32 @@ mfi_aen_setup(struct mfi_softc *sc, uint32_t seq_start)
 	int error = 0;
 	uint32_t seq;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	class_locale.members.reserved = 0;
 	class_locale.members.locale = mfi_event_locale;
 	class_locale.members.evt_class  = mfi_event_class;
 
 	if (seq_start == 0) {
-		error = mfi_get_log_state(sc, &log_state);
+		if ((error = mfi_get_log_state(sc, &log_state)) != 0)
+			goto out;
 		sc->mfi_boot_seq_num = log_state->boot_seq_num;
-		if (error) {
-			if (log_state)
-				free(log_state, M_MFIBUF);
-			return (error);
-		}
 
 		/*
 		 * Walk through any events that fired since the last
 		 * shutdown.
 		 */
-		mfi_parse_entries(sc, log_state->shutdown_seq_num,
-		    log_state->newest_seq_num);
+		if ((error = mfi_parse_entries(sc, log_state->shutdown_seq_num,
+		    log_state->newest_seq_num)) != 0)
+			goto out;
 		seq = log_state->newest_seq_num;
 	} else
 		seq = seq_start;
-	mfi_aen_register(sc, seq, class_locale.word);
+	error = mfi_aen_register(sc, seq, class_locale.word);
+out:
 	free(log_state, M_MFIBUF);
 
-	return 0;
+	return (error);
 }
 
 int
@@ -1072,7 +1120,6 @@ mfi_wait_command(struct mfi_softc *sc, struct mfi_command *cm)
 	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 	cm->cm_complete = NULL;
 
-
 	/*
 	 * MegaCli can issue a DCMD of 0.  In this case do nothing
 	 * and return 0 to it as status
@@ -1100,12 +1147,13 @@ mfi_free(struct mfi_softc *sc)
 	if (sc->mfi_cdev != NULL)
 		destroy_dev(sc->mfi_cdev);
 
-	if (sc->mfi_total_cmds != 0) {
-		for (i = 0; i < sc->mfi_total_cmds; i++) {
+	if (sc->mfi_commands != NULL) {
+		for (i = 0; i < sc->mfi_max_fw_cmds; i++) {
 			cm = &sc->mfi_commands[i];
 			bus_dmamap_destroy(sc->mfi_buffer_dmat, cm->cm_dmamap);
 		}
 		free(sc->mfi_commands, M_MFIBUF);
+		sc->mfi_commands = NULL;
 	}
 
 	if (sc->mfi_intr)
@@ -1161,7 +1209,8 @@ mfi_free(struct mfi_softc *sc)
 		/* End LSIP200113393 */
 		/* ThunderBolt INIT packet memory Free */
 		if (sc->mfi_tb_init_busaddr != 0)
-			bus_dmamap_unload(sc->mfi_tb_init_dmat, sc->mfi_tb_init_dmamap);
+			bus_dmamap_unload(sc->mfi_tb_init_dmat,
+			    sc->mfi_tb_init_dmamap);
 		if (sc->mfi_tb_init != NULL)
 			bus_dmamem_free(sc->mfi_tb_init_dmat, sc->mfi_tb_init,
 			    sc->mfi_tb_init_dmamap);
@@ -1178,16 +1227,14 @@ mfi_free(struct mfi_softc *sc)
 			    sc->mfi_tb_ioc_init_dmamap);
 		if (sc->mfi_tb_ioc_init_dmat != NULL)
 			bus_dma_tag_destroy(sc->mfi_tb_ioc_init_dmat);
-		for (int i = 0; i < sc->mfi_max_fw_cmds; i++) {
-			if (sc->mfi_cmd_pool_tbolt != NULL) {
+		if (sc->mfi_cmd_pool_tbolt != NULL) {
+			for (int i = 0; i < sc->mfi_max_fw_cmds; i++) {
 				if (sc->mfi_cmd_pool_tbolt[i] != NULL) {
 					free(sc->mfi_cmd_pool_tbolt[i],
 					    M_MFIBUF);
 					sc->mfi_cmd_pool_tbolt[i] = NULL;
 				}
 			}
-		}
-		if (sc->mfi_cmd_pool_tbolt != NULL) {
 			free(sc->mfi_cmd_pool_tbolt, M_MFIBUF);
 			sc->mfi_cmd_pool_tbolt = NULL;
 		}
@@ -1252,16 +1299,14 @@ restart:
 			cm->cm_error = 0;
 			mfi_complete(sc, cm);
 		}
-		if (++ci == (sc->mfi_max_fw_cmds + 1)) {
+		if (++ci == (sc->mfi_max_fw_cmds + 1))
 			ci = 0;
-		}
 	}
 
 	sc->mfi_comms->hw_ci = ci;
 
 	/* Give defered I/O a chance to run */
-	if (sc->mfi_flags & MFI_FLAGS_QFRZN)
-		sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
+	sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
 	mfi_startio(sc);
 	mtx_unlock(&sc->mfi_io_lock);
 
@@ -1284,15 +1329,15 @@ mfi_shutdown(struct mfi_softc *sc)
 	int error;
 
 
-	if (sc->mfi_aen_cm)
+	if (sc->mfi_aen_cm != NULL) {
 		sc->cm_aen_abort = 1;
-	if (sc->mfi_aen_cm != NULL)
 		mfi_abort(sc, &sc->mfi_aen_cm);
+	}
 
-	if (sc->mfi_map_sync_cm)
+	if (sc->mfi_map_sync_cm != NULL) {
 		sc->cm_map_abort = 1;
-	if (sc->mfi_map_sync_cm != NULL)
 		mfi_abort(sc, &sc->mfi_map_sync_cm);
+	}
 
 	mtx_lock(&sc->mfi_io_lock);
 	error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_SHUTDOWN, NULL, 0);
@@ -1306,9 +1351,8 @@ mfi_shutdown(struct mfi_softc *sc)
 	cm->cm_flags = MFI_CMD_POLLED;
 	cm->cm_data = NULL;
 
-	if ((error = mfi_mapcmd(sc, cm)) != 0) {
+	if ((error = mfi_mapcmd(sc, cm)) != 0)
 		device_printf(sc->mfi_dev, "Failed to shutdown controller\n");
-	}
 
 	mfi_release_command(cm);
 	mtx_unlock(&sc->mfi_io_lock);
@@ -1374,8 +1418,10 @@ mfi_syspdprobe(struct mfi_softc *sc)
 	TAILQ_FOREACH_SAFE(syspd, &sc->mfi_syspd_tqh, pd_link, tmp) {
 		found = 0;
 		for (i = 0; i < pdlist->count; i++) {
-			if (syspd->pd_id == pdlist->addr[i].device_id)
+			if (syspd->pd_id == pdlist->addr[i].device_id) {
 				found = 1;
+				break;
+			}
 		}
 		if (found == 0) {
 			printf("DELETE\n");
@@ -1628,6 +1674,8 @@ mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
 	struct mfi_evt_detail *ed = NULL;
 	int error = 0;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	current_aen.word = locale;
 	if (sc->mfi_aen_cm != NULL) {
 		prior_aen.word =
@@ -1646,13 +1694,10 @@ mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
 		}
 	}
 
-	mtx_lock(&sc->mfi_io_lock);
 	error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_WAIT,
 	    (void **)&ed, sizeof(*ed));
-	mtx_unlock(&sc->mfi_io_lock);
-	if (error) {
+	if (error)
 		goto out;
-	}
 
 	dcmd = &cm->cm_frame->dcmd;
 	((uint32_t *)&dcmd->mbox)[0] = seq;
@@ -1663,10 +1708,8 @@ mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
 	sc->last_seq_num = seq;
 	sc->mfi_aen_cm = cm;
 
-	mtx_lock(&sc->mfi_io_lock);
 	mfi_enqueue_ready(cm);
 	mfi_startio(sc);
-	mtx_unlock(&sc->mfi_io_lock);
 
 out:
 	return (error);
@@ -1684,11 +1727,11 @@ mfi_aen_complete(struct mfi_command *cm)
 	sc = cm->cm_sc;
 	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 
-	hdr = &cm->cm_frame->header;
-
 	if (sc->mfi_aen_cm == NULL)
 		return;
 
+	hdr = &cm->cm_frame->header;
+
 	if (sc->cm_aen_abort ||
 	    hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
 		sc->cm_aen_abort = 0;
@@ -1714,16 +1757,13 @@ mfi_aen_complete(struct mfi_command *cm)
 	}
 
 	free(cm->cm_data, M_MFIBUF);
-	sc->mfi_aen_cm = NULL;
 	wakeup(&sc->mfi_aen_cm);
+	sc->mfi_aen_cm = NULL;
 	mfi_release_command(cm);
 
 	/* set it up again so the driver can catch more events */
-	if (!aborted) {
-		mtx_unlock(&sc->mfi_io_lock);
+	if (!aborted)
 		mfi_aen_setup(sc, seq);
-		mtx_lock(&sc->mfi_io_lock);
-	}
 }
 
 #define MAX_EVENTS 15
@@ -1737,6 +1777,8 @@ mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
 	union mfi_evt class_locale;
 	int error, i, seq, size;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	class_locale.members.reserved = 0;
 	class_locale.members.locale = mfi_event_locale;
 	class_locale.members.evt_class  = mfi_event_class;
@@ -1748,13 +1790,10 @@ mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
 		return (ENOMEM);
 
 	for (seq = start_seq;;) {
-		mtx_lock(&sc->mfi_io_lock);
 		if ((cm = mfi_dequeue_free(sc)) == NULL) {
 			free(el, M_MFIBUF);
-			mtx_unlock(&sc->mfi_io_lock);
 			return (EBUSY);
 		}
-		mtx_unlock(&sc->mfi_io_lock);
 
 		dcmd = &cm->cm_frame->dcmd;
 		bzero(dcmd->mbox, MFI_MBOX_SIZE);
@@ -1770,38 +1809,30 @@ mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
 		cm->cm_data = el;
 		cm->cm_len = size;
 
-		mtx_lock(&sc->mfi_io_lock);
 		if ((error = mfi_mapcmd(sc, cm)) != 0) {
 			device_printf(sc->mfi_dev,
 			    "Failed to get controller entries\n");
 			mfi_release_command(cm);
-			mtx_unlock(&sc->mfi_io_lock);
 			break;
 		}
 
-		mtx_unlock(&sc->mfi_io_lock);
 		bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
 		    BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
 
 		if (dcmd->header.cmd_status == MFI_STAT_NOT_FOUND) {
-			mtx_lock(&sc->mfi_io_lock);
 			mfi_release_command(cm);
-			mtx_unlock(&sc->mfi_io_lock);
 			break;
 		}
 		if (dcmd->header.cmd_status != MFI_STAT_OK) {
 			device_printf(sc->mfi_dev,
 			    "Error %d fetching controller entries\n",
 			    dcmd->header.cmd_status);
-			mtx_lock(&sc->mfi_io_lock);
 			mfi_release_command(cm);
-			mtx_unlock(&sc->mfi_io_lock);
+			error = EIO;
 			break;
 		}
-		mtx_lock(&sc->mfi_io_lock);
 		mfi_release_command(cm);
-		mtx_unlock(&sc->mfi_io_lock);
 
 		for (i = 0; i < el->count; i++) {
 			/*
@@ -1817,15 +1848,13 @@ mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
 				else if (el->event[i].seq < start_seq)
 					break;
 			}
-			mtx_lock(&sc->mfi_io_lock);
 			mfi_queue_evt(sc, &el->event[i]);
-			mtx_unlock(&sc->mfi_io_lock);
 		}
 		seq = el->event[el->count - 1].seq + 1;
 	}
 
 	free(el, M_MFIBUF);
-	return (0);
+	return (error);
 }
 
 static int
@@ -1942,11 +1971,12 @@ static int mfi_add_sys_pd(struct mfi_softc *sc, int id)
 	dcmd->mbox[0]=id;
 	dcmd->header.scsi_status = 0;
 	dcmd->header.pad0 = 0;
-	if (mfi_mapcmd(sc, cm) != 0) {
+	if ((error = mfi_mapcmd(sc, cm)) != 0) {
 		device_printf(sc->mfi_dev,
 		    "Failed to get physical drive info %d\n", id);
 		free(pd_info, M_MFIBUF);
-		return (0);
+		mfi_release_command(cm);
+		return (error);
 	}
 	bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
 	    BUS_DMASYNC_POSTREAD);
@@ -2096,6 +2126,8 @@ mfi_build_syspdio(struct mfi_softc *sc, struct bio *bio)
 	int flags = 0, blkcount = 0, readop;
 	uint8_t cdb_len;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	if ((cm = mfi_dequeue_free(sc)) == NULL)
 	    return (NULL);
 
@@ -2142,6 +2174,7 @@ mfi_build_syspdio(struct mfi_softc *sc, struct bio *bio)
 	cm->cm_sg = &pass->sgl;
 	cm->cm_total_frame_size = MFI_PASS_FRAME_SIZE;
 	cm->cm_flags = flags;
+
 	return (cm);
 }
 
@@ -2154,6 +2187,8 @@ mfi_build_ldio(struct mfi_softc *sc, struct bio *bio)
 	uint32_t blkcount;
 	uint32_t context = 0;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	if ((cm = mfi_dequeue_free(sc)) == NULL)
 	    return (NULL);
 
@@ -2195,6 +2230,7 @@ mfi_build_ldio(struct mfi_softc *sc, struct bio *bio)
 	cm->cm_sg = &io->sgl;
 	cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
 	cm->cm_flags = flags;
+
 	return (cm);
 }
 
@@ -2212,11 +2248,14 @@ mfi_bio_complete(struct mfi_command *cm)
 	if ((hdr->cmd_status != MFI_STAT_OK) || (hdr->scsi_status != 0)) {
 		bio->bio_flags |= BIO_ERROR;
 		bio->bio_error = EIO;
-		device_printf(sc->mfi_dev, "I/O error, status= %d "
-		    "scsi_status= %d\n", hdr->cmd_status, hdr->scsi_status);
+		device_printf(sc->mfi_dev, "I/O error, cmd=%p, status=%#x, "
+		    "scsi_status=%#x\n", cm, hdr->cmd_status, hdr->scsi_status);
 		mfi_print_sense(cm->cm_sc, cm->cm_sense);
 	} else if (cm->cm_error != 0) {
 		bio->bio_flags |= BIO_ERROR;
+		bio->bio_error = cm->cm_error;
+		device_printf(sc->mfi_dev, "I/O error, cmd=%p, error=%#x\n",
+		    cm, cm->cm_error);
 	}
 
 	mfi_release_command(cm);
@@ -2252,6 +2291,7 @@ mfi_startio(struct mfi_softc *sc)
 
 		/* Send the command to the controller */
 		if (mfi_mapcmd(sc, cm) != 0) {
+			device_printf(sc->mfi_dev, "Failed to startio\n");
 			mfi_requeue_ready(cm);
 			break;
 		}
@@ -2280,10 +2320,7 @@ mfi_mapcmd(struct mfi_softc *sc, struct mfi_command *cm)
 			return (0);
 		}
 	} else {
-		if (sc->MFA_enabled)
-			error = mfi_tbolt_send_frame(sc, cm);
-		else
-			error = mfi_send_frame(sc, cm);
+		error = mfi_send_frame(sc, cm);
 	}
 
 	return (error);
@@ -2297,18 +2334,28 @@ mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	union mfi_sgl *sgl;
 	struct mfi_softc *sc;
 	int i, j, first, dir;
-	int sge_size;
+	int sge_size, locked;
 
 	cm = (struct mfi_command *)arg;
 	sc = cm->cm_sc;
 	hdr = &cm->cm_frame->header;
 	sgl = cm->cm_sg;
 
+	/*
+	 * We need to check if we have the lock as this is an async
+	 * callback, so even though our caller mfi_mapcmd asserts
+	 * it has the lock, there is no guarantee that it hasn't been
+	 * dropped if bus_dmamap_load returned prior to our
+	 * completion.
+	 */
+	if ((locked = mtx_owned(&sc->mfi_io_lock)) == 0)
+		mtx_lock(&sc->mfi_io_lock);
+
 	if (error) {
 		printf("error %d in callback\n", error);
 		cm->cm_error = error;
 		mfi_complete(sc, cm);
-		return;
+		goto out;
 	}
 	/* Use IEEE sgl only for IO's on a SKINNY controller
 	 * For other commands on a SKINNY controller use either
@@ -2380,10 +2427,17 @@ mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	cm->cm_total_frame_size += (sc->mfi_sge_size * nsegs);
 	cm->cm_extra_frames = (cm->cm_total_frame_size - 1) / MFI_FRAME_SIZE;
 
-	if (sc->MFA_enabled)
-			mfi_tbolt_send_frame(sc, cm);
-	else
-		mfi_send_frame(sc, cm);
+	if ((error = mfi_send_frame(sc, cm)) != 0) {
+		printf("error %d in callback from mfi_send_frame\n", error);
+		cm->cm_error = error;
+		mfi_complete(sc, cm);
+		goto out;
+	}
+
+out:
+	/* leave the lock in the state we found it */
+	if (locked == 0)
+		mtx_unlock(&sc->mfi_io_lock);
 
 	return;
 }
@@ -2391,8 +2445,26 @@ mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 static int
 mfi_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
 {
+	int error;
+
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
+	if (sc->MFA_enabled)
+		error = mfi_tbolt_send_frame(sc, cm);
+	else
+		error = mfi_std_send_frame(sc, cm);
+
+	if (error != 0 && (cm->cm_flags & MFI_ON_MFIQ_BUSY) != 0)
+		mfi_remove_busy(cm);
+
+	return (error);
+}
+
+static int
+mfi_std_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
+{
 	struct mfi_frame_header *hdr;
-	int tm = MFI_POLL_TIMEOUT_SECS * 1000;
+	int tm = mfi_polled_cmd_timeout * 1000;
 
 	hdr = &cm->cm_frame->header;
 
@@ -2446,6 +2518,7 @@ void
 mfi_complete(struct mfi_softc *sc, struct mfi_command *cm)
 {
 	int dir;
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 
 	if ((cm->cm_flags & MFI_CMD_MAPPED) != 0) {
 		dir = 0;
@@ -2473,11 +2546,12 @@ mfi_abort(struct mfi_softc *sc, struct mfi_command **cm_abort)
 {
 	struct mfi_command *cm;
 	struct mfi_abort_frame *abort;
-	int i = 0;
+	int i = 0, error;
 	uint32_t context = 0;
 
 	mtx_lock(&sc->mfi_io_lock);
 	if ((cm = mfi_dequeue_free(sc)) == NULL) {
+		mtx_unlock(&sc->mfi_io_lock);
 		return (EBUSY);
 	}
 
@@ -2497,7 +2571,8 @@ mfi_abort(struct mfi_softc *sc, struct mfi_command **cm_abort)
 	cm->cm_data = NULL;
 	cm->cm_flags = MFI_CMD_POLLED;
 
-	mfi_mapcmd(sc, cm);
+	if ((error = mfi_mapcmd(sc, cm)) != 0)
+		device_printf(sc->mfi_dev, "failed to abort command\n");
 	mfi_release_command(cm);
 
 	mtx_unlock(&sc->mfi_io_lock);
@@ -2513,7 +2588,7 @@ mfi_abort(struct mfi_softc *sc, struct mfi_command **cm_abort)
 		mtx_unlock(&sc->mfi_io_lock);
 	}
 
-	return (0);
+	return (error);
 }
 
 int
@@ -2551,7 +2626,8 @@ mfi_dump_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt,
 	cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
 	cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT;
 
-	error = mfi_mapcmd(sc, cm);
+	if ((error = mfi_mapcmd(sc, cm)) != 0)
+		device_printf(sc->mfi_dev, "failed dump blocks\n");
 	bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
 	    BUS_DMASYNC_POSTWRITE);
 	bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
@@ -2594,7 +2670,8 @@ mfi_dump_syspd_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt,
 	cm->cm_total_frame_size = MFI_PASS_FRAME_SIZE;
 	cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT | MFI_CMD_SCSI;
 
-	error = mfi_mapcmd(sc, cm);
+	if ((error = mfi_mapcmd(sc, cm)) != 0)
+		device_printf(sc->mfi_dev, "failed dump blocks\n");
 	bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
 	    BUS_DMASYNC_POSTWRITE);
 	bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
@@ -3308,8 +3385,10 @@ out:
 		}
 	case MFI_SET_AEN:
 		aen = (struct mfi_ioc_aen *)arg;
+		mtx_lock(&sc->mfi_io_lock);
 		error = mfi_aen_register(sc, aen->aen_seq_num,
 		    aen->aen_class_locale);
+		mtx_unlock(&sc->mfi_io_lock);
 
 		break;
 	case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
@@ -3638,7 +3717,7 @@ mfi_dump_all(void)
 		deadline = time_uptime - MFI_CMD_TIMEOUT;
 		mtx_lock(&sc->mfi_io_lock);
 		TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
-			if (cm->cm_timestamp < deadline) {
+			if (cm->cm_timestamp <= deadline) {
 				device_printf(sc->mfi_dev,
 				    "COMMAND %p TIMEOUT AFTER %d SECONDS\n",
 				    cm, (int)(time_uptime - cm->cm_timestamp));
@@ -3649,7 +3728,7 @@ mfi_dump_all(void)
 
 #if 0
 		if (timedout)
-			MFI_DUMP_CMDS(SC);
+			MFI_DUMP_CMDS(sc);
 #endif
 
 		mtx_unlock(&sc->mfi_io_lock);
@@ -3662,7 +3741,7 @@ static void
 mfi_timeout(void *data)
 {
 	struct mfi_softc *sc = (struct mfi_softc *)data;
-	struct mfi_command *cm;
+	struct mfi_command *cm, *tmp;
 	time_t deadline;
 	int timedout = 0;
 
@@ -3674,10 +3753,10 @@ mfi_timeout(void *data)
 		}
 	}
 	mtx_lock(&sc->mfi_io_lock);
-	TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
+	TAILQ_FOREACH_SAFE(cm, &sc->mfi_busy, cm_link, tmp) {
 		if (sc->mfi_aen_cm == cm || sc->mfi_map_sync_cm == cm)
 			continue;
-		if (cm->cm_timestamp < deadline) {
+		if (cm->cm_timestamp <= deadline) {
 			if (sc->adpreset != 0 && sc->issuepend_done == 0) {
 				cm->cm_timestamp = time_uptime;
 			} else {
@@ -3687,6 +3766,13 @@ mfi_timeout(void *data)
 				     );
 				MFI_PRINT_CMD(cm);
 				MFI_VALIDATE_CMD(sc, cm);
+				/*
+				 * Fail the command instead of leaving it on
+				 * the queue where it could remain stuck forever
+				 */
+				mfi_remove_busy(cm);
+				cm->cm_error = ETIMEDOUT;
+				mfi_complete(sc, cm);
 				timedout++;
 			}
 		}
@@ -3694,7 +3780,7 @@ mfi_timeout(void *data)
 
 #if 0
 	if (timedout)
-		MFI_DUMP_CMDS(SC);
+		MFI_DUMP_CMDS(sc);
 #endif
 
 	mtx_unlock(&sc->mfi_io_lock);
diff --git a/sys/dev/mfi/mfi_cam.c b/sys/dev/mfi/mfi_cam.c
index 325b064..0ea2326 100644
--- a/sys/dev/mfi/mfi_cam.c
+++ b/sys/dev/mfi/mfi_cam.c
@@ -145,6 +145,7 @@ mfip_attach(device_t dev)
 				MFI_SCSI_MAX_CMDS, sc->devq);
 	if (sc->sim == NULL) {
 		cam_simq_free(sc->devq);
+		sc->devq = NULL;
 		device_printf(dev, "CAM SIM attach failed\n");
 		return (EINVAL);
 	}
@@ -155,7 +156,9 @@ mfip_attach(device_t dev)
 	if (xpt_bus_register(sc->sim, dev, 0) != 0) {
 		device_printf(dev, "XPT bus registration failed\n");
 		cam_sim_free(sc->sim, FALSE);
+		sc->sim = NULL;
 		cam_simq_free(sc->devq);
+		sc->devq = NULL;
 		mtx_unlock(&mfisc->mfi_io_lock);
 		return (EINVAL);
 	}
@@ -187,11 +190,14 @@ mfip_detach(device_t dev)
 		mtx_lock(&sc->mfi_sc->mfi_io_lock);
 		xpt_bus_deregister(cam_sim_path(sc->sim));
 		cam_sim_free(sc->sim, FALSE);
+		sc->sim = NULL;
 		mtx_unlock(&sc->mfi_sc->mfi_io_lock);
 	}
 
-	if (sc->devq != NULL)
+	if (sc->devq != NULL) {
 		cam_simq_free(sc->devq);
+		sc->devq = NULL;
+	}
 
 	return (0);
 }
diff --git a/sys/dev/mfi/mfi_debug.c b/sys/dev/mfi/mfi_debug.c
index 2e66e19..4aec4f7 100644
--- a/sys/dev/mfi/mfi_debug.c
+++ b/sys/dev/mfi/mfi_debug.c
@@ -57,14 +57,7 @@ __FBSDID("$FreeBSD$");
 static void
 mfi_print_frame_flags(device_t dev, uint32_t flags)
 {
-	device_printf(dev, "flags=%b\n", flags,
-	    "\20"
-	    "\1NOPOST"
-	    "\2SGL64"
-	    "\3SENSE64"
-	    "\4WRITE"
-	    "\5READ"
-	    "\6IEEESGL");
+	device_printf(dev, "flags=%b\n", flags, MFI_FRAME_FMT);
 }
 
 static void
@@ -205,16 +198,7 @@ mfi_print_cmd(struct mfi_command *cm)
 	device_printf(dev, "cm=%p index=%d total_frame_size=%d "
 	    "extra_frames=%d\n", cm, cm->cm_index, cm->cm_total_frame_size,
 	    cm->cm_extra_frames);
-	device_printf(dev, "flags=%b\n", cm->cm_flags,
-	    "\20"
-	    "\1MAPPED"
-	    "\2DATAIN"
-	    "\3DATAOUT"
-	    "\4COMPLETED"
-	    "\5POLLED"
-	    "\6Q_FREE"
-	    "\7Q_READY"
-	    "\10Q_BUSY");
+	device_printf(dev, "flags=%b\n", cm->cm_flags, MFI_CMD_FLAGS_FMT);
 
 	switch (cm->cm_frame->header.cmd) {
 	case MFI_CMD_DCMD:
@@ -237,7 +221,7 @@ mfi_dump_cmds(struct mfi_softc *sc)
 {
 	int i;
 
-	for (i = 0; i < sc->mfi_total_cmds; i++)
+	for (i = 0; i < sc->mfi_max_fw_cmds; i++)
 		mfi_print_generic_frame(sc, &sc->mfi_commands[i]);
 }
 
diff --git a/sys/dev/mfi/mfi_tbolt.c b/sys/dev/mfi/mfi_tbolt.c
index cce63c0..9d29ea0 100644
--- a/sys/dev/mfi/mfi_tbolt.c
+++ b/sys/dev/mfi/mfi_tbolt.c
@@ -55,14 +55,12 @@ __FBSDID("$FreeBSD$");
 #include <dev/mfi/mfi_ioctl.h>
 #include <dev/mfi/mfivar.h>
 
-struct mfi_cmd_tbolt *mfi_tbolt_get_cmd(struct mfi_softc *sc);
+struct mfi_cmd_tbolt *mfi_tbolt_get_cmd(struct mfi_softc *sc, struct mfi_command *);
 union mfi_mpi2_request_descriptor *
 mfi_tbolt_get_request_descriptor(struct mfi_softc *sc, uint16_t index);
 void mfi_tbolt_complete_cmd(struct mfi_softc *sc);
 int mfi_tbolt_build_io(struct mfi_softc *sc, struct mfi_command *mfi_cmd,
     struct mfi_cmd_tbolt *cmd);
-static inline void mfi_tbolt_return_cmd(struct mfi_softc *sc,
-    struct mfi_cmd_tbolt *cmd);
 union mfi_mpi2_request_descriptor *mfi_tbolt_build_mpt_cmd(struct mfi_softc
     *sc, struct mfi_command *cmd);
 uint8_t
@@ -84,6 +82,15 @@ static void mfi_queue_map_sync(struct mfi_softc *sc);
 
 #define MFI_FUSION_ENABLE_INTERRUPT_MASK	(0x00000008)
 
+
+extern int	mfi_polled_cmd_timeout;
+static int	mfi_fw_reset_test = 0;
+#ifdef MFI_DEBUG
+TUNABLE_INT("hw.mfi.fw_reset_test", &mfi_fw_reset_test);
+SYSCTL_INT(_hw_mfi, OID_AUTO, fw_reset_test, CTLFLAG_RWTUN, &mfi_fw_reset_test,
+           0, "Force a firmware reset condition");
+#endif
+
 void
 mfi_tbolt_enable_intr_ppc(struct mfi_softc *sc)
 {
@@ -162,14 +169,14 @@ mfi_tbolt_adp_reset(struct mfi_softc *sc)
 	while (!( HostDiag & DIAG_WRITE_ENABLE)) {
 		for (i = 0; i < 1000; i++);
 		HostDiag = (uint32_t)MFI_READ4(sc, MFI_HDR);
-		device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: retry time=%x, "
-		    "hostdiag=%x\n", retry, HostDiag);
+		device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: retry time=%d, "
+		    "hostdiag=%#x\n", retry, HostDiag);
 
 		if (retry++ >= 100)
 			return 1;
 	}
 
-	device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: HostDiag=%x\n", HostDiag);
+	device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: HostDiag=%#x\n", HostDiag);
 
 	MFI_WRITE4(sc, MFI_HDR, (HostDiag | DIAG_RESET_ADAPTER));
 
@@ -181,8 +188,8 @@ mfi_tbolt_adp_reset(struct mfi_softc *sc)
 	while (HostDiag & DIAG_RESET_ADAPTER) {
 		for (i = 0; i < 1000; i++) ;
 		HostDiag = (uint32_t)MFI_READ4(sc, MFI_RSR);
-		device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: retry time=%x, "
-		    "hostdiag=%x\n", retry, HostDiag);
+		device_printf(sc->mfi_dev, "ADP_RESET_TBOLT: retry time=%d, "
+		    "hostdiag=%#x\n", retry, HostDiag);
 
 		if (retry++ >= 1000)
 			return 1;
@@ -311,6 +318,8 @@ mfi_tbolt_init_desc_pool(struct mfi_softc *sc, uint8_t* mem_location,
 	sc->sg_frame_busaddr = sc->reply_frame_busaddr + offset;
 	/* initialize the last_reply_idx to 0 */
 	sc->last_reply_idx = 0;
+	MFI_WRITE4(sc, MFI_RFPI, sc->mfi_max_fw_cmds - 1);
+	MFI_WRITE4(sc, MFI_RPI, sc->last_reply_idx);
 	offset = (sc->sg_frame_busaddr + (MEGASAS_MAX_SZ_CHAIN_FRAME *
 	    sc->mfi_max_fw_cmds)) - sc->mfi_tb_busaddr;
 	if (offset > tbolt_contg_length)
@@ -327,30 +336,35 @@ int
 mfi_tbolt_init_MFI_queue(struct mfi_softc *sc)
 {
 	struct MPI2_IOC_INIT_REQUEST   *mpi2IocInit;
-	struct mfi_init_frame	*mfi_init;
+	struct mfi_init_frame		*mfi_init;
 	uintptr_t			offset = 0;
 	bus_addr_t			phyAddress;
 	MFI_ADDRESS			*mfiAddressTemp;
-	struct mfi_command *cm;
+	struct mfi_command		*cm, cmd_tmp;
 	int error;
 
-	mpi2IocInit = (struct MPI2_IOC_INIT_REQUEST *)sc->mfi_tb_ioc_init_desc;
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	/* Check if initialization is already completed */
 	if (sc->MFA_enabled) {
+		device_printf(sc->mfi_dev, "tbolt_init already initialised!\n");
 		return 1;
 	}
 
-	mtx_lock(&sc->mfi_io_lock);
 	if ((cm = mfi_dequeue_free(sc)) == NULL) {
-		mtx_unlock(&sc->mfi_io_lock);
+		device_printf(sc->mfi_dev, "tbolt_init failed to get command "
+		    " entry!\n");
 		return (EBUSY);
 	}
+
+	cmd_tmp.cm_frame = cm->cm_frame;
+	cmd_tmp.cm_frame_busaddr = cm->cm_frame_busaddr;
+	cmd_tmp.cm_dmamap = cm->cm_dmamap;
+
 	cm->cm_frame = (union mfi_frame *)((uintptr_t)sc->mfi_tb_init);
 	cm->cm_frame_busaddr = sc->mfi_tb_init_busaddr;
 	cm->cm_dmamap = sc->mfi_tb_init_dmamap;
 	cm->cm_frame->header.context = 0;
-	cm->cm_sc = sc;
-	cm->cm_index = 0;
 
 	/*
 	 * Abuse the SG list area of the frame to hold the init_qinfo
@@ -358,6 +372,7 @@ mfi_tbolt_init_MFI_queue(struct mfi_softc *sc)
 	 */
 	mfi_init = &cm->cm_frame->init;
 
+	mpi2IocInit = (struct MPI2_IOC_INIT_REQUEST *)sc->mfi_tb_ioc_init_desc;
 	bzero(mpi2IocInit, sizeof(struct MPI2_IOC_INIT_REQUEST));
 	mpi2IocInit->Function  = MPI2_FUNCTION_IOC_INIT;
 	mpi2IocInit->WhoInit   = MPI2_WHOINIT_HOST_DRIVER;
@@ -411,23 +426,25 @@ mfi_tbolt_init_MFI_queue(struct mfi_softc *sc)
 	if ((error = mfi_mapcmd(sc, cm)) != 0) {
 		device_printf(sc->mfi_dev, "failed to send IOC init2 "
 		    "command %d at %lx\n", error, (long)cm->cm_frame_busaddr);
-		mfi_release_command(cm);
-		mtx_unlock(&sc->mfi_io_lock);
-		return (error);
+		goto out;
 	}
-	mfi_release_command(cm);
-	mtx_unlock(&sc->mfi_io_lock);
 
-	if (mfi_init->header.cmd_status == 0) {
+	if (mfi_init->header.cmd_status == MFI_STAT_OK) {
 		sc->MFA_enabled = 1;
-	}
-	else {
-		device_printf(sc->mfi_dev, "Init command Failed %x\n",
+	} else {
+		device_printf(sc->mfi_dev, "Init command Failed %#x\n",
 		    mfi_init->header.cmd_status);
-		return 1;
+		error = mfi_init->header.cmd_status;
+		goto out;
 	}
 
-	return 0;
+out:
+	cm->cm_frame = cmd_tmp.cm_frame;
+	cm->cm_frame_busaddr = cmd_tmp.cm_frame_busaddr;
+	cm->cm_dmamap = cmd_tmp.cm_dmamap;
+	mfi_release_command(cm);
+
+	return (error);
 
 }
 
@@ -447,13 +464,21 @@ mfi_tbolt_alloc_cmd(struct mfi_softc *sc)
 	sc->request_desc_pool = malloc(sizeof(
 	    union mfi_mpi2_request_descriptor) * sc->mfi_max_fw_cmds,
 	    M_MFIBUF, M_NOWAIT|M_ZERO);
+
+	if (sc->request_desc_pool == NULL) {
+		device_printf(sc->mfi_dev, "Could not alloc "
+		    "memory for request_desc_pool\n");
+		return (ENOMEM);
+	}
+
 	sc->mfi_cmd_pool_tbolt = malloc(sizeof(struct mfi_cmd_tbolt*)
 	    * sc->mfi_max_fw_cmds, M_MFIBUF, M_NOWAIT|M_ZERO);
 
-	if (!sc->mfi_cmd_pool_tbolt) {
-		device_printf(sc->mfi_dev, "out of memory. Could not alloc "
-		    "memory for cmd_list_fusion\n");
-		return 1;
+	if (sc->mfi_cmd_pool_tbolt == NULL) {
+		free(sc->request_desc_pool, M_MFIBUF);
+		device_printf(sc->mfi_dev, "Could not alloc "
+		    "memory for cmd_pool_tbolt\n");
+		return (ENOMEM);
 	}
 
 	for (i = 0; i < sc->mfi_max_fw_cmds; i++) {
@@ -461,20 +486,24 @@ mfi_tbolt_alloc_cmd(struct mfi_softc *sc)
 		    struct mfi_cmd_tbolt),M_MFIBUF, M_NOWAIT|M_ZERO);
 
 		if (!sc->mfi_cmd_pool_tbolt[i]) {
-			device_printf(sc->mfi_dev, "Could not alloc cmd list "
-			    "fusion\n");
+			device_printf(sc->mfi_dev, "Could not alloc "
+			    "cmd_pool_tbolt entry\n");
 
 			for (j = 0; j < i; j++)
 				free(sc->mfi_cmd_pool_tbolt[j], M_MFIBUF);
 
+			free(sc->request_desc_pool, M_MFIBUF);
+			sc->request_desc_pool = NULL;
 			free(sc->mfi_cmd_pool_tbolt, M_MFIBUF);
 			sc->mfi_cmd_pool_tbolt = NULL;
+
+			return (ENOMEM);
 		}
 	}
 
 	/*
 	 * The first 256 bytes (SMID 0) is not used. Don't add to the cmd
-	 *list
+	 * list
 	 */
 	io_req_base = sc->request_message_pool_align
 		+ MEGASAS_THUNDERBOLT_NEW_MSG_SIZE;
@@ -520,7 +549,8 @@ mfi_tbolt_reset(struct mfi_softc *sc)
 
 	if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
 		fw_state = sc->mfi_read_fw_status(sc);
-		if ((fw_state & MFI_FWSTATE_FAULT) == MFI_FWSTATE_FAULT) {
+		if ((fw_state & MFI_FWSTATE_FAULT) == MFI_FWSTATE_FAULT ||
+		    mfi_fw_reset_test) {
 			if ((sc->disableOnlineCtrlReset == 0)
 			    && (sc->adpreset == 0)) {
 				device_printf(sc->mfi_dev, "Adapter RESET "
@@ -554,8 +584,7 @@ mfi_intr_tbolt(void *arg)
 		return;
 	mtx_lock(&sc->mfi_io_lock);
 	mfi_tbolt_complete_cmd(sc);
-	if (sc->mfi_flags & MFI_FLAGS_QFRZN)
-		sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
+	sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
 	mfi_startio(sc);
 	mtx_unlock(&sc->mfi_io_lock);
 	return;
@@ -573,58 +602,63 @@ map_tbolt_cmd_status(struct mfi_command *mfi_cmd, uint8_t status,
     uint8_t ext_status)
 {
 	switch (status) {
-		case MFI_STAT_OK:
-			mfi_cmd->cm_frame->header.cmd_status = MFI_STAT_OK;
-			mfi_cmd->cm_frame->dcmd.header.cmd_status = MFI_STAT_OK;
-			mfi_cmd->cm_error = MFI_STAT_OK;
-			break;
-
-		case MFI_STAT_SCSI_IO_FAILED:
-		case MFI_STAT_LD_INIT_IN_PROGRESS:
-			mfi_cmd->cm_frame->header.cmd_status = status;
-			mfi_cmd->cm_frame->header.scsi_status = ext_status;
-			mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
-			mfi_cmd->cm_frame->dcmd.header.scsi_status
-			    = ext_status;
-			break;
-
-		case MFI_STAT_SCSI_DONE_WITH_ERROR:
-			mfi_cmd->cm_frame->header.cmd_status = ext_status;
-			mfi_cmd->cm_frame->dcmd.header.cmd_status = ext_status;
-			break;
-
-		case MFI_STAT_LD_OFFLINE:
-		case MFI_STAT_DEVICE_NOT_FOUND:
-			mfi_cmd->cm_frame->header.cmd_status = status;
-			mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
-			break;
-
-		default:
-			mfi_cmd->cm_frame->header.cmd_status = status;
-			mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
-			break;
-		}
+	case MFI_STAT_OK:
+		mfi_cmd->cm_frame->header.cmd_status = MFI_STAT_OK;
+		mfi_cmd->cm_frame->dcmd.header.cmd_status = MFI_STAT_OK;
+		mfi_cmd->cm_error = MFI_STAT_OK;
+		break;
+
+	case MFI_STAT_SCSI_IO_FAILED:
+	case MFI_STAT_LD_INIT_IN_PROGRESS:
+		mfi_cmd->cm_frame->header.cmd_status = status;
+		mfi_cmd->cm_frame->header.scsi_status = ext_status;
+		mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
+		mfi_cmd->cm_frame->dcmd.header.scsi_status
+		    = ext_status;
+		break;
+
+	case MFI_STAT_SCSI_DONE_WITH_ERROR:
+		mfi_cmd->cm_frame->header.cmd_status = ext_status;
+		mfi_cmd->cm_frame->dcmd.header.cmd_status = ext_status;
+		break;
+
+	case MFI_STAT_LD_OFFLINE:
+	case MFI_STAT_DEVICE_NOT_FOUND:
+		mfi_cmd->cm_frame->header.cmd_status = status;
+		mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
+		break;
+
+	default:
+		mfi_cmd->cm_frame->header.cmd_status = status;
+		mfi_cmd->cm_frame->dcmd.header.cmd_status = status;
+		break;
+	}
 }
 
 /*
  * mfi_tbolt_return_cmd -	Return a cmd to free command pool
  * @instance:		Adapter soft state
- * @cmd:		Command packet to be returned to free command pool
+ * @tbolt_cmd:		Tbolt command packet to be returned to free command pool
+ * @mfi_cmd:		Owning MFI command packet
  */
-static inline void
-mfi_tbolt_return_cmd(struct mfi_softc *sc, struct mfi_cmd_tbolt *cmd)
+void
+mfi_tbolt_return_cmd(struct mfi_softc *sc, struct mfi_cmd_tbolt *tbolt_cmd,
+    struct mfi_command *mfi_cmd)
 {
 	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 
-	cmd->sync_cmd_idx = sc->mfi_max_fw_cmds;
-	TAILQ_INSERT_TAIL(&sc->mfi_cmd_tbolt_tqh, cmd, next);
+	mfi_cmd->cm_flags &= ~MFI_CMD_TBOLT;
+	mfi_cmd->cm_extra_frames = 0;
+	tbolt_cmd->sync_cmd_idx = sc->mfi_max_fw_cmds;
+
+	TAILQ_INSERT_TAIL(&sc->mfi_cmd_tbolt_tqh, tbolt_cmd, next);
 }
 
 void
 mfi_tbolt_complete_cmd(struct mfi_softc *sc)
 {
 	struct mfi_mpi2_reply_header *desc, *reply_desc;
-	struct mfi_command *cmd_mfi, *cmd_mfi_check;	/* For MFA Cmds */
+	struct mfi_command *cmd_mfi;	/* For MFA Cmds */
 	struct mfi_cmd_tbolt *cmd_tbolt;
 	uint16_t smid;
 	uint8_t reply_descript_type;
@@ -632,14 +666,17 @@ mfi_tbolt_complete_cmd(struct mfi_softc *sc)
 	uint32_t status, extStatus;
 	uint16_t num_completed;
 	union desc_value val;
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 
 	desc = (struct mfi_mpi2_reply_header *)
 		((uintptr_t)sc->reply_frame_pool_align
 		+ sc->last_reply_idx * sc->reply_size);
 	reply_desc = desc;
 
-	if (!reply_desc)
+	if (reply_desc == NULL) {
 		device_printf(sc->mfi_dev, "reply desc is NULL!!\n");
+		return;
+	}
 
 	reply_descript_type = reply_desc->ReplyFlags
 	     & MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
@@ -652,13 +689,18 @@ mfi_tbolt_complete_cmd(struct mfi_softc *sc)
 	/* Read Reply descriptor */
 	while ((val.u.low != 0xFFFFFFFF) && (val.u.high != 0xFFFFFFFF)) {
 		smid = reply_desc->SMID;
-		if (!smid || smid > sc->mfi_max_fw_cmds + 1) {
-			device_printf(sc->mfi_dev, "smid is %x. Cannot "
-			    "proceed. Returning \n", smid);
-			return;
+		if (smid == 0 || smid > sc->mfi_max_fw_cmds) {
+			device_printf(sc->mfi_dev, "smid is %d cannot "
+			    "proceed - skipping\n", smid);
+			goto next;
 		}
-
 		cmd_tbolt = sc->mfi_cmd_pool_tbolt[smid - 1];
+		if (cmd_tbolt->sync_cmd_idx == sc->mfi_max_fw_cmds) {
+			device_printf(sc->mfi_dev, "cmd_tbolt %p "
+			    "has invalid sync_cmd_idx=%d - skipping\n",
+			    cmd_tbolt, cmd_tbolt->sync_cmd_idx);
+			goto next;
+		}
 		cmd_mfi = &sc->mfi_commands[cmd_tbolt->sync_cmd_idx];
 		scsi_io_req = cmd_tbolt->io_request;
 
@@ -666,33 +708,30 @@ mfi_tbolt_complete_cmd(struct mfi_softc *sc)
 		extStatus = cmd_mfi->cm_frame->dcmd.header.scsi_status;
 		map_tbolt_cmd_status(cmd_mfi, status, extStatus);
 
-		if (cmd_mfi->cm_flags & MFI_CMD_SCSI &&
+		/* mfi_tbolt_return_cmd is handled by mfi complete / return */
+		if ((cmd_mfi->cm_flags & MFI_CMD_SCSI) != 0 &&
 		    (cmd_mfi->cm_flags & MFI_CMD_POLLED) != 0) {
 			/* polled LD/SYSPD IO command */
-			mfi_tbolt_return_cmd(sc, cmd_tbolt);
 			/* XXX mark okay for now DJA */
 			cmd_mfi->cm_frame->header.cmd_status = MFI_STAT_OK;
-		} else {
 
+		} else {
 			/* remove command from busy queue if not polled */
-			TAILQ_FOREACH(cmd_mfi_check, &sc->mfi_busy, cm_link) {
-				if (cmd_mfi_check == cmd_mfi) {
-					mfi_remove_busy(cmd_mfi);
-					break;
-				}
-			}
+			if ((cmd_mfi->cm_flags & MFI_ON_MFIQ_BUSY) != 0)
+				mfi_remove_busy(cmd_mfi);
 
 			/* complete the command */
 			mfi_complete(sc, cmd_mfi);
-			mfi_tbolt_return_cmd(sc, cmd_tbolt);
 		}
 
+next:
 		sc->last_reply_idx++;
 		if (sc->last_reply_idx >= sc->mfi_max_fw_cmds) {
 			MFI_WRITE4(sc, MFI_RPI, sc->last_reply_idx);
 			sc->last_reply_idx = 0;
 		}
-		/*set it back to all 0xfff.*/
+
+		/* Set it back to all 0xfff */
 		((union mfi_mpi2_reply_descriptor*)desc)->words =
 			~((uint64_t)0x00);
 
@@ -728,17 +767,23 @@ mfi_tbolt_complete_cmd(struct mfi_softc *sc)
  */
 
 struct mfi_cmd_tbolt *
-mfi_tbolt_get_cmd(struct mfi_softc *sc)
+mfi_tbolt_get_cmd(struct mfi_softc *sc, struct mfi_command *mfi_cmd)
 {
 	struct mfi_cmd_tbolt *cmd = NULL;
 
 	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 
-	cmd = TAILQ_FIRST(&sc->mfi_cmd_tbolt_tqh);
+	if ((cmd = TAILQ_FIRST(&sc->mfi_cmd_tbolt_tqh)) == NULL)
+		return (NULL);
 	TAILQ_REMOVE(&sc->mfi_cmd_tbolt_tqh, cmd, next);
 	memset((uint8_t *)cmd->sg_frame, 0, MEGASAS_MAX_SZ_CHAIN_FRAME);
 	memset((uint8_t *)cmd->io_request, 0,
 	    MEGASAS_THUNDERBOLT_NEW_MSG_SIZE);
+
+	cmd->sync_cmd_idx = mfi_cmd->cm_index;
+	mfi_cmd->cm_extra_frames = cmd->index; /* Frame count used as SMID */
+	mfi_cmd->cm_flags |= MFI_CMD_TBOLT;
+
 	return cmd;
 }
 
@@ -767,11 +812,9 @@ mfi_build_mpt_pass_thru(struct mfi_softc *sc, struct mfi_command *mfi_cmd)
 	struct mfi_mpi2_request_raid_scsi_io *io_req;
 	struct mfi_cmd_tbolt *cmd;
 
-	cmd = mfi_tbolt_get_cmd(sc);
+	cmd = mfi_tbolt_get_cmd(sc, mfi_cmd);
 	if (!cmd)
 		return EBUSY;
-	mfi_cmd->cm_extra_frames = cmd->index; /* Frame count used as SMID */
-	cmd->sync_cmd_idx = mfi_cmd->cm_index;
 	io_req = cmd->io_request;
 	mpi25_ieee_chain = (MPI25_IEEE_SGE_CHAIN64 *)&io_req->SGL.IeeeChain;
 
@@ -980,16 +1023,21 @@ mfi_build_and_issue_cmd(struct mfi_softc *sc, struct mfi_command *mfi_cmd)
 	struct mfi_cmd_tbolt *cmd;
 	union mfi_mpi2_request_descriptor *req_desc = NULL;
 	uint16_t index;
-	cmd = mfi_tbolt_get_cmd(sc);
-	if (!cmd)
-		return NULL;
-	mfi_cmd->cm_extra_frames = cmd->index;
-	cmd->sync_cmd_idx = mfi_cmd->cm_index;
+	cmd = mfi_tbolt_get_cmd(sc, mfi_cmd);
+	if (cmd == NULL)
+		return (NULL);
 
 	index = cmd->index;
 	req_desc = mfi_tbolt_get_request_descriptor(sc, index-1);
-	if (mfi_tbolt_build_io(sc, mfi_cmd, cmd))
-		return NULL;
+	if (req_desc == NULL) {
+		mfi_tbolt_return_cmd(sc, cmd, mfi_cmd);
+		return (NULL);
+	}
+
+	if (mfi_tbolt_build_io(sc, mfi_cmd, cmd) != 0) {
+		mfi_tbolt_return_cmd(sc, cmd, mfi_cmd);
+		return (NULL);
+	}
 	req_desc->header.SMID = index;
 	return req_desc;
 }
@@ -1008,7 +1056,7 @@ mfi_tbolt_build_mpt_cmd(struct mfi_softc *sc, struct mfi_command *cmd)
 	index = cmd->cm_extra_frames;
 
 	req_desc = mfi_tbolt_get_request_descriptor(sc, index - 1);
-	if (!req_desc)
+	if (req_desc == NULL)
 		return NULL;
 
 	bzero(req_desc, sizeof(*req_desc));
@@ -1024,7 +1072,7 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
 	struct mfi_frame_header *hdr;
 	uint8_t *cdb;
 	union mfi_mpi2_request_descriptor *req_desc = NULL;
-	int tm = MFI_POLL_TIMEOUT_SECS * 1000;
+	int tm = mfi_polled_cmd_timeout * 1000;
 
 	hdr = &cm->cm_frame->header;
 	cdb = cm->cm_frame->pass.cdb;
@@ -1058,9 +1106,8 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
 			return 1;
 		}
 	} else if ((req_desc = mfi_tbolt_build_mpt_cmd(sc, cm)) == NULL) {
-			device_printf(sc->mfi_dev, "Mapping from MFI to MPT "
-			    "Failed\n");
-			return 1;
+		device_printf(sc->mfi_dev, "Mapping from MFI to MPT Failed\n");
+		return (1);
 	}
 
 	if (cm->cm_flags & MFI_CMD_SCSI) {
@@ -1078,23 +1125,30 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
 	if ((cm->cm_flags & MFI_CMD_POLLED) == 0)
 		return 0;
 
-	if (cm->cm_flags & MFI_CMD_SCSI) {
-		/* check reply queue */
-		mfi_tbolt_complete_cmd(sc);
-	}
-
-	/* This is a polled command, so busy-wait for it to complete. */
+	/*
+	 * This is a polled command, so busy-wait for it to complete.
+	 *
+	 * The value of hdr->cmd_status is updated directly by the hardware
+	 * so there is no guarantee that mfi_tbolt_complete_cmd is called
+	 * prior to this value changing.
+	 */
 	while (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
 		DELAY(1000);
 		tm -= 1;
 		if (tm <= 0)
 			break;
 		if (cm->cm_flags & MFI_CMD_SCSI) {
-			/* check reply queue */
+			/*
+			 * Force check reply queue.
+			 * This ensures that dump works correctly
+			 */
 			mfi_tbolt_complete_cmd(sc);
 		}
 	}
 
+	/* ensure the command cleanup has been processed before returning */
+	mfi_tbolt_complete_cmd(sc);
+
 	if (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
 		device_printf(sc->mfi_dev, "Frame %p timed out "
 		    "command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
@@ -1104,9 +1158,10 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
 }
 
 static void
-mfi_issue_pending_cmds_again (struct mfi_softc *sc)
+mfi_issue_pending_cmds_again(struct mfi_softc *sc)
 {
 	struct mfi_command *cm, *tmp;
+	struct mfi_cmd_tbolt *cmd;
 
 	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
 	TAILQ_FOREACH_REVERSE_SAFE(cm, &sc->mfi_busy, BUSYQ, cm_link, tmp) {
@@ -1119,50 +1174,51 @@ mfi_issue_pending_cmds_again (struct mfi_softc *sc)
 		 * should be performed on the controller
 		 */
 		if (cm->retry_for_fw_reset == 3) {
-			device_printf(sc->mfi_dev, "megaraid_sas: command %d "
-			    "was tried multiple times during adapter reset"
-			    "Shutting down the HBA\n", cm->cm_index);
+			device_printf(sc->mfi_dev, "megaraid_sas: command %p "
+			    "index=%d was tried multiple times during adapter "
+			    "reset - Shutting down the HBA\n", cm, cm->cm_index);
 			mfi_kill_hba(sc);
 			sc->hw_crit_error = 1;
 			return;
 		}
 
-		if ((cm->cm_flags & MFI_ON_MFIQ_BUSY) != 0) {
-			struct mfi_cmd_tbolt *cmd;
-			mfi_remove_busy(cm);
-			cmd = sc->mfi_cmd_pool_tbolt[cm->cm_extra_frames -
-			    1 ];
-			mfi_tbolt_return_cmd(sc, cmd);
-			if ((cm->cm_flags & MFI_ON_MFIQ_MASK) == 0) {
-				if (cm->cm_frame->dcmd.opcode !=
-				    MFI_DCMD_CTRL_EVENT_WAIT) {
-					device_printf(sc->mfi_dev,
-					    "APJ ****requeue command %d \n",
-					    cm->cm_index);
-					mfi_requeue_ready(cm);
-				}
+		mfi_remove_busy(cm);
+		if ((cm->cm_flags & MFI_CMD_TBOLT) != 0) {
+			if (cm->cm_extra_frames != 0 && cm->cm_extra_frames <=
+			    sc->mfi_max_fw_cmds) {
+				cmd = sc->mfi_cmd_pool_tbolt[cm->cm_extra_frames - 1];
+				mfi_tbolt_return_cmd(sc, cmd, cm);
+			} else {
+				device_printf(sc->mfi_dev,
+				    "Invalid extra_frames: %d detected\n",
+				    cm->cm_extra_frames);
 			}
-			else
-				mfi_release_command(cm);
 		}
+
+		if (cm->cm_frame->dcmd.opcode != MFI_DCMD_CTRL_EVENT_WAIT) {
+			device_printf(sc->mfi_dev,
+			    "APJ ****requeue command %p index=%d\n",
+			    cm, cm->cm_index);
+			mfi_requeue_ready(cm);
+		} else
+			mfi_release_command(cm);
 	}
 	mfi_startio(sc);
 }
 
 static void
-mfi_kill_hba (struct mfi_softc *sc)
+mfi_kill_hba(struct mfi_softc *sc)
 {
 	if (sc->mfi_flags & MFI_FLAGS_TBOLT)
-		MFI_WRITE4 (sc, 0x00,MFI_STOP_ADP);
+		MFI_WRITE4(sc, 0x00, MFI_STOP_ADP);
 	else
-		MFI_WRITE4 (sc, MFI_IDB,MFI_STOP_ADP);
+		MFI_WRITE4(sc, MFI_IDB, MFI_STOP_ADP);
 }
 
 static void
 mfi_process_fw_state_chg_isr(void *arg)
 {
 	struct mfi_softc *sc= (struct mfi_softc *)arg;
-	struct mfi_cmd_tbolt *cmd;
 	int error, status;
 
 	if (sc->adpreset == 1) {
@@ -1191,26 +1247,32 @@ mfi_process_fw_state_chg_isr(void *arg)
 			device_printf(sc->mfi_dev, "controller is not in "
 			    "ready state\n");
 			mfi_kill_hba(sc);
-			sc->hw_crit_error= 1;
-			return ;
+			sc->hw_crit_error = 1;
+			return;
+		}
+		if ((error = mfi_tbolt_init_MFI_queue(sc)) != 0) {
+			device_printf(sc->mfi_dev, "Failed to initialise MFI "
+			    "queue\n");
+			mfi_kill_hba(sc);
+			sc->hw_crit_error = 1;
+			return;
 		}
-		if ((error = mfi_tbolt_init_MFI_queue(sc)) != 0)
-				return;
 
-		mtx_lock(&sc->mfi_io_lock);
+		/* Init last reply index and max */
+		MFI_WRITE4(sc, MFI_RFPI, sc->mfi_max_fw_cmds - 1);
+		MFI_WRITE4(sc, MFI_RPI, sc->last_reply_idx);
 
 		sc->mfi_enable_intr(sc);
 		sc->adpreset = 0;
-		free(sc->mfi_aen_cm->cm_data, M_MFIBUF);
-		mfi_remove_busy(sc->mfi_aen_cm);
-		cmd = sc->mfi_cmd_pool_tbolt[sc->mfi_aen_cm->cm_extra_frames
-		    - 1];
-		mfi_tbolt_return_cmd(sc, cmd);
-		if (sc->mfi_aen_cm) {
+		if (sc->mfi_aen_cm != NULL) {
+			free(sc->mfi_aen_cm->cm_data, M_MFIBUF);
+			mfi_remove_busy(sc->mfi_aen_cm);
 			mfi_release_command(sc->mfi_aen_cm);
 			sc->mfi_aen_cm = NULL;
 		}
-		if (sc->mfi_map_sync_cm) {
+
+		if (sc->mfi_map_sync_cm != NULL) {
+			mfi_remove_busy(sc->mfi_map_sync_cm);
 			mfi_release_command(sc->mfi_map_sync_cm);
 			sc->mfi_map_sync_cm = NULL;
 		}
@@ -1223,9 +1285,12 @@ mfi_process_fw_state_chg_isr(void *arg)
 		 */
 		if (!sc->hw_crit_error) {
 			/*
-			 * Initiate AEN (Asynchronous Event Notification)
+			 * Initiate AEN (Asynchronous Event Notification) &
+			 * Sync Map
 			 */
 			mfi_aen_setup(sc, sc->last_seq_num);
+			mfi_tbolt_sync_map_info(sc);
+
 			sc->issuepend_done = 1;
 			device_printf(sc->mfi_dev, "second stage of reset "
 			    "complete, FW is ready now.\n");
@@ -1237,7 +1302,6 @@ mfi_process_fw_state_chg_isr(void *arg)
 		device_printf(sc->mfi_dev, "mfi_process_fw_state_chg_isr "
 		    "called with unhandled value:%d\n", sc->adpreset);
 	}
-	mtx_unlock(&sc->mfi_io_lock);
 }
 
 /*
@@ -1276,25 +1340,27 @@ void
 mfi_tbolt_sync_map_info(struct mfi_softc *sc)
 {
 	int error = 0, i;
-	struct mfi_command *cmd;
-	struct mfi_dcmd_frame *dcmd;
+	struct mfi_command *cmd = NULL;
+	struct mfi_dcmd_frame *dcmd = NULL;
 	uint32_t context = 0;
-	union mfi_ld_ref *ld_sync;
+	union mfi_ld_ref *ld_sync = NULL;
 	size_t ld_size;
 	struct mfi_frame_header *hdr;
 	struct mfi_command *cm = NULL;
 	struct mfi_ld_list *list = NULL;
 
+	mtx_assert(&sc->mfi_io_lock, MA_OWNED);
+
 	if (sc->mfi_map_sync_cm != NULL || sc->cm_map_abort)
 		return;
 
-	mtx_lock(&sc->mfi_io_lock);
 	error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_LIST,
 	    (void **)&list, sizeof(*list));
 	if (error)
 		goto out;
 
 	cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAIN;
+
 	if (mfi_wait_command(sc, cm) != 0) {
 		device_printf(sc->mfi_dev, "Failed to get device listing\n");
 		goto out;
@@ -1308,18 +1374,15 @@ mfi_tbolt_sync_map_info(struct mfi_softc *sc)
 	}
 
 	ld_size = sizeof(*ld_sync) * list->ld_count;
-	mtx_unlock(&sc->mfi_io_lock);
 	ld_sync = (union mfi_ld_ref *) malloc(ld_size, M_MFIBUF,
-	     M_WAITOK | M_ZERO);
+	     M_NOWAIT | M_ZERO);
 	if (ld_sync == NULL) {
 		device_printf(sc->mfi_dev, "Failed to allocate sync\n");
 		goto out;
 	}
-	for (i = 0; i < list->ld_count; i++) {
+	for (i = 0; i < list->ld_count; i++)
 		ld_sync[i].ref = list->ld_list[i].ld.ref;
-	}
 
-	mtx_lock(&sc->mfi_io_lock);
 	if ((cmd = mfi_dequeue_free(sc)) == NULL) {
 		device_printf(sc->mfi_dev, "Failed to get command\n");
 		free(ld_sync, M_MFIBUF);
@@ -1355,7 +1418,7 @@ mfi_tbolt_sync_map_info(struct mfi_softc *sc)
 		device_printf(sc->mfi_dev, "failed to send map sync\n");
 		free(ld_sync, M_MFIBUF);
 		sc->mfi_map_sync_cm = NULL;
-		mfi_requeue_ready(cmd);
+		mfi_release_command(cmd);
 		goto out;
 	}
 
@@ -1364,7 +1427,6 @@ out:
 		free(list, M_MFIBUF);
 	if (cm)
 		mfi_release_command(cm);
-	mtx_unlock(&sc->mfi_io_lock);
 }
 
 static void
@@ -1389,14 +1451,13 @@ mfi_sync_map_complete(struct mfi_command *cm)
 	}
 
 	free(cm->cm_data, M_MFIBUF);
-	sc->mfi_map_sync_cm = NULL;
 	wakeup(&sc->mfi_map_sync_cm);
+	sc->mfi_map_sync_cm = NULL;
 	mfi_release_command(cm);
 
 	/* set it up again so the driver can catch more events */
-	if (!aborted) {
+	if (!aborted)
 		mfi_queue_map_sync(sc);
-	}
 }
 
 static void
@@ -1412,5 +1473,7 @@ mfi_handle_map_sync(void *context, int pending)
 	struct mfi_softc *sc;
 
 	sc = context;
+	mtx_lock(&sc->mfi_io_lock);
 	mfi_tbolt_sync_map_info(sc);
+	mtx_unlock(&sc->mfi_io_lock);
 }
diff --git a/sys/dev/mfi/mfireg.h b/sys/dev/mfi/mfireg.h
index dab9cf7..52ddafe 100644
--- a/sys/dev/mfi/mfireg.h
+++ b/sys/dev/mfi/mfireg.h
@@ -86,6 +86,7 @@ __FBSDID("$FreeBSD$");
 *  ThunderBolt specific Register
 */
 
+#define MFI_RFPI	0x48 		/* reply_free_post_host_index */
 #define MFI_RPI		0x6c 		/* reply_post_host_index */
 #define MFI_ILQP 	0xc0		/* inbound_low_queue_port */
 #define MFI_IHQP 	0xc4		/* inbound_high_queue_port */
@@ -259,6 +260,13 @@ typedef enum {
 #define MFI_FRAME_DIR_READ			0x0010
 #define MFI_FRAME_DIR_BOTH			0x0018
 #define MFI_FRAME_IEEE_SGL			0x0020
+#define MFI_FRAME_FMT "\20" /* MFI_FRAME_* bit names, presumably for %b */ \
+    "\1NOPOST" \
+    "\2SGL64" \
+    "\3SENSE64" \
+    "\4WRITE" \
+    "\5READ" \
+    "\6IEEESGL"
 
 /* ThunderBolt Specific */
 
@@ -456,8 +464,8 @@ typedef enum {
 #define MFI_FRAME_SIZE		64
 #define MFI_MBOX_SIZE		12
 
-/* Firmware flashing can take 40s */
-#define MFI_POLL_TIMEOUT_SECS	50
+/* Firmware flashing can take 50+ seconds */
+#define MFI_POLL_TIMEOUT_SECS	60
 
 /* Allow for speedier math calculations */
 #define MFI_SECTOR_LEN		512
diff --git a/sys/dev/mfi/mfivar.h b/sys/dev/mfi/mfivar.h
index bb2a324..664ede9 100644
--- a/sys/dev/mfi/mfivar.h
+++ b/sys/dev/mfi/mfivar.h
@@ -102,12 +102,25 @@ struct mfi_command {
 #define MFI_CMD_DATAOUT		(1<<2)
 #define MFI_CMD_COMPLETED	(1<<3)
 #define MFI_CMD_POLLED		(1<<4)
-#define MFI_ON_MFIQ_FREE	(1<<5)
-#define MFI_ON_MFIQ_READY	(1<<6)
-#define MFI_ON_MFIQ_BUSY	(1<<7)
-#define MFI_ON_MFIQ_MASK	((1<<5)|(1<<6)|(1<<7))
-#define MFI_CMD_SCSI		(1<<8)
-#define MFI_CMD_CCB		(1<<9)
+#define MFI_CMD_SCSI		(1<<5)
+#define MFI_CMD_CCB		(1<<6)
+#define MFI_CMD_TBOLT		(1<<7)
+#define MFI_ON_MFIQ_FREE	(1<<8)
+#define MFI_ON_MFIQ_READY	(1<<9)
+#define MFI_ON_MFIQ_BUSY	(1<<10)
+#define MFI_ON_MFIQ_MASK	(MFI_ON_MFIQ_FREE | MFI_ON_MFIQ_READY| \
+    MFI_ON_MFIQ_BUSY)
+#define MFI_CMD_FLAGS_FMT	"\20" /* "\N" (octal) names bit 1<<(N-1) */ \
+    "\1MAPPED" \
+    "\2DATAIN" \
+    "\3DATAOUT" \
+    "\4COMPLETED" \
+    "\5POLLED" \
+    "\6SCSI" \
+    "\7CCB" \
+    "\10TBOLT" \
+    "\11Q_FREE" \
+    "\12Q_READY" "\13Q_BUSY"
 	uint8_t			retry_for_fw_reset;
 	void			(* cm_complete)(struct mfi_command *cm);
 	void			*cm_private;
@@ -268,10 +281,6 @@ struct mfi_softc {
 	 */
 	struct mfi_command		*mfi_commands;
 	/*
-	 * How many commands were actually allocated
-	 */
-	int				mfi_total_cmds;
-	/*
 	 * How many commands the firmware can handle.  Also how big the reply
 	 * queue is, minus 1.
 	 */
@@ -470,9 +479,8 @@ extern int mfi_build_cdb(int, uint8_t, u_int64_t, u_int32_t, uint8_t *);
 	mfi_enqueue_ ## name (struct mfi_command *cm)			\
 	{								\
 		if ((cm->cm_flags & MFI_ON_MFIQ_MASK) != 0) {		\
-			printf("command %p is on another queue, "	\
+			panic("command %p is on another queue, "	\
 			    "flags = %#x\n", cm, cm->cm_flags);		\
-			panic("command is on another queue");		\
 		}							\
 		TAILQ_INSERT_TAIL(&cm->cm_sc->mfi_ ## name, cm, cm_link); \
 		cm->cm_flags |= MFI_ON_ ## index;			\
@@ -482,9 +490,8 @@ extern int mfi_build_cdb(int, uint8_t, u_int64_t, u_int32_t, uint8_t *);
 	mfi_requeue_ ## name (struct mfi_command *cm)			\
 	{								\
 		if ((cm->cm_flags & MFI_ON_MFIQ_MASK) != 0) {		\
-			printf("command %p is on another queue, "	\
+			panic("command %p is on another queue, "	\
 			    "flags = %#x\n", cm, cm->cm_flags);		\
-			panic("command is on another queue");		\
 		}							\
 		TAILQ_INSERT_HEAD(&cm->cm_sc->mfi_ ## name, cm, cm_link); \
 		cm->cm_flags |= MFI_ON_ ## index;			\
@@ -497,10 +504,9 @@ extern int mfi_build_cdb(int, uint8_t, u_int64_t, u_int32_t, uint8_t *);
 									\
 		if ((cm = TAILQ_FIRST(&sc->mfi_ ## name)) != NULL) {	\
 			if ((cm->cm_flags & MFI_ON_ ## index) == 0) {	\
-				printf("command %p not in queue, "	\
+				panic("command %p not in queue, "	\
 				    "flags = %#x, bit = %#x\n", cm,	\
 				    cm->cm_flags, MFI_ON_ ## index);	\
-				panic("command not in queue");		\
 			}						\
 			TAILQ_REMOVE(&sc->mfi_ ## name, cm, cm_link);	\
 			cm->cm_flags &= ~MFI_ON_ ## index;		\
@@ -512,10 +518,9 @@ extern int mfi_build_cdb(int, uint8_t, u_int64_t, u_int32_t, uint8_t *);
 	mfi_remove_ ## name (struct mfi_command *cm)			\
 	{								\
 		if ((cm->cm_flags & MFI_ON_ ## index) == 0) {		\
-			printf("command %p not in queue, flags = %#x, " \
+			panic("command %p not in queue, flags = %#x, " \
 			    "bit = %#x\n", cm, cm->cm_flags,		\
 			    MFI_ON_ ## index);				\
-			panic("command not in queue");			\
 		}							\
 		TAILQ_REMOVE(&cm->cm_sc->mfi_ ## name, cm, cm_link);	\
 		cm->cm_flags &= ~MFI_ON_ ## index;			\
@@ -608,7 +613,8 @@ SYSCTL_DECL(_hw_mfi);
 #ifdef MFI_DEBUG
 extern void mfi_print_cmd(struct mfi_command *cm);
 extern void mfi_dump_cmds(struct mfi_softc *sc);
-extern void mfi_validate_sg(struct mfi_softc *, struct mfi_command *, const char *, int );
+extern void mfi_validate_sg(struct mfi_softc *, struct mfi_command *,
+    const char *, int);
 #define MFI_PRINT_CMD(cm)	mfi_print_cmd(cm)
 #define MFI_DUMP_CMDS(sc)	mfi_dump_cmds(sc)
 #define MFI_VALIDATE_CMD(sc, cm) mfi_validate_sg(sc, cm, __FUNCTION__, __LINE__)
@@ -618,6 +624,8 @@ extern void mfi_validate_sg(struct mfi_softc *, struct mfi_command *, const char
 #define MFI_VALIDATE_CMD(sc, cm)
 #endif
 
-extern void mfi_release_command(struct mfi_command *cm);
+extern void mfi_release_command(struct mfi_command *);
+extern void mfi_tbolt_return_cmd(struct mfi_softc *,
+    struct mfi_cmd_tbolt *, struct mfi_command *);
 
 #endif /* _MFIVAR_H */
diff --git a/sys/dev/msk/if_msk.c b/sys/dev/msk/if_msk.c
index d0ca808..664575c 100644
--- a/sys/dev/msk/if_msk.c
+++ b/sys/dev/msk/if_msk.c
@@ -1695,6 +1695,12 @@ msk_attach(device_t dev)
 			ifp->if_capabilities |= IFCAP_VLAN_HWCSUM;
 	}
 	ifp->if_capenable = ifp->if_capabilities;
+	/*
+	 * Disable RX checksum offloading by default on controllers that
+	 * don't use the new descriptor format; it can still be enabled.
+	 */
+	if ((sc_if->msk_flags & MSK_FLAG_DESCV2) == 0)
+		ifp->if_capenable &= ~IFCAP_RXCSUM;
 
 	/*
 	 * Tell the upper layer(s) we support long frames.
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 66da0d0..6d110ab 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1269,6 +1269,15 @@ brelse(struct buf *bp)
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
+	if (BUF_LOCKRECURSED(bp)) {
+		/*
+		 * Do not process, in particular, do not handle the
+		 * B_INVAL/B_RELBUF and do not release to free list.
+		 */
+		BUF_UNLOCK(bp);
+		return;
+	}
+
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
@@ -1445,12 +1454,6 @@ brelse(struct buf *bp)
 			brelvp(bp);
 	}
 			
-	if (BUF_LOCKRECURSED(bp)) {
-		/* do not release to free list */
-		BUF_UNLOCK(bp);
-		return;
-	}
-
 	/* enqueue */
 	mtx_lock(&bqlock);
 	/* Handle delayed bremfree() processing. */
@@ -2682,6 +2685,9 @@ loop:
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
+		/* If recursed, assume caller knows the rules. */
+		else if (BUF_LOCKRECURSED(bp))
+			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is 
@@ -2865,6 +2871,7 @@ loop:
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
+end:
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index b54dc04..0696edd 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -385,6 +385,7 @@ extern int		vttoif_tab[];
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
+#define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index abe4073..789a7cf 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1790,6 +1790,17 @@ fail:
 	return (0);
 }
 
+static inline struct buf *
+getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
+{
+	struct fs *fs = ip->i_fs;
+
+	/* getblk() the inode block at inode offset cginoblk of group cg. */
+	return (getblk(ip->i_devvp,
+	    fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)),
+	    (int)fs->fs_bsize, 0, 0, gbflags));
+}
+
 /*
  * Determine whether an inode can be allocated.
  *
@@ -1814,9 +1825,11 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
 	u_int8_t *inosused, *loc;
 	struct ufs2_dinode *dp2;
 	int error, start, len, i;
+	u_int32_t old_initediblk;
 
 	fs = ip->i_fs;
 	ump = ip->i_ump;
+check_nifree:
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
@@ -1828,13 +1841,13 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
 		return (0);
 	}
 	cgp = (struct cg *)bp->b_data;
+restart:
 	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	bp->b_xflags |= BX_BKGRDWRITE;
-	cgp->cg_old_time = cgp->cg_time = time_second;
 	inosused = cg_inosused(cgp);
 	if (ipref) {
 		ipref %= fs->fs_ipg;
@@ -1856,7 +1869,6 @@ ffs_nodealloccg(ip, cg, ipref, mode, unused)
 		}
 	}
 	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
-	cgp->cg_irotor = ipref;
 gotit:
 	/*
 	 * Check to see if we need to initialize more inodes.
@@ -1864,9 +1876,37 @@ gotit:
 	if (fs->fs_magic == FS_UFS2_MAGIC &&
 	    ipref + INOPB(fs) > cgp->cg_initediblk &&
 	    cgp->cg_initediblk < cgp->cg_niblk) {
-		ibp = getblk(ip->i_devvp, fsbtodb(fs,
-		    ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
-		    (int)fs->fs_bsize, 0, 0, 0);
+		old_initediblk = cgp->cg_initediblk;
+
+		/*
+		 * Free the cylinder group lock before writing the
+		 * initialized inode block.  Entering
+		 * babarrierwrite() with the cylinder group lock held
+		 * causes a lock order violation between that lock and
+		 * snaplk.
+		 *
+		 * Another thread can decide to initialize the same
+		 * inode block, but whichever thread first gets the
+		 * cylinder group lock after writing the newly
+		 * allocated inode block will update it and the other
+		 * will realize that it has lost and leave the
+		 * cylinder group unchanged.
+		 */
+		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
+		brelse(bp);
+		if (ibp == NULL) {
+			/*
+			 * The inode block buffer is already owned by
+			 * another thread, which must initialize it.
+			 * Wait on the buffer to allow another thread
+			 * to finish the updates, with dropped cg
+			 * buffer lock, then retry.
+			 */
+			ibp = getinobuf(ip, cg, old_initediblk, 0);
+			brelse(ibp);
+			UFS_LOCK(ump);
+			goto check_nifree;
+		}
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
@@ -1883,8 +1923,29 @@ gotit:
 		 * loading of newly created filesystems.
 		 */
 		babarrierwrite(ibp);
-		cgp->cg_initediblk += INOPB(fs);
+
+		/*
+		 * After the inode block is written, try to update the
+		 * cg initediblk pointer.  If another thread beat us
+		 * to it, then leave it unchanged as the other thread
+		 * has already set it correctly.
+		 */
+		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+		    (int)fs->fs_cgsize, NOCRED, &bp);
+		UFS_LOCK(ump);
+		ACTIVECLEAR(fs, cg);
+		UFS_UNLOCK(ump);
+		if (error != 0) {
+			brelse(bp);
+			return (error);
+		}
+		cgp = (struct cg *)bp->b_data;
+		if (cgp->cg_initediblk == old_initediblk)
+			cgp->cg_initediblk += INOPB(fs);
+		goto restart;
 	}
+	cgp->cg_old_time = cgp->cg_time = time_second;
+	cgp->cg_irotor = ipref;
 	UFS_LOCK(ump);
 	ACTIVECLEAR(fs, cg);
 	setbit(inosused, ipref);
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 16fe134..e39fd46 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1908,7 +1908,12 @@ softdep_flushfiles(oldmnt, flags, td)
 	int flags;
 	struct thread *td;
 {
-	int error, depcount, loopcnt, retry_flush_count, retry;
+#ifdef QUOTA
+	struct ufsmount *ump;
+	int i;
+#endif
+	int error, early, depcount, loopcnt, retry_flush_count, retry;
+	int morework;
 
 	loopcnt = 10;
 	retry_flush_count = 3;
@@ -1926,7 +1931,9 @@ retry_flush:
 		 * Do another flush in case any vnodes were brought in
 		 * as part of the cleanup operations.
 		 */
-		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
+		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
+		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
+		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
 			break;
 		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
 		    depcount == 0)
@@ -1950,7 +1957,17 @@ retry_flush:
 			MNT_ILOCK(oldmnt);
 			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
 			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
-			if (oldmnt->mnt_nvnodelistsize > 0) {
+			morework = oldmnt->mnt_nvnodelistsize > 0;
+#ifdef QUOTA
+			ump = VFSTOUFS(oldmnt);
+			UFS_LOCK(ump);
+			for (i = 0; i < MAXQUOTAS; i++) {
+				if (ump->um_quotas[i] != NULLVP)
+					morework = 1;
+			}
+			UFS_UNLOCK(ump);
+#endif
+			if (morework) {
 				if (--retry_flush_count > 0) {
 					retry = 1;
 					loopcnt = 3;
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 0204613..b3292d0 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1351,9 +1351,10 @@ ffs_flushfiles(mp, flags, td)
 	struct thread *td;
 {
 	struct ufsmount *ump;
-	int error;
+	int qerror, error;
 
 	ump = VFSTOUFS(mp);
+	qerror = 0;
 #ifdef QUOTA
 	if (mp->mnt_flag & MNT_QUOTA) {
 		int i;
@@ -1361,11 +1362,19 @@ ffs_flushfiles(mp, flags, td)
 		if (error)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
-			quotaoff(td, mp, i);
+			error = quotaoff(td, mp, i);
+			if (error != 0) {
+				if ((flags & EARLYFLUSH) == 0)
+					return (error);
+				else
+					qerror = error;
+			}
 		}
+
 		/*
-		 * Here we fall through to vflush again to ensure
-		 * that we have gotten rid of all the system vnodes.
+		 * Here we fall through to vflush again to ensure that
+		 * we have gotten rid of all the system vnodes, unless
+		 * quotas must not be closed.
 		 */
 	}
 #endif
@@ -1380,11 +1389,21 @@ ffs_flushfiles(mp, flags, td)
 		 * that we have gotten rid of all the system vnodes.
 		 */
 	}
-        /*
-	 * Flush all the files.
+
+	/*
+	 * Do not close system files if quotas were not closed, to be
+	 * able to sync the remaining dquots.  The freeblks softupdate
+	 * workitems might hold a reference on a dquot, preventing
+	 * quotaoff() from completing.  Next round of
+	 * softdep_flushworklist() iteration should process the
+	 * blockers, allowing the next run of quotaoff() to finally
+	 * flush held dquots.
+	 *
+	 * Otherwise, flush all the files.
 	 */
-	if ((error = vflush(mp, 0, flags, td)) != 0)
+	if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0)
 		return (error);
+
 	/*
 	 * Flush filesystem metadata.
 	 */
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
index 87ac9a1..a949898 100644
--- a/sys/ufs/ufs/ufs_quota.c
+++ b/sys/ufs/ufs/ufs_quota.c
@@ -80,7 +80,7 @@ static int dqopen(struct vnode *, struct ufsmount *, int);
 static int dqget(struct vnode *,
 	u_long, struct ufsmount *, int, struct dquot **);
 static int dqsync(struct vnode *, struct dquot *);
-static void dqflush(struct vnode *);
+static int dqflush(struct vnode *);
 static int quotaoff1(struct thread *td, struct mount *mp, int type);
 static int quotaoff_inchange(struct thread *td, struct mount *mp, int type);
 
@@ -674,8 +674,12 @@ again:
 		vrele(vp);
 	}
 
-	dqflush(qvp);
-	/* Clear um_quotas before closing the quota vnode to prevent
+	error = dqflush(qvp);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Clear um_quotas before closing the quota vnode to prevent
 	 * access to the closed vnode from dqget/dqsync
 	 */
 	UFS_LOCK(ump);
@@ -1594,17 +1598,19 @@ out:
 /*
  * Flush all entries from the cache for a particular vnode.
  */
-static void
+static int
 dqflush(struct vnode *vp)
 {
 	struct dquot *dq, *nextdq;
 	struct dqhash *dqh;
+	int error;
 
 	/*
 	 * Move all dquot's that used to refer to this quota
 	 * file off their hash chains (they will eventually
 	 * fall off the head of the free list and be re-used).
 	 */
+	error = 0;
 	DQH_LOCK();
 	for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) {
 		for (dq = LIST_FIRST(dqh); dq; dq = nextdq) {
@@ -1612,12 +1618,15 @@ dqflush(struct vnode *vp)
 			if (dq->dq_ump->um_quotas[dq->dq_type] != vp)
 				continue;
 			if (dq->dq_cnt)
-				panic("dqflush: stray dquot");
-			LIST_REMOVE(dq, dq_hash);
-			dq->dq_ump = (struct ufsmount *)0;
+				error = EBUSY;
+			else {
+				LIST_REMOVE(dq, dq_hash);
+				dq->dq_ump = NULL;
+			}
 		}
 	}
 	DQH_UNLOCK();
+	return (error);
 }
 
 /*
author	attilio <attilio@FreeBSD.org>	2013-02-27 18:17:34 +0000
committer	attilio <attilio@FreeBSD.org>	2013-02-27 18:17:34 +0000
commit	52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf (patch)
tree	d0908474209a17865e044675940a2f62f9ff2493 /sys
parent	c74a3afc6a5d7d1ced989c36d4ba0a7d2bbc43b9 (diff)
download	FreeBSD-src-52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf.zip FreeBSD-src-52c57fbbdb554a7ce0cdbb6bf27051ef70834bdf.tar.gz