-rw-r--r--  sys/dev/isci/isci.h               | 32
-rw-r--r--  sys/dev/isci/isci_controller.c    | 47
-rw-r--r--  sys/dev/isci/isci_interrupt.c     | 10
-rw-r--r--  sys/dev/isci/isci_io_request.c    | 51
-rw-r--r--  sys/dev/isci/isci_remote_device.c | 22
5 files changed, 147 insertions(+), 15 deletions(-)
diff --git a/sys/dev/isci/isci.h b/sys/dev/isci/isci.h
index d6ad93c..cdba7de 100644
--- a/sys/dev/isci/isci.h
+++ b/sys/dev/isci/isci.h
@@ -30,6 +30,9 @@
  * $FreeBSD$
  */
 
+#ifndef _ISCI_H
+#define _ISCI_H
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -86,7 +89,31 @@ struct ISCI_REMOTE_DEVICE {
         BOOL is_resetting;
         uint32_t frozen_lun_mask;
         SCI_FAST_LIST_ELEMENT_T pending_device_reset_element;
+
+        /*
+         * This queue maintains CCBs that have been returned with
+         * SCI_IO_FAILURE_INVALID_STATE from the SCI layer.  These CCBs
+         * need to be retried, but we cannot return CAM_REQUEUE_REQ because
+         * this status gets passed all the way back up to users of the pass(4)
+         * interface and breaks things like smartctl.  So instead, we queue
+         * these CCBs internally.
+         */
         TAILQ_HEAD(,ccb_hdr) queued_ccbs;
+
+        /*
+         * Marker indicating that this remote device needs its first queued
+         * CCB to be retried.
+         */
+        BOOL release_queued_ccb;
+
+        /*
+         * Points to a CCB in the queue that is currently being processed by
+         * SCIL.  This allows us to keep in-flight CCBs in the queue so as to
+         * maintain ordering (i.e. in case we retry an I/O and then find out
+         * it needs to be retried again, it just keeps its same place in the
+         * queue).
+         */
+        union ccb *queued_ccb_in_progress;
 };
 
 struct ISCI_DOMAIN {
@@ -126,6 +153,7 @@ struct ISCI_CONTROLLER
         BOOL has_been_scanned;
         uint32_t initial_discovery_mask;
         BOOL is_frozen;
+        BOOL release_queued_ccbs;
         uint8_t *remote_device_memory;
         struct ISCI_MEMORY cached_controller_memory;
         struct ISCI_MEMORY uncached_controller_memory;
@@ -291,6 +319,8 @@ int isci_controller_attach_to_cam(struct ISCI_CONTROLLER *controller);
 
 void isci_controller_start(void *controller);
 
+void isci_controller_release_queued_ccbs(struct ISCI_CONTROLLER *controller);
+
 void isci_domain_construct(struct ISCI_DOMAIN *domain, uint32_t domain_index,
     struct ISCI_CONTROLLER *controller);
 
@@ -301,3 +331,5 @@ void isci_log_message(uint32_t verbosity, char *log_message_prefix,
     char *log_message, ...);
 
 extern uint32_t g_isci_debug_level;
+
+#endif /* #ifndef _ISCI_H */
diff --git a/sys/dev/isci/isci_controller.c b/sys/dev/isci/isci_controller.c
index a4b7cfe..49785cb 100644
--- a/sys/dev/isci/isci_controller.c
+++ b/sys/dev/isci/isci_controller.c
@@ -201,6 +201,7 @@ void isci_controller_construct(struct ISCI_CONTROLLER *controller,
 
         controller->is_started = FALSE;
         controller->is_frozen = FALSE;
+        controller->release_queued_ccbs = FALSE;
         controller->sim = NULL;
         controller->initial_discovery_mask = 0;
 
@@ -431,6 +432,8 @@ int isci_controller_allocate_memory(struct ISCI_CONTROLLER *controller)
                 sci_fast_list_element_init(remote_device,
                     &remote_device->pending_device_reset_element);
                 TAILQ_INIT(&remote_device->queued_ccbs);
+                remote_device->release_queued_ccb = FALSE;
+                remote_device->queued_ccb_in_progress = NULL;
 
                 /*
                  * For the first SCI_MAX_DOMAINS device objects, do not put
@@ -694,3 +697,47 @@ void isci_action(struct cam_sim *sim, union ccb *ccb)
         }
 }
 
+/*
+ * Unfortunately, SCIL doesn't cleanly handle retry conditions.
+ * CAM_REQUEUE_REQ works only when no one is using the pass(4) interface.  So
+ * when SCIL indicates an I/O needs to be retried (typically because of mixing
+ * tagged/non-tagged ATA commands, or running out of NCQ slots), we queue
+ * these I/Os internally.  Once SCIL completes an I/O to this device, or we
+ * get a ready notification, we will retry the first I/O on the queue.
+ * Unfortunately, SCIL also doesn't cleanly handle starting the new I/O within
+ * the context of the completion handler, so we need to retry these I/Os
+ * after the completion handler is done executing.
+ */
+void
+isci_controller_release_queued_ccbs(struct ISCI_CONTROLLER *controller)
+{
+        struct ISCI_REMOTE_DEVICE *dev;
+        struct ccb_hdr *ccb_h;
+        int dev_idx;
+
+        KASSERT(mtx_owned(&controller->lock), ("controller lock not owned"));
+
+        controller->release_queued_ccbs = FALSE;
+        for (dev_idx = 0;
+             dev_idx < SCI_MAX_REMOTE_DEVICES;
+             dev_idx++) {
+
+                dev = controller->remote_device[dev_idx];
+                if (dev != NULL &&
+                    dev->release_queued_ccb == TRUE &&
+                    dev->queued_ccb_in_progress == NULL) {
+                        dev->release_queued_ccb = FALSE;
+                        ccb_h = TAILQ_FIRST(&dev->queued_ccbs);
+
+                        if (ccb_h == NULL)
+                                continue;
+
+                        isci_log_message(1, "ISCI", "release %p %x\n", ccb_h,
+                            ((union ccb *)ccb_h)->csio.cdb_io.cdb_bytes[0]);
+
+                        dev->queued_ccb_in_progress = (union ccb *)ccb_h;
+                        isci_io_request_execute_scsi_io(
+                            (union ccb *)ccb_h, controller);
+                }
+        }
+}
diff --git a/sys/dev/isci/isci_interrupt.c b/sys/dev/isci/isci_interrupt.c
index e020f83..52c64f7 100644
--- a/sys/dev/isci/isci_interrupt.c
+++ b/sys/dev/isci/isci_interrupt.c
@@ -177,6 +177,9 @@ isci_interrupt_legacy_handler(void *arg)
         if (interrupt_handler(scic_controller_handle)) {
                 mtx_lock(&controller->lock);
                 completion_handler(scic_controller_handle);
+                if (controller->release_queued_ccbs == TRUE)
+                        isci_controller_release_queued_ccbs(
+                            controller);
                 mtx_unlock(&controller->lock);
         }
 }
@@ -204,6 +207,13 @@ isci_interrupt_msix_handler(void *arg)
         if (interrupt_handler(scic_controller_handle)) {
                 mtx_lock(&controller->lock);
                 completion_handler(scic_controller_handle);
+                /*
+                 * isci_controller_release_queued_ccbs() is a relatively
+                 * expensive routine, so we don't call it until the
+                 * controller-level flag is set to TRUE.
+                 */
+                if (controller->release_queued_ccbs == TRUE)
+                        isci_controller_release_queued_ccbs(controller);
                 mtx_unlock(&controller->lock);
         }
 }
diff --git a/sys/dev/isci/isci_io_request.c b/sys/dev/isci/isci_io_request.c
index 985a2e4..67ed1da 100644
--- a/sys/dev/isci/isci_io_request.c
+++ b/sys/dev/isci/isci_io_request.c
@@ -223,7 +223,7 @@ isci_io_request_complete(SCI_CONTROLLER_HANDLE_T scif_controller,
             (struct ISCI_REQUEST *)isci_request);
 
         if (complete_ccb) {
-                if (ccb->ccb_h.status != CAM_REQ_CMP) {
+                if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
                         /* ccb will be completed with some type of non-success
                          *  status.  So temporarily freeze the queue until the
                          *  upper layers can act on the status.  The
@@ -234,6 +234,26 @@ isci_io_request_complete(SCI_CONTROLLER_HANDLE_T scif_controller,
                         xpt_freeze_devq(ccb->ccb_h.path, 1);
                 }
 
+                if (ccb->ccb_h.status & CAM_SIM_QUEUED) {
+
+                        KASSERT(ccb == isci_remote_device->queued_ccb_in_progress,
+                            ("multiple internally queued ccbs in flight"));
+
+                        TAILQ_REMOVE(&isci_remote_device->queued_ccbs,
+                            &ccb->ccb_h, sim_links.tqe);
+                        ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
+
+                        /*
+                         * This CCB, which was in the queue, has completed, so
+                         * set the in_progress pointer to NULL, denoting that
+                         * we can retry another CCB from the queue.  We only
+                         * allow one CCB at a time from the queue to be
+                         * in progress so that we can effectively maintain
+                         * ordering.
+                         */
+                        isci_remote_device->queued_ccb_in_progress = NULL;
+                }
+
                 if (isci_remote_device->frozen_lun_mask != 0) {
                         isci_remote_device_release_device_queue(isci_remote_device);
                 }
@@ -248,11 +268,30 @@ isci_io_request_complete(SCI_CONTROLLER_HANDLE_T scif_controller,
                 isci_remote_device_freeze_lun_queue(isci_remote_device,
                     ccb->ccb_h.target_lun);
 
-                isci_log_message(1, "ISCI", "queue %p %x\n", ccb,
-                    ccb->csio.cdb_io.cdb_bytes[0]);
-                ccb->ccb_h.status |= CAM_SIM_QUEUED;
-                TAILQ_INSERT_TAIL(&isci_remote_device->queued_ccbs,
-                    &ccb->ccb_h, sim_links.tqe);
+                if (ccb->ccb_h.status & CAM_SIM_QUEUED) {
+
+                        KASSERT(ccb == isci_remote_device->queued_ccb_in_progress,
+                            ("multiple internally queued ccbs in flight"));
+
+                        /*
+                         * Do nothing; the CCB is already on the device's
+                         * queue.  We leave it there, to be retried again the
+                         * next time a CCB on this device completes, or we
+                         * get a ready notification for this device.
+                         */
+                        isci_log_message(1, "ISCI", "already queued %p %x\n",
+                            ccb, ccb->csio.cdb_io.cdb_bytes[0]);
+
+                        isci_remote_device->queued_ccb_in_progress = NULL;
+
+                } else {
+                        isci_log_message(1, "ISCI", "queue %p %x\n", ccb,
+                            ccb->csio.cdb_io.cdb_bytes[0]);
+                        ccb->ccb_h.status |= CAM_SIM_QUEUED;
+
+                        TAILQ_INSERT_TAIL(&isci_remote_device->queued_ccbs,
+                            &ccb->ccb_h, sim_links.tqe);
+                }
         }
 }
 
diff --git a/sys/dev/isci/isci_remote_device.c b/sys/dev/isci/isci_remote_device.c
index c9434f8..31fcbfb 100644
--- a/sys/dev/isci/isci_remote_device.c
+++ b/sys/dev/isci/isci_remote_device.c
@@ -297,14 +297,18 @@ isci_remote_device_release_device_queue(
                 for (lun = 0; lun < ISCI_MAX_LUN; lun++)
                         isci_remote_device_release_lun_queue(device, lun);
         } else {
-                struct ccb_hdr *ccb_h;
-
-                ccb_h = TAILQ_FIRST(&device->queued_ccbs);
-                TAILQ_REMOVE(&device->queued_ccbs, ccb_h, sim_links.tqe);
-                ccb_h->status &= ~CAM_SIM_QUEUED;
-                isci_log_message(1, "ISCI", "release %p %x\n", ccb_h,
-                    ((union ccb*)ccb_h)->csio.cdb_io.cdb_bytes[0]);
-                isci_io_request_execute_scsi_io((union ccb *)ccb_h,
-                    device->domain->controller);
+                /*
+                 * We cannot unfreeze the devq, because there are still
+                 * CCBs in our internal queue that need to be processed
+                 * first.  Mark this device, and the controller, so that
+                 * the first CCB in this device's internal queue will be
+                 * resubmitted after the current completion context
+                 * unwinds.
+                 */
+                device->release_queued_ccb = TRUE;
+                device->domain->controller->release_queued_ccbs = TRUE;
+
+                isci_log_message(1, "ISCI", "schedule %p for release\n",
+                    device);
         }
 }
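The core of the change is the per-device retry queue declared in isci.h and drained in isci_io_request.c: deferred CCBs sit on a TAILQ, and a single queued_ccb_in_progress pointer guarantees that only the head of the queue is ever resubmitted at a time, so retries cannot reorder. The standalone sketch below illustrates that pattern with sys/queue.h; struct retry_dev, defer_request() and the other names are illustrative stand-ins, not the driver's own types.

/*
 * Minimal sketch of a per-device retry queue with a single "in progress"
 * marker.  A deferred request stays on the queue until it really completes,
 * so a second retry keeps its original position.
 */
#include <sys/queue.h>
#include <stdio.h>

struct request {
        int id;
        TAILQ_ENTRY(request) link;
};

struct retry_dev {
        TAILQ_HEAD(, request) queued;   /* deferred requests, FIFO order */
        struct request *in_progress;    /* head request currently retried */
};

/* Queue a request that the lower layer refused in its current state. */
static void
defer_request(struct retry_dev *dev, struct request *req)
{
        TAILQ_INSERT_TAIL(&dev->queued, req, link);
}

/* Resubmit the head of the queue, but only one request at a time. */
static void
release_head(struct retry_dev *dev)
{
        struct request *req;

        if (dev->in_progress != NULL)
                return;                 /* keep ordering: one in flight */
        req = TAILQ_FIRST(&dev->queued);
        if (req == NULL)
                return;
        dev->in_progress = req;
        printf("resubmitting request %d\n", req->id);
}

/* Completion of the in-progress request: drop it and release the next. */
static void
complete_in_progress(struct retry_dev *dev)
{
        TAILQ_REMOVE(&dev->queued, dev->in_progress, link);
        dev->in_progress = NULL;
        release_head(dev);
}

int
main(void)
{
        struct retry_dev dev = { .in_progress = NULL };
        struct request a = { .id = 1 }, b = { .id = 2 };

        TAILQ_INIT(&dev.queued);
        defer_request(&dev, &a);
        defer_request(&dev, &b);
        release_head(&dev);             /* resubmits request 1 only */
        complete_in_progress(&dev);     /* request 1 done, resubmits 2 */
        complete_in_progress(&dev);     /* request 2 done, queue empty */
        return (0);
}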
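isci_remote_device_release_device_queue() no longer restarts I/O directly; it only marks the device and the controller, and the interrupt handlers call isci_controller_release_queued_ccbs() after the SCIL completion handler has returned, still under the controller lock. Below is a minimal sketch of that "flag in the completion path, resubmit afterwards" flow; the controller/device structs and function names are invented for illustration, not the driver's real ones.

/*
 * Sketch: the completion path may not start new I/O, so it only sets flags;
 * the interrupt path resubmits after the completion handler has unwound,
 * with the controller lock still held.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_DEVICES 4

struct device {
        bool present;
        bool release_queued;            /* head of this device's queue needs a retry */
};

struct controller {
        pthread_mutex_t lock;
        bool release_queued_ccbs;       /* set when any device is flagged below */
        struct device dev[MAX_DEVICES];
};

/* Runs inside the completion handler: flag only, never start new I/O. */
static void
schedule_release(struct controller *c, struct device *d)
{
        d->release_queued = true;
        c->release_queued_ccbs = true;
}

/* Runs after the completion handler returns, lock still held. */
static void
release_queued(struct controller *c)
{
        int i;

        c->release_queued_ccbs = false;
        for (i = 0; i < MAX_DEVICES; i++) {
                struct device *d = &c->dev[i];

                if (d->present && d->release_queued) {
                        d->release_queued = false;
                        printf("resubmit head of queue for device %d\n", i);
                }
        }
}

/* Interrupt path: completion handling first, deferred resubmission second. */
static void
interrupt_handler(struct controller *c)
{
        pthread_mutex_lock(&c->lock);
        schedule_release(c, &c->dev[1]);        /* stand-in for completion work */
        if (c->release_queued_ccbs)
                release_queued(c);              /* cheap flag gates the device scan */
        pthread_mutex_unlock(&c->lock);
}

int
main(void)
{
        struct controller c = { .lock = PTHREAD_MUTEX_INITIALIZER };

        c.dev[1].present = true;
        interrupt_handler(&c);
        return (0);
}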
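The one-line change in the first isci_io_request.c hunk matters because ccb_h.status carries flag bits (for example CAM_DEV_QFRZN or the CAM_SIM_QUEUED bit this patch sets) alongside the completion code, so a successful CCB with a flag set would no longer compare equal to CAM_REQ_CMP. The sketch below uses made-up constant values, not the real cam.h ones, to show why masking with CAM_STATUS_MASK first is required.

/* Sketch: a flag bit OR'd into the status word breaks an unmasked compare. */
#include <assert.h>
#include <stdint.h>

#define REQ_CMP         0x01u           /* completion code: success */
#define STATUS_MASK     0x3fu           /* low bits hold the completion code */
#define SIM_QUEUED      0x4000u         /* illustrative flag bit in the status word */

int
main(void)
{
        uint32_t status = REQ_CMP | SIM_QUEUED;

        /* Unmasked compare wrongly treats a flagged success as non-success. */
        assert(status != REQ_CMP);
        /* Masking first recovers the completion code, as the patch does. */
        assert((status & STATUS_MASK) == REQ_CMP);
        return (0);
}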