diff options
author | mav <mav@FreeBSD.org> | 2011-07-29 20:30:28 +0000 |
---|---|---|
committer | mav <mav@FreeBSD.org> | 2011-07-29 20:30:28 +0000 |
commit | 56747017c93fe2b157d540210ca731a4901ae11b (patch) | |
tree | 4669b162bed7e4820e270e521d620684df9b61c5 /sys/cam | |
parent | 3b6f1df0b0ea32496af31645202e47a2cde44943 (diff) | |
download | FreeBSD-src-56747017c93fe2b157d540210ca731a4901ae11b.zip FreeBSD-src-56747017c93fe2b157d540210ca731a4901ae11b.tar.gz |
In some cases failed SATA disks may report their presence but not
respond to any commands. I've found that because of multiple command
retries, each of which causes a 30s timeout, a bus reset and another retry
or requeue for many commands, it may take ages to eventually drop the
failed device. The odd thing is that those retries continue even after
XPT has considered the device dead and invalidated it.
This patch makes cam_periph_error() block any command retries after
the periph has been marked as invalid. With this patch all activity completes
in 1-2 minutes, right after the several timeouts required to consider the
device dead. This should make ZFS, gmirror, graid, etc. operation more robust.
Reviewed by: mjacob@ on scsi@
Approved by: re (kib)
Diffstat (limited to 'sys/cam')
-rw-r--r-- | sys/cam/cam_periph.c | 27 |
1 files changed, 19 insertions, 8 deletions
diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index f630772..67c2d01 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -1550,7 +1550,8 @@ camperiphscsisenseerror(union ccb *ccb, cam_flags camflags, * make sure we actually have retries available. */ if ((err_action & SSQ_DECREMENT_COUNT) != 0) { - if (ccb->ccb_h.retry_count > 0) + if (ccb->ccb_h.retry_count > 0 && + (periph->flags & CAM_PERIPH_INVALID) == 0) ccb->ccb_h.retry_count--; else { *action_string = "Retries exhausted"; @@ -1718,6 +1719,7 @@ int cam_periph_error(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags, union ccb *save_ccb) { + struct cam_periph *periph; const char *action_string; cam_status status; int frozen; @@ -1725,7 +1727,8 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, int openings; u_int32_t relsim_flags; u_int32_t timeout = 0; - + + periph = xpt_path_periph(ccb->ccb_h.path); action_string = NULL; status = ccb->ccb_h.status; frozen = (status & CAM_DEV_QFRZN) != 0; @@ -1787,9 +1790,9 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, xpt_print(ccb->ccb_h.path, "Data overrun\n"); printed++; } - error = EIO; /* we have to kill the command */ /* decrement the number of retries */ - if (ccb->ccb_h.retry_count > 0) { + if (ccb->ccb_h.retry_count > 0 && + (periph->flags & CAM_PERIPH_INVALID) == 0) { ccb->ccb_h.retry_count--; error = ERESTART; } else { @@ -1808,7 +1811,8 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, struct cam_path *newpath; if ((camflags & CAM_RETRY_SELTO) != 0) { - if (ccb->ccb_h.retry_count > 0) { + if (ccb->ccb_h.retry_count > 0 && + (periph->flags & CAM_PERIPH_INVALID) == 0) { ccb->ccb_h.retry_count--; error = ERESTART; @@ -1826,10 +1830,11 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, timeout = periph_selto_delay; break; } + action_string = "Retries exhausted"; } error = ENXIO; /* Should we do more if we can't create the path?? 
*/ - if (xpt_create_path(&newpath, xpt_path_periph(ccb->ccb_h.path), + if (xpt_create_path(&newpath, periph, xpt_path_path_id(ccb->ccb_h.path), xpt_path_target_id(ccb->ccb_h.path), CAM_LUN_WILDCARD) != CAM_REQ_CMP) @@ -1874,11 +1879,16 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, /* FALLTHROUGH */ case CAM_REQUEUE_REQ: /* Unconditional requeue */ - error = ERESTART; if (bootverbose && printed == 0) { xpt_print(ccb->ccb_h.path, "Request requeued\n"); printed++; } + if ((periph->flags & CAM_PERIPH_INVALID) == 0) + error = ERESTART; + else { + action_string = "Retries exhausted"; + error = EIO; + } break; case CAM_RESRC_UNAVAIL: /* Wait a bit for the resource shortage to abate. */ @@ -1893,7 +1903,8 @@ cam_periph_error(union ccb *ccb, cam_flags camflags, /* FALLTHROUGH */ default: /* decrement the number of retries */ - if (ccb->ccb_h.retry_count > 0) { + if (ccb->ccb_h.retry_count > 0 && + (periph->flags & CAM_PERIPH_INVALID) == 0) { ccb->ccb_h.retry_count--; error = ERESTART; if (bootverbose && printed == 0) { |