diff options
author | mm <mm@FreeBSD.org> | 2012-11-08 01:36:06 +0000 |
---|---|---|
committer | mm <mm@FreeBSD.org> | 2012-11-08 01:36:06 +0000 |
commit | cab0cc7d7a2d10ffa21890c1f9bcc26c6b74c540 (patch) | |
tree | 6eb986901f1506046b1fd88d8b7f81b7e6a14b67 | |
parent | fcf2566e0f79f9ece1a0b89b8e1c275a2d422c2d (diff) | |
download | FreeBSD-src-cab0cc7d7a2d10ffa21890c1f9bcc26c6b74c540.zip FreeBSD-src-cab0cc7d7a2d10ffa21890c1f9bcc26c6b74c540.tar.gz |
Update vendor/illumos/dist and vendor-sys/illumos/dist
to illumos-gate 13869:921a99998bb4
(illumos ZFS issue #3246 ZFS I/O deadman thread)
Note: illumos disables deadman logic by default on vmware and sparc
-rw-r--r-- | uts/common/fs/zfs/spa.c | 6 | ||||
-rw-r--r-- | uts/common/fs/zfs/spa_misc.c | 75 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/spa.h | 1 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/spa_boot.h | 6 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/spa_impl.h | 4 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/vdev.h | 1 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/vdev_impl.h | 2 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/zfs_context.h | 3 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/zfs_ioctl.h | 12 | ||||
-rw-r--r-- | uts/common/fs/zfs/sys/zio.h | 4 | ||||
-rw-r--r-- | uts/common/fs/zfs/vdev.c | 38 | ||||
-rw-r--r-- | uts/common/fs/zfs/vdev_queue.c | 14 | ||||
-rw-r--r-- | uts/common/fs/zfs/zio_inject.c | 65 |
13 files changed, 199 insertions, 32 deletions
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index d9cd70f..968fbd8 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -5983,6 +5983,10 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6111,6 +6115,8 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); + /* * Clear the dirty config list. */ diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 30681b6..a254c8d 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ #include <sys/zfs_context.h> #include <sys/spa_impl.h> +#include <sys/spa_boot.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> @@ -249,6 +250,26 @@ int zfs_flags = 0; */ int zfs_recover = 0; +extern int zfs_txg_synctime_ms; + +/* + * Expiration time in units of zfs_txg_synctime_ms. This value has two + * meanings. First it is used to determine when the spa_deadman logic + * should fire. By default the spa_deadman will fire if spa_sync has + * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). + * Secondly, the value determines if an I/O is considered "hung". + * Any I/O that has not completed in zfs_deadman_synctime is considered + * "hung" resulting in a system panic. + * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). + */ +uint64_t zfs_deadman_synctime = 1000ULL; + +/* + * Override the zfs deadman behavior via /etc/system. By default the + * deadman is enabled except on VMware and sparc deployments. + */ +int zfs_deadman_enabled = -1; + /* * ========================================================================== @@ -418,6 +439,23 @@ spa_lookup(const char *name) } /* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +} + +/* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. @@ -427,6 +465,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; + cyc_handler_t hdlr; + cyc_time_t when; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -458,6 +498,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + hdlr.cyh_func = spa_deadman; + hdlr.cyh_arg = spa; + hdlr.cyh_level = CY_LOW_LEVEL; + + spa->spa_deadman_synctime = zfs_deadman_synctime * + zfs_txg_synctime_ms * MICROSEC; + + /* + * This determines how often we need to check for hung I/Os after + * the cyclic has already fired. Since checking for hung I/Os is + * an expensive operation we don't want to check too frequently. + * Instead wait for 5 synctimes before checking again. + */ + when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; + when.cyt_when = CY_INFINITY; + mutex_enter(&cpu_lock); + spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -540,6 +599,12 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; + refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); @@ -1507,6 +1572,12 @@ spa_prev_software_version(spa_t *spa) } uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); @@ -1600,7 +1671,9 @@ spa_init(int mode) spa_mode_global = mode; -#ifndef _KERNEL +#ifdef _KERNEL + spa_arch_init(); +#else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h index 1043f40..172a9f1 100644 --- a/uts/common/fs/zfs/sys/spa.h +++ b/uts/common/fs/zfs/sys/spa.h @@ -604,6 +604,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature); diff --git a/uts/common/fs/zfs/sys/spa_boot.h b/uts/common/fs/zfs/sys/spa_boot.h index 1d3622f..8df5072 100644 --- a/uts/common/fs/zfs/sys/spa_boot.h +++ b/uts/common/fs/zfs/sys/spa_boot.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H @@ -35,6 +39,8 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); +extern void spa_arch_init(void); + #ifdef __cplusplus } #endif diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index 027832e..42ce555 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -227,6 +227,10 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + cyclic_id_t spa_deadman_cycid; /* cyclic id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + uint64_t spa_sync_starttime; /* starting time fo spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h index 7e34889..5a78366 100644 --- a/uts/common/fs/zfs/sys/vdev.h +++ b/uts/common/fs/zfs/sys/vdev.h @@ -79,6 +79,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h index 6d2e962..fcc2cbd 100644 --- a/uts/common/fs/zfs/sys/vdev_impl.h +++ b/uts/common/fs/zfs/sys/vdev_impl.h @@ -104,6 +104,8 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + uint64_t vq_io_complete_ts; + uint64_t vq_io_delta_ts; kmutex_t vq_lock; }; diff --git a/uts/common/fs/zfs/sys/zfs_context.h b/uts/common/fs/zfs/sys/zfs_context.h index fdd0412..0dc8d88 100644 --- a/uts/common/fs/zfs/sys/zfs_context.h +++ b/uts/common/fs/zfs/sys/zfs_context.h @@ -22,8 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -67,6 +69,7 @@ extern "C" { #include <sys/sysevent/dev.h> #include <sys/fm/util.h> #include <sys/sunddi.h> +#include <sys/cyclic.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h index 4d781ad..86e901b 100644 --- a/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -240,12 +240,24 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index 9e475b4..27ebe5e 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -21,8 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -403,6 +401,7 @@ struct zio { uint64_t io_offset; uint64_t io_deadline; + uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -548,6 +547,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); +extern uint64_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index fa0a579..18180ec 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -3153,3 +3153,41 @@ vdev_split(vdev_t *vd) } vdev_propagate_state(cvd); } + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_pending_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. + */ + fio = avl_first(&vq->vq_pending_tree); + delta = ddi_get_lbolt64() - fio->io_timestamp; + if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) { + zfs_dbgmsg("SLOW IO: zio timestamp %llu, " + "delta %llu, last io %llu", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + fm_panic("I/O to pool '%s' appears to be " + "hung.", spa_name(spa)); + } + } + mutex_exit(&vq->vq_lock); + } +} diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c index 5a0d3ee..2b06040 100644 --- a/uts/common/fs/zfs/vdev_queue.c +++ b/uts/common/fs/zfs/vdev_queue.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -288,6 +292,7 @@ again: zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -359,7 +364,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = ddi_get_lbolt64(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -384,10 +390,16 @@ vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); + mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); + vq->vq_io_complete_ts = ddi_get_lbolt64(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) diff --git a/uts/common/fs/zfs/zio_inject.c b/uts/common/fs/zfs/zio_inject.c index 9ae7d1f..a9d4ab4 100644 --- a/uts/common/fs/zfs/zio_inject.c +++ b/uts/common/fs/zfs/zio_inject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors and panic injection */ - if (handler->zi_record.zi_guid != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ @@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults or panic injection */ - if (handler->zi_record.zi_start == 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* @@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* - * Ignore label specific faults, panic injection - * or fake writes - */ - if (handler->zi_record.zi_start != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { @@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio) handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* @@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { @@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } +uint64_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *handler; + uint64_t seconds = 0; + + if (zio_injection_enabled == 0) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + seconds = handler->zi_record.zi_timer; + break; + } + + } + rw_exit(&inject_lock); + return (seconds); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, |