diff options
220 files changed, 14079 insertions, 9902 deletions
diff --git a/bin/chio/chio.1 b/bin/chio/chio.1 index 0efd27d..8a85354 100644 --- a/bin/chio/chio.1 +++ b/bin/chio/chio.1 @@ -302,13 +302,11 @@ first appeared in The .Nm program and SCSI changer driver were written by -.An Jason R. Thorpe Aq thorpej@and.com +.An Jason R. Thorpe Aq Mt thorpej@and.com for And Communications, .Pa http://www.and.com/ . .Pp Additional work by -.An Hans Huebner -.Aq hans@artcom.de +.An Hans Huebner Aq Mt hans@artcom.de and -.An Steve Gunn -.Aq csg@waterspout.com . +.An Steve Gunn Aq Mt csg@waterspout.com . diff --git a/bin/echo/echo.1 b/bin/echo/echo.1 index f1f3907..f268ccd 100644 --- a/bin/echo/echo.1 +++ b/bin/echo/echo.1 @@ -32,7 +32,7 @@ .\" @(#)echo.1 8.1 (Berkeley) 7/22/93 .\" $FreeBSD$ .\" -.Dd November 12, 2010 +.Dd October 5, 2016 .Dt ECHO 1 .Os .Sh NAME @@ -103,3 +103,8 @@ The utility conforms to .St -p1003.1-2001 as amended by Cor.\& 1-2002. +.Sh HISTORY +The +.Nm +command appeared in +.At v2 . diff --git a/bin/expr/expr.1 b/bin/expr/expr.1 index 34be0b8f..30a4549 100644 --- a/bin/expr/expr.1 +++ b/bin/expr/expr.1 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd February 25, 2012 +.Dd October 5, 2016 .Dt EXPR 1 .Os .Sh NAME @@ -309,3 +309,19 @@ these arguments are treated just as their respective string values. The .Fl e flag is an extension. +.Sh HISTORY +An +.Nm +utility first appeared in the Programmer's Workbench (PWB/UNIX). +A public domain version of +.Nm +written by +.An Pace Willisson Aq Mt pace@blitz.com +appeared in +.Bx 386 0.1 . +.Sh AUTHORS +Initial implementation by +.An Pace Willisson Aq Mt pace@blitz.com +was largely rewritten by +.An -nosplit +.An J.T. Conklin Aq Mt jtc@FreeBSD.org . 
diff --git a/bin/freebsd-version/freebsd-version.1 b/bin/freebsd-version/freebsd-version.1 index 1eea5bb..7e471bf 100644 --- a/bin/freebsd-version/freebsd-version.1 +++ b/bin/freebsd-version/freebsd-version.1 @@ -121,4 +121,4 @@ command appeared in The .Nm utility and this manual page were written by -.An Dag-Erling Sm\(/orgrav Aq des@FreeBSD.org . +.An Dag-Erling Sm\(/orgrav Aq Mt des@FreeBSD.org . diff --git a/bin/pkill/pkill.1 b/bin/pkill/pkill.1 index d5a2ea6..70b7912 100644 --- a/bin/pkill/pkill.1 +++ b/bin/pkill/pkill.1 @@ -291,5 +291,4 @@ Solaris 7. They made their first appearance in .Fx 5.3 . .Sh AUTHORS -.An Andrew Doran -.Aq ad@NetBSD.org +.An Andrew Doran Aq Mt ad@NetBSD.org diff --git a/bin/pwd/pwd.1 b/bin/pwd/pwd.1 index 2b687a5..967e40e 100644 --- a/bin/pwd/pwd.1 +++ b/bin/pwd/pwd.1 @@ -32,7 +32,7 @@ .\" @(#)pwd.1 8.2 (Berkeley) 4/28/95 .\" $FreeBSD$ .\" -.Dd April 12, 2003 +.Dd October 5, 2016 .Dt PWD 1 .Os .Sh NAME @@ -85,6 +85,11 @@ The .Nm utility conforms to .St -p1003.1-2001 . +.Sh HISTORY +The +.Nm +command appeared in +.At v5 . .Sh BUGS In .Xr csh 1 diff --git a/bin/setfacl/setfacl.1 b/bin/setfacl/setfacl.1 index cfb0f01..a310c64 100644 --- a/bin/setfacl/setfacl.1 +++ b/bin/setfacl/setfacl.1 @@ -487,6 +487,6 @@ NFSv4 ACL support was introduced in The .Nm utility was written by -.An Chris D. Faulhaber Aq jedgar@fxp.org . +.An Chris D. Faulhaber Aq Mt jedgar@fxp.org . NFSv4 ACL support was implemented by -.An Edward Tomasz Napierala Aq trasz@FreeBSD.org . +.An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org . diff --git a/bin/stty/stty.1 b/bin/stty/stty.1 index db6ea04..a8092ce 100644 --- a/bin/stty/stty.1 +++ b/bin/stty/stty.1 @@ -32,7 +32,7 @@ .\" @(#)stty.1 8.4 (Berkeley) 4/18/94 .\" $FreeBSD$ .\" -.Dd August 23, 2008 +.Dd October 5, 2016 .Dt STTY 1 .Os .Sh NAME @@ -601,3 +601,8 @@ and .Fl f are extensions to the standard. +.Sh HISTORY +A +.Nm +command appeared in +.At v2 . 
diff --git a/bin/test/test.1 b/bin/test/test.1 index 136ee57..9c56506 100644 --- a/bin/test/test.1 +++ b/bin/test/test.1 @@ -32,7 +32,7 @@ .\" @(#)test.1 8.1 (Berkeley) 5/31/93 .\" $FreeBSD$ .\" -.Dd June 1, 2013 +.Dd October 5, 2016 .Dt TEST 1 .Os .Sh NAME @@ -376,6 +376,11 @@ The primaries and .Fl O are extensions. +.Sh HISTORY +A +.Nm +utility appeared in +.At v7 . .Sh BUGS Both sides are always evaluated in .Fl a diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 641ab59..d3c4269 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -117,7 +117,7 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " + "Usage: %s [-CumMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] " "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n" " %s [-divPA] [-e -p path...] [-U config] dataset " "[object...]\n" @@ -178,12 +178,23 @@ usage(void) (void) fprintf(stderr, " -I <number of inflight I/Os> -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } +static void +dump_debug_buffer() +{ + if (dump_opt['G']) { + (void) printf("\n"); + zfs_dbgmsg_print("zdb"); + } +} + /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. @@ -200,6 +211,8 @@ fatal(const char *fmt, ...) 
va_end(ap); (void) fprintf(stderr, "\n"); + dump_debug_buffer(); + exit(1); } @@ -1289,7 +1302,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (err); @@ -3104,8 +3117,10 @@ dump_zpool(spa_t *spa) if (dump_opt['h']) dump_history(spa); - if (rc != 0) + if (rc != 0) { + dump_debug_buffer(); exit(rc); + } } #define ZDB_FLAG_CHECKSUM 0x0001 @@ -3576,7 +3591,7 @@ main(int argc, char **argv) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, - "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) { + "bcdhilmMI:suCDRSAFLXx:evp:t:U:PG")) != -1) { switch (c) { case 'b': case 'c': @@ -3592,6 +3607,7 @@ main(int argc, char **argv) case 'M': case 'R': case 'S': + case 'G': dump_opt[c]++; dump_all = 0; break; @@ -3827,6 +3843,8 @@ main(int argc, char **argv) fuid_table_destroy(); sa_loaded = B_FALSE; + dump_debug_buffer(); + libzfs_fini(g_zfs); kernel_fini(); diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index f655c3d..5fc5717 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
@@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -4792,7 +4793,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; @@ -4953,11 +4954,58 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { + /* + * The offset must be chosen carefully to ensure that + * we do not inject a given logical block with errors + * on two different leaf devices, because ZFS can not + * tolerate that (if maxfaults==1). + * + * We divide each leaf into chunks of size + * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk + * there is a series of ranges to which we can inject errors. + * Each range can accept errors on only a single leaf vdev. + * The error injection ranges are separated by ranges + * which we will not inject errors on any device (DMZs). + * Each DMZ must be large enough such that a single block + * can not straddle it, so that a single block can not be + * a target in two different injection ranges (on different + * leaf vdevs). 
+ * + * For example, with 3 leaves, each chunk looks like: + * 0 to 32M: injection range for leaf 0 + * 32M to 64M: DMZ - no injection allowed + * 64M to 96M: injection range for leaf 1 + * 96M to 128M: DMZ - no injection allowed + * 128M to 160M: injection range for leaf 2 + * 160M to 192M: DMZ - no injection allowed + */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); - if (offset >= fsize) + /* + * Only allow damage to the labels at one end of the vdev. + * + * If all labels are damaged, the device will be totally + * inaccessible, which will result in loss of data, + * because we also damage (parts of) the other side of + * the mirror/raidz. + * + * Additionally, we will always have both an even and an + * odd label, so that we can handle crashes in the + * middle of vdev_config_sync(). + */ + if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) + continue; + + /* + * The two end labels are stored at the "end" of the disk, but + * the end of the disk (vdev_psize) is aligned to + * sizeof (vdev_label_t). + */ + uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + if ((leaf & 1) == 1 && + offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; VERIFY(mutex_lock(&ztest_vdev_lock) == 0); @@ -5021,9 +5069,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) return; } + dmu_objset_stats_t dds; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + object = od[0].od_object; blocksize = od[0].od_blocksize; - pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os); + pattern = zs->zs_guid ^ dds.dds_guid; ASSERT(object != 0); @@ -5355,6 +5408,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. 
+ */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } return (NULL); } @@ -5620,9 +5679,13 @@ ztest_run(ztest_shared_t *zs) metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; + dmu_objset_stats_t dds; VERIFY0(dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, FTAG, &os)); - zs->zs_guid = dmu_objset_fsid_guid(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, FTAG); spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; diff --git a/contrib/tzcode/zic/zdump.c b/contrib/tzcode/zic/zdump.c index 2878a70..01111fe 100644 --- a/contrib/tzcode/zic/zdump.c +++ b/contrib/tzcode/zic/zdump.c @@ -212,24 +212,16 @@ const char * const zone; return; cp = abbrp; wp = NULL; - while (isascii((unsigned char) *cp) && isalpha((unsigned char) *cp)) + while (isascii((unsigned char) *cp) && + (isalnum((unsigned char)*cp) || *cp == '-' || *cp == '+')) ++cp; - if (cp - abbrp == 0) - wp = _("lacks alphabetic at start"); - else if (cp - abbrp < 3) - wp = _("has fewer than 3 alphabetics"); + if (cp - abbrp < 3) + wp = _("has fewer than 3 characters"); else if (cp - abbrp > 6) - wp = _("has more than 6 alphabetics"); - if (wp == NULL && (*cp == '+' || *cp == '-')) { - ++cp; - if (isascii((unsigned char) *cp) && - isdigit((unsigned char) *cp)) - if (*cp++ == '1' && *cp >= '0' && *cp <= '4') - ++cp; - if (*cp != '\0') - wp = _("differs from POSIX standard"); - } - if (wp == NULL) + wp = _("has more than 6 characters"); + else if (*cp) + wp = "has characters other than ASCII alphanumerics, '-' or '+'"; + else return; (void) fflush(stdout); (void) fprintf(stderr, diff --git a/contrib/tzcode/zic/zic.c b/contrib/tzcode/zic/zic.c index 5fb8b53..00043b7 100644 --- a/contrib/tzcode/zic/zic.c +++ b/contrib/tzcode/zic/zic.c @@ -2615,29 +2615,15 @@ const char * const string; register const char * cp; register char * wp; - /* - 
** Want one to ZIC_MAX_ABBR_LEN_WO_WARN alphabetics - ** optionally followed by a + or - and a number from 1 to 14. - */ cp = string; wp = NULL; while (isascii((unsigned char) *cp) && - isalpha((unsigned char) *cp)) + (isalnum((unsigned char)*cp) || *cp == '-' || *cp == '+')) ++cp; - if (cp - string == 0) -wp = _("time zone abbreviation lacks alphabetic at start"); if (noise && cp - string > 3) -wp = _("time zone abbreviation has more than 3 alphabetics"); +wp = _("time zone abbreviation has more than 3 characters"); if (cp - string > ZIC_MAX_ABBR_LEN_WO_WARN) -wp = _("time zone abbreviation has too many alphabetics"); - if (wp == NULL && (*cp == '+' || *cp == '-')) { - ++cp; - if (isascii((unsigned char) *cp) && - isdigit((unsigned char) *cp)) - if (*cp++ == '1' && - *cp >= '0' && *cp <= '4') - ++cp; - } +wp = _("time zone abbreviation has too many characters"); if (*cp != '\0') wp = _("time zone abbreviation differs from POSIX standard"); if (wp != NULL) { diff --git a/contrib/tzdata/africa b/contrib/tzdata/africa index f20d216..d35aaa5 100644 --- a/contrib/tzdata/africa +++ b/contrib/tzdata/africa @@ -343,6 +343,12 @@ Rule Egypt 2007 only - Sep Thu>=1 24:00 0 - # decision to abandon DST permanently. See Ahram Online 2015-04-24. # http://english.ahram.org.eg/NewsContent/1/64/128509/Egypt/Politics-/Sisi-cancels-daylight-saving-time-in-Egypt.aspx +# From Steffen Thorsen (2016-04-29): +# Egypt will have DST from July 7 until the end of October.... 
+# http://english.ahram.org.eg/NewsContentP/1/204655/Egypt/Daylight-savings-time-returning-to-Egypt-on--July.aspx +# From Mina Samuel (2016-07-04): +# Egyptian government took the decision to cancel the DST, + Rule Egypt 2008 only - Aug lastThu 24:00 0 - Rule Egypt 2009 only - Aug 20 24:00 0 - Rule Egypt 2010 only - Aug 10 24:00 0 - @@ -458,7 +464,7 @@ Zone Africa/Monrovia -0:43:08 - LMT 1882 # http://www.libyaherald.com/2013/10/24/correction-no-time-change-tomorrow/ # # From Paul Eggert (2013-10-25): -# For now, assume they're reverting to the pre-2012 rules of permanent UTC+2. +# For now, assume they're reverting to the pre-2012 rules of permanent UT +02. # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Libya 1951 only - Oct 14 2:00 1:00 S @@ -858,11 +864,11 @@ Rule Morocco 2009 only - Aug 21 0:00 0 - Rule Morocco 2010 only - May 2 0:00 1:00 S Rule Morocco 2010 only - Aug 8 0:00 0 - Rule Morocco 2011 only - Apr 3 0:00 1:00 S -Rule Morocco 2011 only - Jul 31 0 0 - +Rule Morocco 2011 only - Jul 31 0:00 0 - Rule Morocco 2012 2013 - Apr lastSun 2:00 1:00 S -Rule Morocco 2012 only - Sep 30 3:00 0 - Rule Morocco 2012 only - Jul 20 3:00 0 - Rule Morocco 2012 only - Aug 20 2:00 1:00 S +Rule Morocco 2012 only - Sep 30 3:00 0 - Rule Morocco 2013 only - Jul 7 3:00 0 - Rule Morocco 2013 only - Aug 10 2:00 1:00 S Rule Morocco 2013 max - Oct lastSun 3:00 0 - diff --git a/contrib/tzdata/antarctica b/contrib/tzdata/antarctica index 2af088f..0995835 100644 --- a/contrib/tzdata/antarctica +++ b/contrib/tzdata/antarctica @@ -10,10 +10,8 @@ # http://www.spri.cam.ac.uk/bob/periant.htm # for information. # Unless otherwise specified, we have no time zone information. -# -# Except for the French entries, -# I made up all time zone abbreviations mentioned here; corrections welcome! -# FORMAT is 'zzz' and GMTOFF is 0 for locations while uninhabited. + +# FORMAT is '-00' and GMTOFF is 0 for locations while uninhabited. 
# Argentina - year-round bases # Belgrano II, Confin Coast, -770227-0343737, since 1972-02-05 @@ -29,7 +27,7 @@ # previously sealers and scientific personnel wintered # Margaret Turner reports # http://web.archive.org/web/20021204222245/http://www.dstc.qut.edu.au/DST/marg/daylight.html -# (1999-09-30) that they're UTC+5, with no DST; +# (1999-09-30) that they're UT +05, with no DST; # presumably this is when they have visitors. # # year-round bases @@ -67,24 +65,23 @@ # http://www.timeanddate.com/news/time/antartica-time-changes-2010.html # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/Casey 0 - zzz 1969 - 8:00 - AWST 2009 Oct 18 2:00 - # Australian Western Std Time - 11:00 - CAST 2010 Mar 5 2:00 # Casey Time - 8:00 - AWST 2011 Oct 28 2:00 - 11:00 - CAST 2012 Feb 21 17:00u - 8:00 - AWST -Zone Antarctica/Davis 0 - zzz 1957 Jan 13 - 7:00 - DAVT 1964 Nov # Davis Time - 0 - zzz 1969 Feb - 7:00 - DAVT 2009 Oct 18 2:00 - 5:00 - DAVT 2010 Mar 10 20:00u - 7:00 - DAVT 2011 Oct 28 2:00 - 5:00 - DAVT 2012 Feb 21 20:00u - 7:00 - DAVT -Zone Antarctica/Mawson 0 - zzz 1954 Feb 13 - 6:00 - MAWT 2009 Oct 18 2:00 # Mawson Time - 5:00 - MAWT +Zone Antarctica/Casey 0 - -00 1969 + 8:00 - +08 2009 Oct 18 2:00 + 11:00 - +11 2010 Mar 5 2:00 + 8:00 - +08 2011 Oct 28 2:00 + 11:00 - +11 2012 Feb 21 17:00u + 8:00 - +08 +Zone Antarctica/Davis 0 - -00 1957 Jan 13 + 7:00 - +07 1964 Nov + 0 - -00 1969 Feb + 7:00 - +07 2009 Oct 18 2:00 + 5:00 - +05 2010 Mar 10 20:00u + 7:00 - +07 2011 Oct 28 2:00 + 5:00 - +05 2012 Feb 21 20:00u + 7:00 - +07 +Zone Antarctica/Mawson 0 - -00 1954 Feb 13 + 6:00 - +06 2009 Oct 18 2:00 + 5:00 - +05 # References: # Casey Weather (1998-02-26) # http://www.antdiv.gov.au/aad/exop/sfo/casey/casey_aws.html @@ -137,8 +134,8 @@ Zone Antarctica/Mawson 0 - zzz 1954 Feb 13 # fishing stations operated variously 1819/1931 # # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Indian/Kerguelen 0 - zzz 1950 # Port-aux-Français - 5:00 - TFT # ISO code TF Time +Zone Indian/Kerguelen 0 
- -00 1950 # Port-aux-Français + 5:00 - +05 # # year-round base in the main continent # Dumont d'Urville, Île des Pétrels, -6640+14001, since 1956-11 @@ -148,10 +145,10 @@ Zone Indian/Kerguelen 0 - zzz 1950 # Port-aux-Français # It was destroyed by fire on 1952-01-14. # # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/DumontDUrville 0 - zzz 1947 - 10:00 - PMT 1952 Jan 14 # Port-Martin Time - 0 - zzz 1956 Nov - 10:00 - DDUT # Dumont-d'Urville Time +Zone Antarctica/DumontDUrville 0 - -00 1947 + 10:00 - +10 1952 Jan 14 + 0 - -00 1956 Nov + 10:00 - +10 # France & Italy - year-round base # Concordia, -750600+1232000, since 2005 @@ -176,8 +173,8 @@ Zone Antarctica/DumontDUrville 0 - zzz 1947 # was established on 1957-01-29. Since Syowa station is still the main # station of Japan, it's appropriate for the principal location. # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/Syowa 0 - zzz 1957 Jan 29 - 3:00 - SYOT # Syowa Time +Zone Antarctica/Syowa 0 - -00 1957 Jan 29 + 3:00 - +03 # See: # NIPR Antarctic Research Activities (1999-08-17) # http://www.nipr.ac.jp/english/ara01.html @@ -214,19 +211,19 @@ Zone Antarctica/Syowa 0 - zzz 1957 Jan 29 # correct, but they should be quite close to the actual dates. # # From Paul Eggert (2014-03-21): -# The CET-switching Troll rules require zic from tzcode 2014b or later, so as +# The CET-switching Troll rules require zic from tz 2014b or later, so as # suggested by Bengt-Inge Larsson comment them out for now, and approximate # with only UTC and CEST. Uncomment them when 2014b is more prevalent. 
# # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S -#Rule Troll 2005 max - Mar 1 1:00u 1:00 CET -Rule Troll 2005 max - Mar lastSun 1:00u 2:00 CEST -#Rule Troll 2005 max - Oct lastSun 1:00u 1:00 CET -#Rule Troll 2004 max - Nov 7 1:00u 0:00 UTC +#Rule Troll 2005 max - Mar 1 1:00u 1:00 +01 +Rule Troll 2005 max - Mar lastSun 1:00u 2:00 +02 +#Rule Troll 2005 max - Oct lastSun 1:00u 1:00 +01 +#Rule Troll 2004 max - Nov 7 1:00u 0:00 +00 # Remove the following line when uncommenting the above '#Rule' lines. -Rule Troll 2004 max - Oct lastSun 1:00u 0:00 UTC +Rule Troll 2004 max - Oct lastSun 1:00u 0:00 +00 # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/Troll 0 - zzz 2005 Feb 12 +Zone Antarctica/Troll 0 - -00 2005 Feb 12 0:00 Troll %s # Poland - year-round base @@ -265,10 +262,10 @@ Zone Antarctica/Troll 0 - zzz 2005 Feb 12 # changes during the year and does not necessarily correspond to mean # solar noon. So the Vostok time might have been whatever the clocks # happened to be during their visit. So we still don't really know what time -# it is at Vostok. But we'll guess UTC+6. +# it is at Vostok. But we'll guess +06. # -Zone Antarctica/Vostok 0 - zzz 1957 Dec 16 - 6:00 - VOST # Vostok time +Zone Antarctica/Vostok 0 - -00 1957 Dec 16 + 6:00 - +06 # S Africa - year-round bases # Marion Island, -4653+03752 @@ -300,8 +297,8 @@ Zone Antarctica/Vostok 0 - zzz 1957 Dec 16 # <http://webexhibits.org/daylightsaving/g.html> says Rothera is -03 all year. 
# # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/Rothera 0 - zzz 1976 Dec 1 - -3:00 - ROTT # Rothera time +Zone Antarctica/Rothera 0 - -00 1976 Dec 1 + -3:00 - -03 # Uruguay - year round base # Artigas, King George Island, -621104-0585107 diff --git a/contrib/tzdata/asia b/contrib/tzdata/asia index 5467024..71ef878 100644 --- a/contrib/tzdata/asia +++ b/contrib/tzdata/asia @@ -79,13 +79,9 @@ Rule E-EurAsia 1979 1995 - Sep lastSun 0:00 0 - Rule E-EurAsia 1996 max - Oct lastSun 0:00 0 - Rule RussiaAsia 1981 1984 - Apr 1 0:00 1:00 S Rule RussiaAsia 1981 1983 - Oct 1 0:00 0 - -Rule RussiaAsia 1984 1991 - Sep lastSun 2:00s 0 - -Rule RussiaAsia 1985 1991 - Mar lastSun 2:00s 1:00 S -Rule RussiaAsia 1992 only - Mar lastSat 23:00 1:00 S -Rule RussiaAsia 1992 only - Sep lastSat 23:00 0 - -Rule RussiaAsia 1993 max - Mar lastSun 2:00s 1:00 S -Rule RussiaAsia 1993 1995 - Sep lastSun 2:00s 0 - -Rule RussiaAsia 1996 max - Oct lastSun 2:00s 0 - +Rule RussiaAsia 1984 1995 - Sep lastSun 2:00s 0 - +Rule RussiaAsia 1985 2011 - Mar lastSun 2:00s 1:00 S +Rule RussiaAsia 1996 2011 - Oct lastSun 2:00s 0 - # Afghanistan # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -120,31 +116,37 @@ Zone Asia/Kabul 4:36:48 - LMT 1890 # http://www.worldtimezone.com/dst_news/dst_news_armenia03.html # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Yerevan 2:58:00 - LMT 1924 May 2 - 3:00 - YERT 1957 Mar # Yerevan Time - 4:00 RussiaAsia YER%sT 1991 Mar 31 2:00s - 3:00 1:00 YERST 1991 Sep 23 # independence - 3:00 RussiaAsia AM%sT 1995 Sep 24 2:00s - 4:00 - AMT 1997 - 4:00 RussiaAsia AM%sT 2012 Mar 25 2:00s - 4:00 - AMT + 3:00 - +03 1957 Mar + 4:00 RussiaAsia +04/+05 1991 Mar 31 2:00s + 3:00 RussiaAsia +03/+04 1995 Sep 24 2:00s + 4:00 - +04 1997 + 4:00 RussiaAsia +04/+05 # Azerbaijan + # From Rustam Aliyev of the Azerbaijan Internet Forum (2005-10-23): # According to the resolution of Cabinet of Ministers, 1997 # From Paul Eggert (2015-09-17): It was Resolution No. 21 (1997-03-17). 
# http://code.az/files/daylight_res.pdf + +# From Steffen Thorsen (2016-03-17): +# ... the Azerbaijani Cabinet of Ministers has cancelled switching to +# daylight saving time.... +# http://www.azernews.az/azerbaijan/94137.html +# http://vestnikkavkaza.net/news/Azerbaijani-Cabinet-of-Ministers-cancels-daylight-saving-time.html +# http://en.apa.az/xeber_azerbaijan_abolishes_daylight_savings_ti_240862.html + # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S -Rule Azer 1997 max - Mar lastSun 4:00 1:00 S -Rule Azer 1997 max - Oct lastSun 5:00 0 - +Rule Azer 1997 2015 - Mar lastSun 4:00 1:00 S +Rule Azer 1997 2015 - Oct lastSun 5:00 0 - # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Baku 3:19:24 - LMT 1924 May 2 - 3:00 - BAKT 1957 Mar # Baku Time - 4:00 RussiaAsia BAK%sT 1991 Mar 31 2:00s - 3:00 1:00 BAKST 1991 Aug 30 # independence - 3:00 RussiaAsia AZ%sT 1992 Sep lastSat 23:00 - 4:00 - AZT 1996 # Azerbaijan Time - 4:00 EUAsia AZ%sT 1997 - 4:00 Azer AZ%sT + 3:00 - +03 1957 Mar + 4:00 RussiaAsia +04/+05 1991 Mar 31 2:00s + 3:00 RussiaAsia +03/+04 1992 Sep lastSun 2:00s + 4:00 - +04 1996 + 4:00 EUAsia +04/+05 1997 + 4:00 Azer +04/+05 # Bahrain # See Asia/Qatar. @@ -263,7 +265,7 @@ Zone Asia/Brunei 7:39:40 - LMT 1926 Mar # Bandar Seri Begawan # Milne says 6:24:40 was the meridian of the time ball observatory at Rangoon. # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Asia/Rangoon 6:24:40 - LMT 1880 # or Yangon +Zone Asia/Yangon 6:24:40 - LMT 1880 # or Rangoon 6:24:40 - RMT 1920 # Rangoon Mean Time? 6:30 - BURT 1942 May # Burma Time 9:00 - JST 1945 May 3 @@ -378,7 +380,7 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # Lewiston (ME) Daily Sun (1939-05-29), p 17, said "Even the time is # different - the occupied districts going by Tokyo time, an hour # ahead of that prevailing in the rest of Shanghai." Guess that the -# Xujiahui Observatory was under French control and stuck with UT+8. +# Xujiahui Observatory was under French control and stuck with UT +08. 
# # In earlier versions of this file, China had many separate Zone entries, but # this was based on what were apparently incorrect data in Shanks & Pottenger. @@ -387,26 +389,26 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # Proposed in 1918 and theoretically in effect until 1949 (although in practice # mainly observed in coastal areas), the five zones were: # -# Changbai Time ("Long-white Time", Long-white = Heilongjiang area) UT+8.5 +# Changbai Time ("Long-white Time", Long-white = Heilongjiang area) UT +08:30 # Asia/Harbin (currently a link to Asia/Shanghai) # Heilongjiang (except Mohe county), Jilin # -# Zhongyuan Time ("Central plain Time") UT+8 +# Zhongyuan Time ("Central plain Time") UT +08 # Asia/Shanghai # most of China # This currently represents most other zones as well, # as apparently these regions have been the same since 1970. # Milne gives 8:05:43.2 for Xujiahui Observatory time; round to nearest. -# Guo says Shanghai switched to UT+8 "from the end of the 19th century". +# Guo says Shanghai switched to UT +08 "from the end of the 19th century". # -# Long-shu Time (probably due to Long and Shu being two names of that area) UT+7 +# Long-shu Time (probably due to Long and Shu being two names of the area) UT +07 # Asia/Chongqing (currently a link to Asia/Shanghai) # Guangxi, Guizhou, Hainan, Ningxia, Sichuan, Shaanxi, and Yunnan; # most of Gansu; west Inner Mongolia; west Qinghai; and the Guangdong # counties Deqing, Enping, Kaiping, Luoding, Taishan, Xinxing, # Yangchun, Yangjiang, Yu'nan, and Yunfu. # -# Xin-zang Time ("Xinjiang-Tibet Time") UT+6 +# Xin-zang Time ("Xinjiang-Tibet Time") UT +06 # Asia/Urumqi # This currently represents Kunlun Time as well, # as apparently the two regions have been the same since 1970. @@ -419,7 +421,7 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # Shihezi, Changji, Yanqi, Heshuo, Tuokexun, Tulufan, Shanshan, Hami, # Fukang, Kuitun, Kumukuli, Miquan, Qitai, and Turfan. 
# -# Kunlun Time UT+5.5 +# Kunlun Time UT +05:30 # Asia/Kashgar (currently a link to Asia/Urumqi) # West Tibet, including Pulan, Aheqi, Shufu, Shule; # West Xinjiang, including Aksu, Atushi, Yining, Hetian, Cele, Luopu, Nileke, @@ -435,7 +437,7 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # # On the other hand, ethnic Uyghurs, who make up about half the # population of Xinjiang, typically use "Xinjiang time" which is two -# hours behind Beijing time, or UTC +0600. The government of the Xinjiang +# hours behind Beijing time, or UT +06. The government of the Xinjiang # Uyghur Autonomous Region, (XAUR, or just Xinjiang for short) as well as # local governments such as the Ürümqi city government use both times in # publications, referring to what is popularly called Xinjiang time as @@ -491,8 +493,8 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # having the same time as Beijing. # From Paul Eggert (2014-06-30): -# In the early days of the PRC, Tibet was given its own time zone (UT+6) but -# this was withdrawn in 1959 and never reinstated; see Tubten Khétsun, +# In the early days of the PRC, Tibet was given its own time zone (UT +06) +# but this was withdrawn in 1959 and never reinstated; see Tubten Khétsun, # Memories of life in Lhasa under Chinese Rule, Columbia U Press, ISBN # 978-0231142861 (2008), translator's introduction by Matthew Akester, p x. # As this is before our 1970 cutoff, Tibet doesn't need a separate zone. @@ -506,12 +508,12 @@ Rule PRC 1987 1991 - Apr Sun>=10 0:00 1:00 D # Republics, the Soviet Union, the Kuomintang, and the People's Republic of # China, and tracking down all these organizations' timekeeping rules would be # quite a trick. 
Approximate this lost history by a transition from LMT to -# XJT at the start of 1928, the year of accession of the warlord Jin Shuren, +# UT +06 at the start of 1928, the year of accession of the warlord Jin Shuren, # which happens to be the date given by Shanks & Pottenger (no doubt as a -# guess) as the transition from LMT. Ignore the usage of UT+8 before -# 1986-02-01 under the theory that the transition date to UT+8 is unknown and +# guess) as the transition from LMT. Ignore the usage of +08 before +# 1986-02-01 under the theory that the transition date to +08 is unknown and # that the sort of users who prefer Asia/Urumqi now typically ignored the -# UT+8 mandate back then. +# +08 mandate back then. # Zone NAME GMTOFF RULES FORMAT [UNTIL] # Beijing time, used throughout China; represented by Shanghai. @@ -716,7 +718,7 @@ Zone Asia/Hong_Kong 7:36:42 - LMT 1904 Oct 30 # be found from historical government announcement database. # From Paul Eggert (2014-07-03): -# As per Yu-Cheng Chuang, say that Taiwan was at UT+9 from 1937-10-01 +# As per Yu-Cheng Chuang, say that Taiwan was at UT +09 from 1937-10-01 # until 1945-09-21 at 01:00, overriding Shanks & Pottenger. # Likewise, use Yu-Cheng Chuang's data for DST in Taiwan. 
@@ -830,16 +832,15 @@ Link Asia/Nicosia Europe/Nicosia # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Tbilisi 2:59:11 - LMT 1880 2:59:11 - TBMT 1924 May 2 # Tbilisi Mean Time - 3:00 - TBIT 1957 Mar # Tbilisi Time - 4:00 RussiaAsia TBI%sT 1991 Mar 31 2:00s - 3:00 1:00 TBIST 1991 Apr 9 # independence - 3:00 RussiaAsia GE%sT 1992 # Georgia Time - 3:00 E-EurAsia GE%sT 1994 Sep lastSun - 4:00 E-EurAsia GE%sT 1996 Oct lastSun - 4:00 1:00 GEST 1997 Mar lastSun - 4:00 E-EurAsia GE%sT 2004 Jun 27 - 3:00 RussiaAsia GE%sT 2005 Mar lastSun 2:00 - 4:00 - GET + 3:00 - +03 1957 Mar + 4:00 RussiaAsia +04/+05 1991 Mar 31 2:00s + 3:00 RussiaAsia +03/+04 1992 + 3:00 E-EurAsia +03/+04 1994 Sep lastSun + 4:00 E-EurAsia +04/+05 1996 Oct lastSun + 4:00 1:00 +05 1997 Mar lastSun + 4:00 E-EurAsia +04/+05 2004 Jun 27 + 3:00 RussiaAsia +03/+04 2005 Mar lastSun 2:00 + 4:00 - +04 # East Timor @@ -874,6 +875,15 @@ Zone Asia/Dili 8:22:20 - LMT 1912 Jan 1 9:00 - TLT # India + +# From Ian P. Beacock, in "A brief history of (modern) time", The Atlantic +# http://www.theatlantic.com/technology/archive/2015/12/the-creation-of-modern-time/421419/ +# (2015-12-22): +# In January 1906, several thousand cotton-mill workers rioted on the +# outskirts of Bombay.... They were protesting the proposed abolition of +# local time in favor of Indian Standard Time.... Journalists called this +# dispute the "Battle of the Clocks." It lasted nearly half a century. + # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Kolkata 5:53:28 - LMT 1880 # Kolkata 5:53:20 - HMT 1941 Oct # Howrah Mean Time? @@ -907,7 +917,7 @@ Zone Asia/Kolkata 5:53:28 - LMT 1880 # Kolkata # These would be the earliest possible times for a change. # Régimes horaires pour le monde entier, by Henri Le Corre, (Éditions # Traditionnelles, 1987, Paris) says that Java and Madura switched -# from JST to UTC+07:30 on 1945-09-23, and gives 1944-09-01 for Jayapura +# from UT +09 to +07:30 on 1945-09-23, and gives 1944-09-01 for Jayapura # (Hollandia). 
For now, assume all Indonesian locations other than Jayapura # switched on 1945-09-23. # @@ -918,11 +928,11 @@ Zone Asia/Kolkata 5:53:28 - LMT 1880 # Kolkata # summary published by the Time and Frequency Laboratory of the # Research Center for Calibration, Instrumentation and Metrology, # Indonesia, <http://time.kim.lipi.go.id/time-eng.php> (2006-09-29). -# The abbreviations are: +# The time zone abbreviations and UT offsets are: # -# WIB - UTC+7 - Waktu Indonesia Barat (Indonesia western time) -# WITA - UTC+8 - Waktu Indonesia Tengah (Indonesia central time) -# WIT - UTC+9 - Waktu Indonesia Timur (Indonesia eastern time) +# WIB - +07 - Waktu Indonesia Barat (Indonesia western time) +# WITA - +08 - Waktu Indonesia Tengah (Indonesia central time) +# WIT - +09 - Waktu Indonesia Timur (Indonesia eastern time) # # Zone NAME GMTOFF RULES FORMAT [UNTIL] # Java, Sumatra @@ -1084,8 +1094,15 @@ Rule Iran 2032 2033 - Mar 21 0:00 1:00 D Rule Iran 2032 2033 - Sep 21 0:00 0 S Rule Iran 2034 2035 - Mar 22 0:00 1:00 D Rule Iran 2034 2035 - Sep 22 0:00 0 S -Rule Iran 2036 2037 - Mar 21 0:00 1:00 D -Rule Iran 2036 2037 - Sep 21 0:00 0 S +# +# The following rules are approximations starting in the year 2038. +# These are the best post-2037 approximations available, given the +# restrictions of a single rule using a Gregorian-based data format. +# At some point this table will need to be extended, though quite +# possibly Iran will change the rules first. +Rule Iran 2036 max - Mar 21 0:00 1:00 D +Rule Iran 2036 max - Sep 21 0:00 0 S + # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Tehran 3:25:44 - LMT 1916 3:25:44 - TMT 1946 # Tehran Mean Time @@ -1542,23 +1559,6 @@ Zone Asia/Amman 2:23:44 - LMT 1931 # Kazakhstan -# From Paul Eggert (1996-11-22): -# Andrew Evtichov (1996-04-13) writes that Kazakhstan -# stayed in sync with Moscow after 1990, and that Aqtobe (formerly Aktyubinsk) -# and Aqtau (formerly Shevchenko) are the largest cities in their zones. 
-# Guess that Aqtau and Aqtobe diverged in 1995, since that's the first time -# IATA SSIM mentions a third time zone in Kazakhstan. - -# From Paul Eggert (2006-03-22): -# German Iofis, ELSI, Almaty (2001-10-09) reports that Kazakhstan uses -# RussiaAsia rules, instead of switching at 00:00 as the IATA has it. -# Go with Shanks & Pottenger, who have them always using RussiaAsia rules. -# Also go with the following claims of Shanks & Pottenger: -# -# - Kazakhstan did not observe DST in 1991. -# - Qyzylorda switched from +5:00 to +6:00 on 1992-01-19 02:00. -# - Oral switched from +5:00 to +4:00 in spring 1989. - # From Kazakhstan Embassy's News Bulletin No. 11 # <http://www.kazsociety.org.uk/news/2005/03/30.htm> (2005-03-21): # The Government of Kazakhstan passed a resolution March 15 abolishing @@ -1575,61 +1575,232 @@ Zone Asia/Amman 2:23:44 - LMT 1931 # everything else.... I guess that would make Kazakhstan time zones # de jure UTC+5 and UTC+6 respectively. +# From Stepan Golosunov (2016-03-27) ([*] means see later comments below): +# Review of the linked documents from http://adilet.zan.kz/ +# produced the following data for post-1991 Kazakhstan: +# +# 0. Act of the Cabinet of Ministers of the USSR +# from 1991-02-04 No. 20 +# http://pravo.gov.ru/proxy/ips/?docbody=&nd=102010545 +# removed the extra hour ("decree time") on the territory of the USSR +# starting with the last Sunday of March 1991. +# It also allowed (but not mandated) Kazakh SSR, Kirghiz SSR, Tajik SSR, +# Turkmen SSR and Uzbek SSR to not have "summer" time. +# +# The 1992-01-13 act also refers to the act of the Cabinet of Ministers +# of the Kazakh SSR from 1991-03-20 No. 170 "About the act of the Cabinet +# of Ministers of the USSR from 1991-02-04 No. 20" but I didn't found its +# text. +# +# According to Izvestia newspaper No. 
68 (23334) from 1991-03-20 +# (page 6; available at http://libinfo.org/newsr/newsr2574.djvu via +# http://libinfo.org/index.php?id=58564) on 1991-03-31 at 2:00 during +# transition to "summer" time: +# Republic of Georgia, Latvian SSR, Lithuanian SSR, SSR Moldova, +# Estonian SSR; Komi ASSR; Kaliningrad oblast; Nenets autonomous okrug +# were to move clocks 1 hour forward. +# Kazakh SSR (excluding Uralsk oblast); Republic of Kyrgyzstan, Tajik +# SSR; Andijan, Jizzakh, Namangan, Sirdarya, Tashkent, Fergana oblasts +# of the Uzbek SSR were to move clocks 1 hour backwards. +# Other territories were to not move clocks. +# When the "summer" time would end on 1991-09-29, clocks were to be +# moved 1 hour backwards on the territory of the USSR excluding +# Kazakhstan, Kirghizia, Uzbekistan, Turkmenia, Tajikistan. +# +# Apparently there were last minute changes. Apparently Kazakh act No. 170 +# was one of such changes. +# +# https://ru.wikipedia.org/wiki/Декретное время +# claims that Sovetskaya Rossiya newspaper on 1991-03-29 published that +# Nenets autonomous okrug, Komi and Kazakhstan (excluding Uralsk oblast) +# were to not move clocks and Uralsk oblast was to move clocks +# forward; on 1991-09-29 Kazakhstan was to move clocks backwards. +# (Probably there were changes even after that publication. There is an +# article claiming that Kaliningrad oblast decided on 1991-03-29 to not +# move clocks.) +# +# This implies that on 1991-03-31 Asia/Oral remained on +04/+05 while +# the rest of Kazakhstan switched from +06/+07 to +05/+06 or from +05/+06 +# to +04/+05. It's unclear how Kzyl-Orda oblast moved into the fifth +# time belt. (By switching from +04/+05 to +05/+06 on 1991-09-29?) ... +# +# 1. Act of the Cabinet of Ministers of the Republic of Kazakhstan +# from 1992-01-13 No. 28 +# http://adilet.zan.kz/rus/docs/P920000028_ +# (text includes modification from the 1996 act) +# introduced new rules for calculation of time, mirroring Russian +# 1992-01-08 act.
It specified that time would be calculated +# according to time belts plus extra hour ("decree time"), moved clocks +# on the whole territory of Kazakhstan 1 hour forward on 1992-01-19 at +# 2:00, specified DST rules. It acknowledged that Kazakhstan was +# located in the fourth and the fifth time belts and specified the +# border between them to be located east of Kustanay and Aktyubinsk +# oblasts (notably including Turgai and Kzyl-Orda oblasts into the fifth +# time belt). +# +# This means switch on 1992-01-19 at 2:00 from +04/+05 to +05/+06 for +# Asia/Aqtau, Asia/Aqtobe, Asia/Oral, Atyrau and Kustanay oblasts; from +# +05/+06 to +06/+07 for Asia/Almaty and Asia/Qyzylorda (and Arkalyk) [*].... +# +# 2. Act of the Cabinet of Ministers of the Republic of Kazakhstan +# from 1992-03-27 No. 284 +# http://adilet.zan.kz/rus/docs/P920000284_ +# cancels extra hour ("decree time") for Uralsk and Kzyl-Orda oblasts +# since the last Sunday of March 1992, while keeping them in the fourth +# and the fifth time belts respectively. +# +# 3. Order of the Prime Minister of the Republic of Kazakhstan +# from 1994-09-23 No. 384 +# http://adilet.zan.kz/rus/docs/R940000384_ +# cancels the extra hour ("decree time") on the territory of Mangystau +# oblast since the last Sunday of September 1994 (saying that time on +# the territory would correspond to the third time belt as a +# result).... +# +# 4. Act of the Government of the Republic of Kazakhstan +# from 1996-05-08 No. 575 +# http://adilet.zan.kz/rus/docs/P960000575_ +# amends the 1992-01-13 act to end summer time in October instead +# of September, mirroring identical Russian change from 1996-04-23 act. +# +# 5. Act of the Government of the Republic of Kazakhstan +# from 1999-03-26 No. 305 +# http://adilet.zan.kz/rus/docs/P990000305_ +# cancels the extra hour ("decree time") for Atyrau oblast since the +# last Sunday of March 1999 while retaining the oblast in the fourth +# time belt. 
+# +# This means change from +05/+06 to +04/+05. +# +# There is no zone for Atyrau currently (listed under Asia/Aqtau in +# zone1970.tab).[*] +# +# 6. Act of the Government of the Republic of Kazakhstan +# from 2000-11-23 No. 1749 +# http://adilet.zan.kz/rus/archive/docs/P000001749_/23.11.2000 +# replaces the previous five documents. +# +# The only changes I noticed are in definition of the border between the +# fourth and the fifth time belts. They account for changes in spelling +# and administrative division (splitting of Turgai oblast in 1997 +# probably changed time in territories incorporated into Kostanay oblast +# (including Arkalyk) from +06/+07 to +05/+06) and move Kyzylorda oblast +# from being in the fifth time belt and not using decree time into the +# fourth time belt (no change in practice).[*] +# +# 7. Act of the Government of the Republic of Kazakhstan +# from 2003-12-29 No. 1342 +# http://adilet.zan.kz/rus/docs/P030001342_ +# modified the 2000-11-23 act. No relevant changes, apparently. +# +# 8. Act of the Government of the Republic of Kazakhstan +# from 2004-07-20 No. 775 +# http://adilet.zan.kz/rus/archive/docs/P040000775_/20.07.2004 +# modified the 2000-11-23 act to move Kostanay and Kyzylorda oblasts into +# the fifth time belt and add Aktobe oblast to the list of regions not +# using extra hour ("decree time"), leaving Kazakhstan with only 2 time +# zones (+04/+05 and +06/+07). The changes were to be implemented +# during DST transitions in 2004 and 2005 but the acts got radically +# amended before implementation happened. +# +# 9. Act of the Government of the Republic of Kazakhstan +# from 2004-09-15 No. 
1059 +# http://adilet.zan.kz/rus/docs/P040001059_ +# modified the 2000-11-23 act to remove exceptions from the "decree time" +# (leaving Kazakhstan in +05/+06 and +06/+07 zones), amended the +# 2004-07-20 act to implement changes for Atyrau, West Kazakhstan, +# Kostanay, Kyzylorda and Mangystau oblasts by not moving clocks +# during the 2004 transition to "winter" time. +# +# This means transition from +04/+05 to +05/+06 for Atyrau oblast (no +# zone currently), Asia/Oral, Asia/Aqtau and transition from +05/+06 to +# +06/+07 for Kostanay oblast (Kostanay and Arkalyk, no zones currently) +# and Asia/Qyzylorda on 2004-10-31 at 3:00....[*] +# +# 10. Act of the Government of the Republic of Kazakhstan +# from 2005-03-15 No. 231 +# http://adilet.zan.kz/rus/docs/P050000231_ +# removes DST provisions from the 2000-11-23 act, removes most of the +# (already implemented) provisions from the 2004-07-20 and 2004-09-15 +# acts, comes into effect 10 days after official publication. +# The only practical effect seems to be the abolition of the summer +# time. +# +# Unamended version of the act of the Government of the Russian Federation +# No. 23 from 1992-01-08 [See 'europe' file for details]. +# Kazakh 1992-01-13 act appears to provide the same rules and 1992-03-27 +# act was to be enacted on the last Sunday of March 1992. + +# From Paul Eggert (2016-04-15): +# The tables below should reflect Stepan Golosunov's remarks above, +# except for the items marked "[*]" which I haven't gotten to yet. +# It looks like we will need new zones Asia/Atyrau and Asia/Qostanay +# to handle changes from 1992 through 2004 that we did not previously +# know about. + # # Zone NAME GMTOFF RULES FORMAT [UNTIL] # # Almaty (formerly Alma-Ata), representing most locations in Kazakhstan +# This includes KZ-AKM, KZ-ALA, KZ-ALM, KZ-AST, KZ-BAY, KZ-VOS, KZ-ZHA, +# KZ-KAR, KZ-SEV, KZ-PAV, and KZ-YUZ.
Zone Asia/Almaty 5:07:48 - LMT 1924 May 2 # or Alma-Ata - 5:00 - ALMT 1930 Jun 21 # Alma-Ata Time - 6:00 RussiaAsia ALM%sT 1991 - 6:00 - ALMT 1992 - 6:00 RussiaAsia ALM%sT 2005 Mar 15 - 6:00 - ALMT -# Qyzylorda (aka Kyzylorda, Kizilorda, Kzyl-Orda, etc.) + 5:00 - +05 1930 Jun 21 + 6:00 RussiaAsia +06/+07 1991 Mar 31 2:00s + 5:00 RussiaAsia +05/+06 1992 Jan 19 2:00s + 6:00 RussiaAsia +06/+07 2004 Oct 31 2:00s + 6:00 - +06 +# Qyzylorda (aka Kyzylorda, Kizilorda, Kzyl-Orda, etc.) (KZ-KZY) Zone Asia/Qyzylorda 4:21:52 - LMT 1924 May 2 - 4:00 - KIZT 1930 Jun 21 # Kizilorda Time - 5:00 - KIZT 1981 Apr 1 - 5:00 1:00 KIZST 1981 Oct 1 - 6:00 - KIZT 1982 Apr 1 - 5:00 RussiaAsia KIZ%sT 1991 - 5:00 - KIZT 1991 Dec 16 # independence - 5:00 - QYZT 1992 Jan 19 2:00 - 6:00 RussiaAsia QYZ%sT 2005 Mar 15 - 6:00 - QYZT -# Aqtobe (aka Aktobe, formerly Aktyubinsk) + 4:00 - +04 1930 Jun 21 + 5:00 - +05 1981 Apr 1 + 5:00 1:00 +06 1981 Oct 1 + 6:00 - +06 1982 Apr 1 + 5:00 RussiaAsia +05/+06 1991 Mar 31 2:00s + 4:00 RussiaAsia +04/+05 1991 Sep 29 2:00s + 5:00 RussiaAsia +05/+06 1992 Jan 19 2:00s + 6:00 RussiaAsia +06/+07 1992 Mar 29 2:00s + 5:00 RussiaAsia +05/+06 2004 Oct 31 2:00s + 6:00 - +06 +# Aqtobe (aka Aktobe, formerly Aktyubinsk) (KZ-AKT) Zone Asia/Aqtobe 3:48:40 - LMT 1924 May 2 - 4:00 - AKTT 1930 Jun 21 # Aktyubinsk Time - 5:00 - AKTT 1981 Apr 1 - 5:00 1:00 AKTST 1981 Oct 1 - 6:00 - AKTT 1982 Apr 1 - 5:00 RussiaAsia AKT%sT 1991 - 5:00 - AKTT 1991 Dec 16 # independence - 5:00 RussiaAsia AQT%sT 2005 Mar 15 # Aqtobe Time - 5:00 - AQTT -# Mangghystau + 4:00 - +04 1930 Jun 21 + 5:00 - +05 1981 Apr 1 + 5:00 1:00 +06 1981 Oct 1 + 6:00 - +06 1982 Apr 1 + 5:00 RussiaAsia +05/+06 1991 Mar 31 2:00s + 4:00 RussiaAsia +04/+05 1992 Jan 19 2:00s + 5:00 RussiaAsia +05/+06 2004 Oct 31 2:00s + 5:00 - +05 +# Qostanay (KZ-KUS) + +# Mangghystau (KZ-MAN) # Aqtau was not founded until 1963, but it represents an inhabited region, # so include time stamps before 1963. 
Zone Asia/Aqtau 3:21:04 - LMT 1924 May 2 - 4:00 - FORT 1930 Jun 21 # Fort Shevchenko T - 5:00 - FORT 1963 - 5:00 - SHET 1981 Oct 1 # Shevchenko Time - 6:00 - SHET 1982 Apr 1 - 5:00 RussiaAsia SHE%sT 1991 - 5:00 - SHET 1991 Dec 16 # independence - 5:00 RussiaAsia AQT%sT 1995 Mar lastSun 2:00 # Aqtau Time - 4:00 RussiaAsia AQT%sT 2005 Mar 15 - 5:00 - AQTT -# West Kazakhstan + 4:00 - +04 1930 Jun 21 + 5:00 - +05 1963 + 5:00 - +05 1981 Oct 1 + 6:00 - +06 1982 Apr 1 + 5:00 RussiaAsia +05/+06 1991 Mar 31 2:00s + 4:00 RussiaAsia +04/+05 1992 Jan 19 2:00s + 5:00 RussiaAsia +05/+06 1994 Sep 25 2:00s + 4:00 RussiaAsia +04/+05 2004 Oct 31 2:00s + 5:00 - +05 + +# West Kazakhstan (KZ-ZAP) +# From Paul Eggert (2016-03-18): +# The 1989 transition is from USSR act No. 227 (1989-03-14). Zone Asia/Oral 3:25:24 - LMT 1924 May 2 # or Ural'sk - 4:00 - URAT 1930 Jun 21 # Ural'sk time - 5:00 - URAT 1981 Apr 1 - 5:00 1:00 URAST 1981 Oct 1 - 6:00 - URAT 1982 Apr 1 - 5:00 RussiaAsia URA%sT 1989 Mar 26 2:00 - 4:00 RussiaAsia URA%sT 1991 - 4:00 - URAT 1991 Dec 16 # independence - 4:00 RussiaAsia ORA%sT 2005 Mar 15 # Oral Time - 5:00 - ORAT + 4:00 - +04 1930 Jun 21 + 5:00 - +05 1981 Apr 1 + 5:00 1:00 +06 1981 Oct 1 + 6:00 - +06 1982 Apr 1 + 5:00 RussiaAsia +05/+06 1989 Mar 26 2:00s + 4:00 RussiaAsia +04/+05 1992 Jan 19 2:00s + 5:00 RussiaAsia +05/+06 1992 Mar 29 2:00s + 4:00 RussiaAsia +04/+05 2004 Oct 31 2:00s + 5:00 - +05 # Kyrgyzstan (Kirgizstan) # Transitions through 1991 are from Shanks & Pottenger. 
@@ -1650,11 +1821,11 @@ Rule Kyrgyz 1997 2005 - Mar lastSun 2:30 1:00 S Rule Kyrgyz 1997 2004 - Oct lastSun 2:30 0 - # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Bishkek 4:58:24 - LMT 1924 May 2 - 5:00 - FRUT 1930 Jun 21 # Frunze Time - 6:00 RussiaAsia FRU%sT 1991 Mar 31 2:00s - 5:00 1:00 FRUST 1991 Aug 31 2:00 # independence - 5:00 Kyrgyz KG%sT 2005 Aug 12 # Kyrgyzstan Time - 6:00 - KGT + 5:00 - +05 1930 Jun 21 + 6:00 RussiaAsia +06/+07 1991 Mar 31 2:00s + 5:00 RussiaAsia +05/+06 1991 Aug 31 2:00 + 5:00 Kyrgyz +05/+06 2005 Aug 12 + 6:00 - +06 ############################################################################### @@ -1693,25 +1864,24 @@ Rule ROK 1957 1960 - Sep Sun>=18 0:00 0 S Rule ROK 1987 1988 - May Sun>=8 2:00 1:00 D Rule ROK 1987 1988 - Oct Sun>=8 3:00 0 S -# From Paul Eggert (2014-10-30): +# From Paul Eggert (2016-08-23): # The Korean Wikipedia entry gives the following sources for UT offsets: # -# 1908: Official Journal Article No. 3994 (Edict No. 5) +# 1908: Official Journal Article No. 3994 (decree No. 5) # 1912: Governor-General of Korea Official Gazette Issue No. 367 # (Announcement No. 338) # 1954: Presidential Decree No. 876 (1954-03-17) # 1961: Law No. 676 (1961-08-07) -# 1987: Law No. 3919 (1986-12-31) # -# The Wikipedia entry also has confusing information about a change -# to UT+9 in April 1910, but then what would be the point of the later change -# to UT+9 on 1912-01-01? Omit the 1910 change for now. +# (Another source "1987: Law No. 3919 (1986-12-31)" was in the 2014-10-30 +# edition of the Korean Wikipedia entry.) # # I guessed that time zone abbreviations through 1945 followed the same # rules as discussed under Taiwan, with nominal switches from JST to KST # when the respective cities were taken over by the Allies after WWII. # -# For Pyongyang we have no information; guess no changes since World War II. +# For Pyongyang, guess no changes from World War II until 2015, as we +# have no information otherwise. 
# From Steffen Thorsen (2015-08-07): # According to many news sources, North Korea is going to change to @@ -1871,7 +2041,7 @@ Zone Indian/Maldives 4:54:00 - LMT 1880 # Male # Bill Bonnet (2005-05-19) reports that the US Embassy in Ulaanbaatar says # there is only one time zone and that DST is observed, citing Microsoft # Windows XP as the source. Risto Nykänen (2005-05-16) reports that -# travelmongolia.org says there are two time zones (UTC+7, UTC+8) with no DST. +# travelmongolia.org says there are two time zones (UT +07, +08) with no DST. # Oscar van Vlijmen (2005-05-20) reports that the Mongolian Embassy in # Washington, DC says there are two time zones, with DST observed. # He also found @@ -2111,8 +2281,8 @@ Zone Asia/Kathmandu 5:41:16 - LMT 1920 # http://www.app.com.pk/en_/index.php?option=com_content&task=view&id=99374&Itemid=2 # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S -Rule Pakistan 2002 only - Apr Sun>=2 0:01 1:00 S -Rule Pakistan 2002 only - Oct Sun>=2 0:01 0 - +Rule Pakistan 2002 only - Apr Sun>=2 0:00 1:00 S +Rule Pakistan 2002 only - Oct Sun>=2 0:00 0 - Rule Pakistan 2008 only - Jun 1 0:00 1:00 S Rule Pakistan 2008 2009 - Nov 1 0:00 0 - Rule Pakistan 2009 only - Apr 15 0:00 1:00 S @@ -2380,6 +2550,16 @@ Zone Asia/Karachi 4:28:12 - LMT 1907 # http://www.timeanddate.com/time/change/gaza-strip/gaza # http://www.timeanddate.com/time/change/west-bank/hebron +# From Hannah Kreitem (2016-03-09): +# http://www.palestinecabinet.gov.ps/WebSite/ar/ViewDetails?ID=31728 +# [Google translation]: "The Council also decided to start daylight +# saving in Palestine as of one o'clock on Saturday morning, +# 2016-03-26, to provide the clock 60 minutes ahead." +# +# From Paul Eggert (2016-03-12): +# Predict spring transitions on March's last Saturday at 01:00 from now on. +# Leave fall predictions alone for now. 
+ # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule EgyptAsia 1957 only - May 10 0:00 1:00 S Rule EgyptAsia 1957 1958 - Oct 1 0:00 0 - @@ -2408,7 +2588,8 @@ Rule Palestine 2012 2014 - Mar lastThu 24:00 1:00 S Rule Palestine 2012 only - Sep 21 1:00 0 - Rule Palestine 2013 only - Sep Fri>=21 0:00 0 - Rule Palestine 2014 max - Oct Fri>=21 0:00 0 - -Rule Palestine 2015 max - Mar lastFri 24:00 1:00 S +Rule Palestine 2015 only - Mar lastFri 24:00 1:00 S +Rule Palestine 2016 max - Mar lastSat 1:00 1:00 S # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Gaza 2:17:52 - LMT 1900 Oct @@ -2496,7 +2677,7 @@ Link Asia/Qatar Asia/Bahrain # earlier date. # # Shanks & Pottenger also state that until 1968-05-01 Saudi Arabia had two -# time zones; the other zone, at UTC+4, was in the far eastern part of +# time zones; the other zone, at UT +04, was in the far eastern part of # the country. Ignore this, as it's before our 1970 cutoff. # # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -2765,10 +2946,10 @@ Zone Asia/Damascus 2:25:12 - LMT 1920 # Dimashq # From Shanks & Pottenger. # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Dushanbe 4:35:12 - LMT 1924 May 2 - 5:00 - DUST 1930 Jun 21 # Dushanbe Time - 6:00 RussiaAsia DUS%sT 1991 Mar 31 2:00s - 5:00 1:00 DUSST 1991 Sep 9 2:00s - 5:00 - TJT # Tajikistan Time + 5:00 - +05 1930 Jun 21 + 6:00 RussiaAsia +06/+07 1991 Mar 31 2:00s + 5:00 1:00 +05/+06 1991 Sep 9 2:00s + 5:00 - +05 # Thailand # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -2782,11 +2963,10 @@ Link Asia/Bangkok Asia/Vientiane # Laos # From Shanks & Pottenger. 
# Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Ashgabat 3:53:32 - LMT 1924 May 2 # or Ashkhabad - 4:00 - ASHT 1930 Jun 21 # Ashkhabad Time - 5:00 RussiaAsia ASH%sT 1991 Mar 31 2:00 - 4:00 RussiaAsia ASH%sT 1991 Oct 27 # independence - 4:00 RussiaAsia TM%sT 1992 Jan 19 2:00 - 5:00 - TMT + 4:00 - +04 1930 Jun 21 + 5:00 RussiaAsia +05/+06 1991 Mar 31 2:00 + 4:00 RussiaAsia +04/+05 1992 Jan 19 2:00 + 5:00 - +05 # United Arab Emirates # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -2798,20 +2978,18 @@ Link Asia/Dubai Asia/Muscat # Oman # Byalokoz 1919 says Uzbekistan was 4:27:53. # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Samarkand 4:27:53 - LMT 1924 May 2 - 4:00 - SAMT 1930 Jun 21 # Samarkand Time - 5:00 - SAMT 1981 Apr 1 - 5:00 1:00 SAMST 1981 Oct 1 - 6:00 - TAST 1982 Apr 1 # Tashkent Time - 5:00 RussiaAsia SAM%sT 1991 Sep 1 # independence - 5:00 RussiaAsia UZ%sT 1992 - 5:00 - UZT + 4:00 - +04 1930 Jun 21 + 5:00 - +05 1981 Apr 1 + 5:00 1:00 +06 1981 Oct 1 + 6:00 - +06 1982 Apr 1 + 5:00 RussiaAsia +05/+06 1992 + 5:00 - +05 # Milne says Tashkent was 4:37:10.8; round to nearest. Zone Asia/Tashkent 4:37:11 - LMT 1924 May 2 - 5:00 - TAST 1930 Jun 21 # Tashkent Time - 6:00 RussiaAsia TAS%sT 1991 Mar 31 2:00 - 5:00 RussiaAsia TAS%sT 1991 Sep 1 # independence - 5:00 RussiaAsia UZ%sT 1992 - 5:00 - UZT + 5:00 - +05 1930 Jun 21 + 6:00 RussiaAsia +06/+07 1991 Mar 31 2:00 + 5:00 RussiaAsia +05/+06 1992 + 5:00 - +05 # Vietnam diff --git a/contrib/tzdata/australasia b/contrib/tzdata/australasia index 5c272db..f49df1d 100644 --- a/contrib/tzdata/australasia +++ b/contrib/tzdata/australasia @@ -60,6 +60,14 @@ Zone Australia/Eucla 8:35:28 - LMT 1895 Dec # Hamilton is the largest, but there is also a Hamilton in Victoria, # so use Lindeman. # +# From J William Piggott (2016-02-20): +# There is no location named Holiday Islands in Queensland Australia; holiday +# islands is a colloquial term used globally. 
Hayman and Lindeman are at the +# north and south extremes of the Whitsunday Islands archipelago, and +# Hamilton is in between; it is reasonable to believe that this time zone +# applies to all of the Whitsundays. +# http://www.australia.gov.au/about-australia/australian-story/austn-islands +# # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule AQ 1971 only - Oct lastSun 2:00s 1:00 D Rule AQ 1972 only - Feb lastSun 2:00s 0 S @@ -235,11 +243,11 @@ Zone Australia/Lord_Howe 10:36:20 - LMT 1895 Feb # will produce a binary file with an [A]EST-type as the first 32-bit type; # this is required for correct handling of times before 1916 by # pre-2013 versions of localtime. -Zone Antarctica/Macquarie 0 - zzz 1899 Nov +Zone Antarctica/Macquarie 0 - -00 1899 Nov 10:00 - AEST 1916 Oct 1 2:00 10:00 1:00 AEDT 1917 Feb 10:00 Aus AE%sT 1919 Apr 1 0:00s - 0 - zzz 1948 Mar 25 + 0 - -00 1948 Mar 25 10:00 Aus AE%sT 1967 10:00 AT AE%sT 2010 Apr 4 3:00 11:00 - MIST # Macquarie I Standard Time @@ -537,7 +545,7 @@ Zone Pacific/Port_Moresby 9:48:40 - LMT 1880 # Base the Bougainville entry on the Arawa-Kieta region, which appears to have # the most people even though it was devastated in the Bougainville Civil War. # -# Although Shanks gives 1942-03-15 / 1943-11-01 for JST, these dates +# Although Shanks gives 1942-03-15 / 1943-11-01 for UT +09, these dates # are apparently rough guesswork from the starts of military campaigns. # The World War II entries below are instead based on Arawa-Kieta. # The Japanese occupied Kieta in July 1942, @@ -545,8 +553,8 @@ Zone Pacific/Port_Moresby 9:48:40 - LMT 1880 # http://pwencycl.kgbudge.com/B/o/Bougainville.htm # and seem to have controlled it until their 1945-08-21 surrender. # -# The Autonomous Region of Bougainville plans to switch from UTC+10 to UTC+11 -# on 2014-12-28 at 02:00. They call UTC+11 "Bougainville Standard Time"; +# The Autonomous Region of Bougainville switched from UT +10 to +11 +# on 2014-12-28 at 02:00. 
They call +11 "Bougainville Standard Time"; # abbreviate this as BST. See: # http://www.bougainville24.com/bougainville-issues/bougainville-gets-own-timezone/ # @@ -612,7 +620,7 @@ Link Pacific/Pago_Pago Pacific/Midway # in US minor outlying islands # From Paul Eggert (2014-06-27): # The International Date Line Act 2011 # http://www.parliament.gov.ws/images/ACTS/International_Date_Line_Act__2011_-_Eng.pdf -# changed Samoa from UTC-11 to UTC+13, effective "12 o'clock midnight, on +# changed Samoa from UT -11 to +13, effective "12 o'clock midnight, on # Thursday 29th December 2011". The International Date Line was adjusted # accordingly. @@ -707,7 +715,7 @@ Zone Pacific/Funafuti 11:56:52 - LMT 1901 # 1886-1891; Baker was similar but exact dates are not known. # Inhabited by civilians 1935-1942; U.S. military bases 1943-1944; # uninhabited thereafter. -# Howland observed Hawaii Standard Time (UT-10:30) in 1937; +# Howland observed Hawaii Standard Time (UT -10:30) in 1937; # see page 206 of Elgen M. Long and Marie K. Long, # Amelia Earhart: the Mystery Solved, Simon & Schuster (2000). # So most likely Howland and Baker observed Hawaii Time from 1935 @@ -1465,7 +1473,7 @@ Zone Pacific/Wallis 12:15:20 - LMT 1901 # Zealand time. I understand that is the time they keep locally, anyhow." # For now, assume this practice goes back to the introduction of standard time # in New Zealand, as this would make Chatham Islands time almost exactly match -# LMT back when New Zealand was at UTC+11:30; also, assume Chatham Islands did +# LMT back when New Zealand was at UT +11:30; also, assume Chatham Islands did # not observe New Zealand's prewar DST. ############################################################################### @@ -1521,7 +1529,7 @@ Zone Pacific/Wallis 12:15:20 - LMT 1901 # For now, we assume the Ladrones switched at the same time as the Philippines; # see Asia/Manila. 
-# US Public Law 106-564 (2000-12-23) made UTC+10 the official standard time, +# US Public Law 106-564 (2000-12-23) made UT +10 the official standard time, # under the name "Chamorro Standard Time". There is no official abbreviation, # but Congressman Robert A. Underwood, author of the bill that became law, # wrote in a press release (2000-12-27) that he will seek the use of "ChST". @@ -1533,15 +1541,15 @@ Zone Pacific/Wallis 12:15:20 - LMT 1901 # "I am certain, having lived there for the past decade, that 'Truk' # (now properly known as Chuuk) ... is in the time zone GMT+10." # -# Shanks & Pottenger write that Truk switched from UTC+10 to UTC+11 +# Shanks & Pottenger write that Truk switched from UT +10 to +11 # on 1978-10-01; ignore this for now. # From Paul Eggert (1999-10-29): # The Federated States of Micronesia Visitors Board writes in # The Federated States of Micronesia - Visitor Information (1999-01-26) # http://www.fsmgov.org/info/clocks.html -# that Truk and Yap are UTC+10, and Ponape and Kosrae are UTC+11. -# We don't know when Kosrae switched from UTC+12; assume January 1 for now. +# that Truk and Yap are UT +10, and Ponape and Kosrae are +11. +# We don't know when Kosrae switched from +12; assume January 1 for now. # Midway @@ -1607,11 +1615,11 @@ Zone Pacific/Wallis 12:15:20 - LMT 1901 # ordaining - by a masterpiece of diplomatic flattery - that # the Fourth of July should be celebrated twice in that year." -# Although Shanks & Pottenger says they both switched to UTC-11:30 -# in 1911, and to UTC-11 in 1950. many earlier sources give UTC-11 +# Although Shanks & Pottenger say they both switched to UT -11:30 +# in 1911, and to -11 in 1950, many earlier sources give -11 # for American Samoa, e.g., the US National Bureau of Standards # circular "Standard Time Throughout the World", 1932.
-# Assume American Samoa switched to UTC-11 in 1911, not 1950, +# Assume American Samoa switched to -11 in 1911, not 1950, # and that after 1950 they agreed until (western) Samoa skipped a # day in 2011. Assume also that the Samoas follow the US and New # Zealand's "ST"/"DT" style of daylight-saving abbreviations. diff --git a/contrib/tzdata/backward b/contrib/tzdata/backward index 8b0fef5..aa23dd8 100644 --- a/contrib/tzdata/backward +++ b/contrib/tzdata/backward @@ -23,6 +23,7 @@ Link America/Argentina/Mendoza America/Mendoza Link America/Toronto America/Montreal Link America/Rio_Branco America/Porto_Acre Link America/Argentina/Cordoba America/Rosario +Link America/Tijuana America/Santa_Isabel Link America/Denver America/Shiprock Link America/Port_of_Spain America/Virgin Link Pacific/Auckland Antarctica/South_Pole @@ -35,6 +36,7 @@ Link Asia/Shanghai Asia/Harbin Link Asia/Urumqi Asia/Kashgar Link Asia/Kathmandu Asia/Katmandu Link Asia/Macau Asia/Macao +Link Asia/Yangon Asia/Rangoon Link Asia/Ho_Chi_Minh Asia/Saigon Link Asia/Jerusalem Asia/Tel_Aviv Link Asia/Thimphu Asia/Thimbu diff --git a/contrib/tzdata/etcetera b/contrib/tzdata/etcetera index 3d271a5..99ad2c7 100644 --- a/contrib/tzdata/etcetera +++ b/contrib/tzdata/etcetera @@ -8,6 +8,13 @@ # need now for the entries that are not on UTC are for ships at sea # that cannot use POSIX TZ settings. +# Starting with POSIX 1003.1-2001, the entries below are all +# unnecessary as settings for the TZ environment variable. E.g., +# instead of TZ='Etc/GMT+4' one can use the POSIX setting TZ='<-04>+4'. +# +# Do not use a POSIX TZ setting like TZ='GMT+4', which is four hours +# behind GMT but uses the completely misleading abbreviation "GMT". 
+ Zone Etc/GMT 0 - GMT Zone Etc/UTC 0 - UTC Zone Etc/UCT 0 - UCT @@ -26,23 +33,13 @@ Link Etc/GMT Etc/GMT-0 Link Etc/GMT Etc/GMT+0 Link Etc/GMT Etc/GMT0 -# We use POSIX-style signs in the Zone names and the output abbreviations, +# Be consistent with POSIX TZ settings in the Zone names, # even though this is the opposite of what many people expect. # POSIX has positive signs west of Greenwich, but many people expect # positive signs east of Greenwich. For example, TZ='Etc/GMT+4' uses -# the abbreviation "GMT+4" and corresponds to 4 hours behind UT +# the abbreviation "-04" and corresponds to 4 hours behind UT # (i.e. west of Greenwich) even though many people would expect it to # mean 4 hours ahead of UT (i.e. east of Greenwich). -# -# In the draft 5 of POSIX 1003.1-200x, the angle bracket notation allows for -# TZ='<GMT-4>+4'; if you want time zone abbreviations conforming to -# ISO 8601 you can use TZ='<-0400>+4'. Thus the commonly-expected -# offset is kept within the angle bracket (and is used for display) -# while the POSIX sign is kept outside the angle bracket (and is used -# for calculation). -# -# Do not use a TZ setting like TZ='GMT+4', which is four hours behind -# GMT but uses the completely misleading abbreviation "GMT". # Earlier incarnations of this package were not POSIX-compliant, # and had lines such as @@ -51,30 +48,31 @@ Link Etc/GMT Etc/GMT0 # way does a # zic -l GMT-12 # so we moved the names into the Etc subdirectory. +# Also, the time zone abbreviations are now compatible with %z. 
-Zone Etc/GMT-14 14 - GMT-14 # 14 hours ahead of GMT -Zone Etc/GMT-13 13 - GMT-13 -Zone Etc/GMT-12 12 - GMT-12 -Zone Etc/GMT-11 11 - GMT-11 -Zone Etc/GMT-10 10 - GMT-10 -Zone Etc/GMT-9 9 - GMT-9 -Zone Etc/GMT-8 8 - GMT-8 -Zone Etc/GMT-7 7 - GMT-7 -Zone Etc/GMT-6 6 - GMT-6 -Zone Etc/GMT-5 5 - GMT-5 -Zone Etc/GMT-4 4 - GMT-4 -Zone Etc/GMT-3 3 - GMT-3 -Zone Etc/GMT-2 2 - GMT-2 -Zone Etc/GMT-1 1 - GMT-1 -Zone Etc/GMT+1 -1 - GMT+1 -Zone Etc/GMT+2 -2 - GMT+2 -Zone Etc/GMT+3 -3 - GMT+3 -Zone Etc/GMT+4 -4 - GMT+4 -Zone Etc/GMT+5 -5 - GMT+5 -Zone Etc/GMT+6 -6 - GMT+6 -Zone Etc/GMT+7 -7 - GMT+7 -Zone Etc/GMT+8 -8 - GMT+8 -Zone Etc/GMT+9 -9 - GMT+9 -Zone Etc/GMT+10 -10 - GMT+10 -Zone Etc/GMT+11 -11 - GMT+11 -Zone Etc/GMT+12 -12 - GMT+12 +Zone Etc/GMT-14 14 - +14 +Zone Etc/GMT-13 13 - +13 +Zone Etc/GMT-12 12 - +12 +Zone Etc/GMT-11 11 - +11 +Zone Etc/GMT-10 10 - +10 +Zone Etc/GMT-9 9 - +09 +Zone Etc/GMT-8 8 - +08 +Zone Etc/GMT-7 7 - +07 +Zone Etc/GMT-6 6 - +06 +Zone Etc/GMT-5 5 - +05 +Zone Etc/GMT-4 4 - +04 +Zone Etc/GMT-3 3 - +03 +Zone Etc/GMT-2 2 - +02 +Zone Etc/GMT-1 1 - +01 +Zone Etc/GMT+1 -1 - -01 +Zone Etc/GMT+2 -2 - -02 +Zone Etc/GMT+3 -3 - -03 +Zone Etc/GMT+4 -4 - -04 +Zone Etc/GMT+5 -5 - -05 +Zone Etc/GMT+6 -6 - -06 +Zone Etc/GMT+7 -7 - -07 +Zone Etc/GMT+8 -8 - -08 +Zone Etc/GMT+9 -9 - -09 +Zone Etc/GMT+10 -10 - -10 +Zone Etc/GMT+11 -11 - -11 +Zone Etc/GMT+12 -12 - -12 diff --git a/contrib/tzdata/europe b/contrib/tzdata/europe index 358a048..6020059 100644 --- a/contrib/tzdata/europe +++ b/contrib/tzdata/europe @@ -75,8 +75,7 @@ # 1:00 CET CEST CEMT Central Europe # 1:00:14 SET Swedish (1879-1899)* # 2:00 EET EEST Eastern Europe -# 3:00 FET Further-eastern Europe (2011-2014)* -# 3:00 MSK MSD MSM* Minsk, Moscow +# 3:00 MSK MSD Moscow # From Peter Ilieve (1994-12-04), # The original six [EU members]: Belgium, France, (West) Germany, Italy, @@ -583,16 +582,33 @@ Rule E-Eur 1979 1995 - Sep lastSun 0:00 0 - Rule E-Eur 1981 max - Mar lastSun 0:00 1:00 S Rule E-Eur 1996 max - 
Oct lastSun 0:00 0 - + +# Daylight saving time for Russia and the Soviet Union +# +# The 1917-1921 decree URLs are from Alexander Belopolsky (2016-08-23). + # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Russia 1917 only - Jul 1 23:00 1:00 MST # Moscow Summer Time +# +# Decree No. 142 (1917-12-22) http://istmat.info/node/28137 Rule Russia 1917 only - Dec 28 0:00 0 MMT # Moscow Mean Time +# +# Decree No. 497 (1918-05-30) http://istmat.info/node/30001 Rule Russia 1918 only - May 31 22:00 2:00 MDST # Moscow Double Summer Time Rule Russia 1918 only - Sep 16 1:00 1:00 MST +# +# Decree No. 258 (1919-05-29) http://istmat.info/node/37949 Rule Russia 1919 only - May 31 23:00 2:00 MDST -Rule Russia 1919 only - Jul 1 2:00 1:00 MSD +# +Rule Russia 1919 only - Jul 1 0:00u 1:00 MSD Rule Russia 1919 only - Aug 16 0:00 0 MSK +# +# Decree No. 63 (1921-02-03) http://istmat.info/node/45840 Rule Russia 1921 only - Feb 14 23:00 1:00 MSD -Rule Russia 1921 only - Mar 20 23:00 2:00 MSM # Midsummer +# +# Decree No. 121 (1921-03-07) http://istmat.info/node/45949 +Rule Russia 1921 only - Mar 20 23:00 2:00 +05 +# Rule Russia 1921 only - Sep 1 0:00 1:00 MSD Rule Russia 1921 only - Oct 1 0:00 0 - # Act No. 925 of the Council of Ministers of the USSR (1980-10-24): @@ -600,16 +616,40 @@ Rule Russia 1981 1984 - Apr 1 0:00 1:00 S Rule Russia 1981 1983 - Oct 1 0:00 0 - # Act No. 967 of the Council of Ministers of the USSR (1984-09-13), repeated in # Act No. 
227 of the Council of Ministers of the USSR (1989-03-14): -Rule Russia 1984 1991 - Sep lastSun 2:00s 0 - -Rule Russia 1985 1991 - Mar lastSun 2:00s 1:00 S +Rule Russia 1984 1995 - Sep lastSun 2:00s 0 - +Rule Russia 1985 2010 - Mar lastSun 2:00s 1:00 S # -Rule Russia 1992 only - Mar lastSat 23:00 1:00 S -Rule Russia 1992 only - Sep lastSat 23:00 0 - -Rule Russia 1993 2010 - Mar lastSun 2:00s 1:00 S -Rule Russia 1993 1995 - Sep lastSun 2:00s 0 - Rule Russia 1996 2010 - Oct lastSun 2:00s 0 - # As described below, Russia's 2014 change affects Zone data, not Rule data. +# From Stepan Golosunov (2016-03-07): +# Wikipedia and other sources refer to the Act of the Council of +# Ministers of the USSR from 1988-01-04 No. 5 and the Act of the +# Council of Ministers of the USSR from 1989-03-14 No. 227. +# +# I did not find full texts of these acts. For the 1989 one we have +# title at http://base.garant.ru/70754136/ : +# "About change in calculation of time on the territories of +# Lithuanian SSR, Latvian SSR and Estonian SSR, Astrakhan, +# Kaliningrad, Kirov, Kuybyshev, Ulyanovsk and Uralsk oblasts". +# And http://astrozet.net/files/Zones/DOC/RU/1980-925.txt appears to +# contain quotes from both acts: Since last Sunday of March 1988 rules +# of the second time belt are installed in Volgograd and Saratov +# oblasts. Since last Sunday of March 1989: +# a) Lithuanian SSR, Latvian SSR, Estonian SSR, Kaliningrad oblast: +# second time belt rules without extra hour (Moscow-1); +# b) Astrakhan, Kirov, Kuybyshev, Ulyanovsk oblasts: second time belt +# rules (Moscow time) +# c) Uralsk oblast: third time belt rules (Moscow+1). + +# From Stepan Golosunov (2016-03-27): +# Unamended version of the act of the +# Government of the Russian Federation No. 23 from 08.01.1992 +# http://pravo.gov.ru/proxy/ips/?docbody=&nd=102014034&rdk=0 +# says that every year clocks were to be moved forward on last Sunday +# of March at 2 hours and moved backwards on last Sunday of September +# at 3 hours. 
It was amended in 1996 to replace September with October. + # From Alexander Krivenyshev (2011-06-14): # According to Kremlin press service, Russian President Dmitry Medvedev # signed a federal law "On calculation of time" on June 9, 2011. @@ -731,6 +771,14 @@ Zone Europe/Vienna 1:05:21 - LMT 1893 Apr 1:00 EU CE%sT # Belarus +# +# From Stepan Golosunov (2016-07-02): +# http://www.lawbelarus.com/repub/sub30/texf9611.htm +# (Act of the Cabinet of Ministers of the Republic of Belarus from +# 1992-03-25 No. 157) ... says clocks were to be moved forward at 2:00 +# on last Sunday of March and backward at 3:00 on last Sunday of September +# (the same as previous USSR and contemporary Russian regulations). +# # From Yauhen Kharuzhy (2011-09-16): # By latest Belarus government act Europe/Minsk timezone was changed to # GMT+3 without DST (was GMT+2 with DST). @@ -743,8 +791,6 @@ Zone Europe/Vienna 1:05:21 - LMT 1893 Apr # From Alexander Bokovoy (2014-10-09): # Belarussian government decided against changing to winter time.... # http://eng.belta.by/all_news/society/Belarus-decides-against-adjusting-time-in-Russias-wake_i_76335.html -# From Paul Eggert (2014-10-08): -# Hence Belarus can share time zone abbreviations with Moscow again. # # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Europe/Minsk 1:50:16 - LMT 1880 @@ -754,12 +800,8 @@ Zone Europe/Minsk 1:50:16 - LMT 1880 1:00 C-Eur CE%sT 1944 Jul 3 3:00 Russia MSK/MSD 1990 3:00 - MSK 1991 Mar 31 2:00s - 2:00 1:00 EEST 1991 Sep 29 2:00s - 2:00 - EET 1992 Mar 29 0:00s - 2:00 1:00 EEST 1992 Sep 27 0:00s 2:00 Russia EE%sT 2011 Mar 27 2:00s - 3:00 - FET 2014 Oct 26 1:00s - 3:00 - MSK + 3:00 - +03 # Belgium # @@ -1005,6 +1047,12 @@ Zone Atlantic/Faroe -0:27:04 - LMT 1908 Jan 11 # Tórshavn # startkart.no says Thule does not observe DST, but this is clearly an error, # so go with Shanks & Pottenger for Thule transitions until this year. # For 2007 on assume Thule will stay in sync with US DST rules. 
+ +# From J William Piggott (2016-02-20): +# "Greenland north of the community of Scoresbysund" is officially named +# "National Park" by Executive Order: +# http://naalakkersuisut.gl/~/media/Nanoq/Files/Attached%20Files/Engelske-tekster/Legislation/Executive%20Order%20National%20Park.rtf +# It is their only National Park. # # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Thule 1991 1992 - Mar lastSun 2:00 1:00 D @@ -1030,6 +1078,10 @@ Zone America/Thule -4:35:08 - LMT 1916 Jul 28 # Pituffik air base -4:00 Thule A%sT # Estonia +# +# From Paul Eggert (2016-03-18): +# The 1989 transition is from USSR act No. 227 (1989-03-14). +# # From Peter Ilieve (1994-10-15): # A relative in Tallinn confirms the accuracy of the data for 1989 onwards # [through 1994] and gives the legal authority for it, @@ -1257,7 +1309,7 @@ Zone Europe/Paris 0:09:21 - LMT 1891 Mar 15 0:01 # http://www.parlament-berlin.de/pds-fraktion.nsf/727459127c8b66ee8525662300459099/defc77cb784f180ac1256c2b0030274b/$FILE/bersarint.pdf # says that Bersarin issued an order to use Moscow time on May 20. # However, Moscow did not observe daylight saving in 1945, so -# this was equivalent to CEMT (GMT+3), not GMT+4. +# this was equivalent to UT +03, not +04. # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S @@ -1623,6 +1675,9 @@ Link Europe/Zurich Europe/Vaduz # Lithuania +# From Paul Eggert (2016-03-18): +# The 1989 transition is from USSR act No. 227 (1989-03-14). + # From Paul Eggert (1996-11-22): # IATA SSIM (1992/1996) says Lithuania uses W-Eur rules, but since it is # known to be wrong about Estonia and Latvia, assume it's wrong here too. 
@@ -1662,8 +1717,8 @@ Zone Europe/Vilnius 1:41:16 - LMT 1880 1:00 - CET 1940 Aug 3 3:00 - MSK 1941 Jun 24 1:00 C-Eur CE%sT 1944 Aug - 3:00 Russia MSK/MSD 1991 Mar 31 2:00s - 2:00 1:00 EEST 1991 Sep 29 2:00s + 3:00 Russia MSK/MSD 1989 Mar 26 2:00s + 2:00 Russia EE%sT 1991 Sep 29 2:00s 2:00 C-Eur EE%sT 1998 2:00 - EET 1998 Mar 29 1:00u 1:00 EU CE%sT 1999 Oct 31 1:00u @@ -1728,6 +1783,16 @@ Zone Europe/Malta 0:58:04 - LMT 1893 Nov 2 0:00s # Valletta # Moldova +# From Stepan Golosunov (2016-03-07): +# the act of the government of the Republic of Moldova Nr. 132 from 1990-05-04 +# http://lex.justice.md/viewdoc.php?action=view&view=doc&id=298782&lang=2 +# ... says that since 1990-05-06 on the territory of the Moldavian SSR +# time would be calculated as the standard time of the second time belt +# plus one hour of the "summer" time. To implement that clocks would be +# adjusted one hour backwards at 1990-05-06 2:00. After that "summer" +# time would be cancelled last Sunday of September at 3:00 and +# reintroduced last Sunday of March at 2:00. + # From Paul Eggert (2006-03-22): # A previous version of this database followed Shanks & Pottenger, who write # that Tiraspol switched to Moscow time on 1992-01-19 at 02:00. @@ -1786,9 +1851,7 @@ Zone Europe/Chisinau 1:55:20 - LMT 1880 2:00 Romania EE%sT 1940 Aug 15 2:00 1:00 EEST 1941 Jul 17 1:00 C-Eur CE%sT 1944 Aug 24 - 3:00 Russia MSK/MSD 1990 - 3:00 - MSK 1990 May 6 - 2:00 - EET 1991 + 3:00 Russia MSK/MSD 1990 May 6 2:00 2:00 Russia EE%sT 1992 2:00 E-Eur EE%sT 1997 # See Romania commentary for the guessed 1997 transition to EU rules. @@ -2210,7 +2273,6 @@ Zone Europe/Bucharest 1:44:24 - LMT 1891 Oct # http://www.worldtimezone.com/dst_news/dst_news_russia-map-2014-07.html # From Paul Eggert (2006-03-22): -# Except for Moscow after 1919-07-01, I invented the time zone abbreviations. # Moscow time zone abbreviations after 1919-07-01, and Moscow rules after 1991, # are from Andrey A. Chernov. 
The rest is from Shanks & Pottenger, # except we follow Chernov's report that 1992 DST transitions were Sat @@ -2266,16 +2328,32 @@ Zone Europe/Bucharest 1:44:24 - LMT 1891 Oct # Europe/Kaliningrad covers... # 39 RU-KGD Kaliningrad Oblast +# From Paul Eggert (2016-03-18): +# The 1989 transition is from USSR act No. 227 (1989-03-14). + +# From Stepan Golosunov (2016-03-07): +# http://www.rgo.ru/ru/kaliningradskoe-oblastnoe-otdelenie/ob-otdelenii/publikacii/kak-nam-zhilos-bez-letnego-vremeni +# confirms that the 1989 change to Moscow-1 was implemented. +# (The article, though, is misattributed to 1990 while saying that +# summer->winter transition would be done on the 24 of September. But +# 1990-09-24 was Monday, while 1989-09-24 was Sunday as expected.) +# ... +# http://www.kaliningradka.ru/site_pc/cherez/index.php?ELEMENT_ID=40091 +# says that Kaliningrad switched to Moscow-1 on 1989-03-26, avoided +# at the last moment switch to Moscow-1 on 1991-03-31, switched to +# Moscow on 1991-11-03, switched to Moscow-1 on 1992-01-19. + Zone Europe/Kaliningrad 1:22:00 - LMT 1893 Apr 1:00 C-Eur CE%sT 1945 2:00 Poland CE%sT 1946 - 3:00 Russia MSK/MSD 1991 Mar 31 2:00s + 3:00 Russia MSK/MSD 1989 Mar 26 2:00s 2:00 Russia EE%sT 2011 Mar 27 2:00s - 3:00 - FET 2014 Oct 26 2:00s + 3:00 - +03 2014 Oct 26 2:00s 2:00 - EET -# From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): +# From Paul Eggert (2016-02-21), per Tim Parenti (2014-07-03) and +# Oscar van Vlijmen (2001-08-25): # Europe/Moscow covers... # 01 RU-AD Adygea, Republic of # 05 RU-DA Dagestan, Republic of @@ -2318,12 +2396,102 @@ Zone Europe/Kaliningrad 1:22:00 - LMT 1893 Apr # 68 RU-TAM Tambov Oblast # 69 RU-TVE Tver Oblast # 71 RU-TUL Tula Oblast -# 73 RU-ULY Ulyanovsk Oblast # 76 RU-YAR Yaroslavl Oblast # 77 RU-MOW Moscow # 78 RU-SPE Saint Petersburg # 83 RU-NEN Nenets Autonomous Okrug +# From Paul Eggert (2016-08-23): +# The Soviets switched to UT-based time in 1919. Decree No. 
59 +# (1919-02-08) http://istmat.info/node/35567 established UT-based time +# zones, and Decree No. 147 (1919-03-29) http://istmat.info/node/35854 +# specified a transition date of 1919-07-01, apparently at 00:00 UT. +# No doubt only the Soviet-controlled regions switched on that date; +# later transitions to UT-based time in other parts of Russia are +# taken from what appear to be guesses by Shanks. +# (Thanks to Alexander Belopolsky for pointers to the decrees.) + +# From Stepan Golosunov (2016-03-07): +# 11. Regions-violators, 1981-1982. +# Wikipedia refers to +# http://maps.monetonos.ru/maps/raznoe/Old_Maps/Old_Maps/Articles/022/3_1981.html +# http://besp.narod.ru/nauka_1981_3.htm +# +# The second link provides two articles scanned from the Nauka i Zhizn +# magazine No. 3, 1981 and a scan of the short article attributed to +# the Trud newspaper from February 1982. The first link provides the +# same Nauka i Zhizn articles converted to the text form (but misses +# time belt changes map). +# +# The second Nauka i Zhizn article says that in addition to +# introduction of summer time on 1981-04-01 there are some time belt +# border changes on 1981-10-01, mostly affecting Nenets Autonomous +# Okrug, Krasnoyarsk Krai, Yakutia, Magadan Oblast and Chukotka +# according to the provided map (colored one). In addition to that +# "time violators" (regions which were not using rules of the time +# belts in which they were located) would not be moving off the DST on +# 1981-10-01 to restore the decree time usage. (Komi ASSR was +# supposed to repeat that move in October 1982 to account for the 2 +# hour difference.) Map depicting "time violators" before 1981-10-01 +# is also provided. +# +# The article from Trud says that 1981-10-01 changes caused problems +# and some territories would be moved to pre-1981-10-01 time by not +# moving to summer time on 1982-04-01. 
Namely: Dagestan, +# Kabardino-Balkar, Kalmyk, Komi, Mari, Mordovian, North Ossetian, +# Tatar, Chechen-Ingush and Chuvash ASSR, Krasnodar and Stavropol +# krais, Arkhangelsk, Vladimir, Vologda, Voronezh, Gorky, Ivanovo, +# Kostroma, Lipetsk, Penza, Rostov, Ryazan, Tambov, Tyumen and +# Yaroslavl oblasts, Nenets and Evenk autonomous okrugs, Khatangsky +# district of Taymyr Autonomous Okrug. As a result Evenk Autonomous +# Okrug and Khatangsky district of Taymyr Autonomous Okrug would end +# up on Moscow+4, Tyumen Oblast on Moscow+2 and the rest on Moscow +# time. +# +# http://astrozet.net/files/Zones/DOC/RU/1980-925.txt +# attributes the 1982 changes to the Act of the Council of Ministers +# of the USSR No. 126 from 18.02.1982. 1980-925.txt also adds +# Udmurtia to the list of affected territories and lists Khatangsky +# district separately from Taymyr Autonomous Okrug. Probably erroneously. +# +# The affected territories are currently listed under Europe/Moscow, +# Asia/Yekaterinburg and Asia/Krasnoyarsk. +# +# 12. Udmurtia +# The fact that Udmurtia is depicted as a violator in the Nauka i +# Zhizn article hints at Izhevsk being on different time from +# Kuybyshev before 1981-10-01. Udmurtia is not mentioned in the 1989 act. +# http://astrozet.net/files/Zones/DOC/RU/1980-925.txt +# implies Udmurtia was on Moscow time after 1982-04-01. +# Wikipedia implies Udmurtia being on Moscow+1 until 1991. +# +# ... +# +# All Russian zones are supposed to have by default a -1 change at +# 1991-03-31 2:00 (cancellation of the decree time in the USSR) and a +1 +# change at 1992-01-19 2:00 (restoration of the decree time in Russia). +# +# There were some exceptions, though. +# Wikipedia says newspapers listed Astrakhan, Saratov, Kirov, Volgograd, +# Izhevsk, Grozny, Kazan and Samara as such exceptions for the 1992 +# change. (Different newspapers providing different lists. And some +# lists found in the internet are quite wild.) 
+# +# And apparently some exceptions were reverted in the last moment. +# http://www.kaliningradka.ru/site_pc/cherez/index.php?ELEMENT_ID=40091 +# says that Kaliningrad decided not to be an exception 2 days before the +# 1991-03-31 switch and one person at +# http://izhevsk.ru/forum_light_message/50/682597-m8369040.html +# says he remembers that Samara opted out of the 1992-01-19 exception +# 2 days before the switch. +# +# +# From Paul Eggert (2016-03-18): +# Given the above, we appear to be missing some Zone entries for the +# chaotic early 1980s in Russia. It's not clear what these entries +# should be. For now, sweep this under the rug and just document the +# time in Moscow. + # From Vladimir Karpinsky (2014-07-08): # LMT in Moscow (before Jul 3, 1916) is 2:30:17, that was defined by Moscow # Observatory (coordinates: 55 deg. 45'29.70", 37 deg. 34'05.30").... @@ -2344,7 +2512,7 @@ Zone Europe/Kaliningrad 1:22:00 - LMT 1893 Apr Zone Europe/Moscow 2:30:17 - LMT 1880 2:30:17 - MMT 1916 Jul 3 # Moscow Mean Time - 2:31:19 Russia %s 1919 Jul 1 2:00 + 2:31:19 Russia %s 1919 Jul 1 0:00u 3:00 Russia %s 1921 Oct 3:00 Russia MSK/MSD 1922 Oct 2:00 - EET 1930 Jun 21 @@ -2397,47 +2565,101 @@ Zone Europe/Simferopol 2:16:24 - LMT 1880 3:00 - MSK -# From Tim Parenti (2014-07-03): -# Europe/Volgograd covers... +# From Paul Eggert (2016-03-18): +# Europe/Astrakhan covers: # 30 RU-AST Astrakhan Oblast +# +# The 1989 transition is from USSR act No. 227 (1989-03-14). + +# From Alexander Krivenyshev (2016-01-12): +# On February 10, 2016 Astrakhan Oblast got approval by the Federation +# Council to change its time zone to UTC+4 (from current UTC+3 Moscow time).... +# This Federal Law shall enter into force on 27 March 2016 at 02:00. 
+# From Matt Johnson (2016-03-09): +# http://publication.pravo.gov.ru/Document/View/0001201602150056 + +Zone Europe/Astrakhan 3:12:12 - LMT 1924 May + 3:00 - +03 1930 Jun 21 + 4:00 Russia +04/+05 1989 Mar 26 2:00s + 3:00 Russia +03/+04 1991 Mar 31 2:00s + 4:00 - +04 1992 Mar 29 2:00s + 3:00 Russia +03/+04 2011 Mar 27 2:00s + 4:00 - +04 2014 Oct 26 2:00s + 3:00 - +03 2016 Mar 27 2:00s + 4:00 - +04 + +# From Paul Eggert (2016-03-18): +# Europe/Volgograd covers: # 34 RU-VGG Volgograd Oblast -# 43 RU-KIR Kirov Oblast # 64 RU-SAR Saratov Oblast - -# From Paul Eggert (2006-05-09): -# Shanks & Pottenger say Kirov is still at +0400 but Wikipedia says +0300. -# Perhaps it switched after the others? But we have no data. +# The 1988 transition is from USSR act No. 5 (1988-01-04). Zone Europe/Volgograd 2:57:40 - LMT 1920 Jan 3 - 3:00 - TSAT 1925 Apr 6 # Tsaritsyn Time - 3:00 - STAT 1930 Jun 21 # Stalingrad Time - 4:00 - STAT 1961 Nov 11 - 4:00 Russia VOL%sT 1989 Mar 26 2:00s # Volgograd T - 3:00 Russia VOL%sT 1991 Mar 31 2:00s - 4:00 - VOLT 1992 Mar 29 2:00s - 3:00 Russia MSK/MSD 2011 Mar 27 2:00s - 4:00 - MSK 2014 Oct 26 2:00s - 3:00 - MSK - + 3:00 - +03 1930 Jun 21 + 4:00 - +04 1961 Nov 11 + 4:00 Russia +04/+05 1988 Mar 27 2:00s + 3:00 Russia +03/+04 1991 Mar 31 2:00s + 4:00 - +04 1992 Mar 29 2:00s + 3:00 Russia +03/+04 2011 Mar 27 2:00s + 4:00 - +04 2014 Oct 26 2:00s + 3:00 - +03 + +# From Paul Eggert (2016-03-18): +# Europe/Kirov covers: +# 43 RU-KIR Kirov Oblast +# The 1989 transition is from USSR act No. 227 (1989-03-14). +# +Zone Europe/Kirov 3:18:48 - LMT 1919 Jul 1 0:00u + 3:00 - +03 1930 Jun 21 + 4:00 Russia +04/+05 1989 Mar 26 2:00s + 3:00 Russia +03/+04 1991 Mar 31 2:00s + 4:00 - +04 1992 Mar 29 2:00s + 3:00 Russia +03/+04 2011 Mar 27 2:00s + 4:00 - +04 2014 Oct 26 2:00s + 3:00 - +03 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): # Europe/Samara covers... 
# 18 RU-UD Udmurt Republic # 63 RU-SAM Samara Oblast +# From Paul Eggert (2016-03-18): # Byalokoz 1919 says Samara was 3:20:20. +# The 1989 transition is from USSR act No. 227 (1989-03-14). + +Zone Europe/Samara 3:20:20 - LMT 1919 Jul 1 0:00u + 3:00 - +03 1930 Jun 21 + 4:00 - +04 1935 Jan 27 + 4:00 Russia +04/+05 1989 Mar 26 2:00s + 3:00 Russia +03/+04 1991 Mar 31 2:00s + 2:00 Russia +02/+03 1991 Sep 29 2:00s + 3:00 - +03 1991 Oct 20 3:00 + 4:00 Russia +04/+05 2010 Mar 28 2:00s + 3:00 Russia +03/+04 2011 Mar 27 2:00s + 4:00 - +04 + +# From Paul Eggert (2016-03-18): +# Europe/Ulyanovsk covers: +# 73 RU-ULY Ulyanovsk Oblast -Zone Europe/Samara 3:20:20 - LMT 1919 Jul 1 2:00 - 3:00 - SAMT 1930 Jun 21 - 4:00 - SAMT 1935 Jan 27 - 4:00 Russia KUY%sT 1989 Mar 26 2:00s # Kuybyshev - 3:00 Russia MSK/MSD 1991 Mar 31 2:00s - 2:00 Russia EE%sT 1991 Sep 29 2:00s - 3:00 - KUYT 1991 Oct 20 3:00 - 4:00 Russia SAM%sT 2010 Mar 28 2:00s # Samara Time - 3:00 Russia SAM%sT 2011 Mar 27 2:00s - 4:00 - SAMT - +# The 1989 transition is from USSR act No. 227 (1989-03-14). + +# From Alexander Krivenyshev (2016-02-17): +# Ulyanovsk ... on their way to change time zones by March 27, 2016 at 2am. +# Ulyanovsk Oblast ... from MSK to MSK+1 (UTC+3 to UTC+4) ... +# 920582-6 ... 02/17/2016 The State Duma passed the bill in the first reading. +# From Matt Johnson (2016-03-09): +# http://publication.pravo.gov.ru/Document/View/0001201603090051 + +Zone Europe/Ulyanovsk 3:13:36 - LMT 1919 Jul 1 0:00u + 3:00 - +03 1930 Jun 21 + 4:00 Russia +04/+05 1989 Mar 26 2:00s + 3:00 Russia +03/+04 1991 Mar 31 2:00s + 2:00 Russia +02/+03 1992 Jan 19 2:00s + 3:00 Russia +03/+04 2011 Mar 27 2:00s + 4:00 - +04 2014 Oct 26 2:00s + 3:00 - +03 2016 Mar 27 2:00s + 4:00 - +04 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): # Asia/Yekaterinburg covers... 
@@ -2461,47 +2683,140 @@ Zone Europe/Samara 3:20:20 - LMT 1919 Jul 1 2:00 Zone Asia/Yekaterinburg 4:02:33 - LMT 1916 Jul 3 3:45:05 - PMT 1919 Jul 15 4:00 - 4:00 - SVET 1930 Jun 21 # Sverdlovsk Time - 5:00 Russia SVE%sT 1991 Mar 31 2:00s - 4:00 Russia SVE%sT 1992 Jan 19 2:00s - 5:00 Russia YEK%sT 2011 Mar 27 2:00s - 6:00 - YEKT 2014 Oct 26 2:00s - 5:00 - YEKT + 4:00 - +04 1930 Jun 21 + 5:00 Russia +05/+06 1991 Mar 31 2:00s + 4:00 Russia +04/+05 1992 Jan 19 2:00s + 5:00 Russia +05/+06 2011 Mar 27 2:00s + 6:00 - +06 2014 Oct 26 2:00s + 5:00 - +05 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): # Asia/Omsk covers... -# 04 RU-AL Altai Republic -# 22 RU-ALT Altai Krai # 55 RU-OMS Omsk Oblast # Byalokoz 1919 says Omsk was 4:53:30. Zone Asia/Omsk 4:53:30 - LMT 1919 Nov 14 - 5:00 - OMST 1930 Jun 21 # Omsk Time - 6:00 Russia OMS%sT 1991 Mar 31 2:00s - 5:00 Russia OMS%sT 1992 Jan 19 2:00s - 6:00 Russia OMS%sT 2011 Mar 27 2:00s - 7:00 - OMST 2014 Oct 26 2:00s - 6:00 - OMST - + 5:00 - +05 1930 Jun 21 + 6:00 Russia +06/+07 1991 Mar 31 2:00s + 5:00 Russia +05/+06 1992 Jan 19 2:00s + 6:00 Russia +06/+07 2011 Mar 27 2:00s + 7:00 - +07 2014 Oct 26 2:00s + 6:00 - +06 + +# From Paul Eggert (2016-02-22): +# Asia/Barnaul covers: +# 04 RU-AL Altai Republic +# 22 RU-ALT Altai Krai -# From Tim Parenti (2014-07-03): -# Asia/Novosibirsk covers... +# Data before 1991 are from Shanks & Pottenger. + +# From Stepan Golosunov (2016-03-07): +# Letter of Bank of Russia from 1995-05-25 +# http://www.bestpravo.ru/rossijskoje/lj-akty/y3a.htm +# suggests that Altai Republic transitioned to Moscow+3 on +# 1995-05-28. 
+# +# http://regnum.ru/news/society/1957270.html +# has some historical data for Altai Krai: +# before 1957: west part on UTC+6, east on UTC+7 +# after 1957: UTC+7 +# since 1995: UTC+6 +# http://barnaul.rusplt.ru/index/pochemu_altajskij_kraj_okazalsja_v_neprivychnom_chasovom_pojase-17648.html +# confirms that and provides more details including 1995-05-28 transition date. + +# From Alexander Krivenyshev (2016-02-17): +# Altai Krai and Altai Republic on their way to change time zones +# by March 27, 2016 at 2am.... +# Altai Republic / Gorno-Altaysk MSK+3 to MSK+4 (UTC+6 to UTC+7) ... +# Altai Krai / Barnaul MSK+3 to MSK+4 (UTC+6 to UTC+7) +# From Matt Johnson (2016-03-09): +# http://publication.pravo.gov.ru/Document/View/0001201603090043 +# http://publication.pravo.gov.ru/Document/View/0001201603090038 + +Zone Asia/Barnaul 5:35:00 - LMT 1919 Dec 10 + 6:00 - +06 1930 Jun 21 + 7:00 Russia +07/+08 1991 Mar 31 2:00s + 6:00 Russia +06/+07 1992 Jan 19 2:00s + 7:00 Russia +07/+08 1995 May 28 + 6:00 Russia +06/+07 2011 Mar 27 2:00s + 7:00 - +07 2014 Oct 26 2:00s + 6:00 - +06 2016 Mar 27 2:00s + 7:00 - +07 + +# From Paul Eggert (2016-03-18): +# Asia/Novosibirsk covers: # 54 RU-NVS Novosibirsk Oblast -# 70 RU-TOM Tomsk Oblast -# From Paul Eggert (2006-08-19): I'm guessing about Tomsk here; it's -# not clear when it switched from +7 to +6. +# From Stepan Golosunov (2016-05-30): +# http://asozd2.duma.gov.ru/main.nsf/(Spravka)?OpenAgent&RN=1085784-6 +# moves Novosibirsk oblast from UTC+6 to UTC+7. +# From Stepan Golosunov (2016-07-04): +# The law was signed yesterday and published today on +# http://publication.pravo.gov.ru/Document/View/0001201607040064 Zone Asia/Novosibirsk 5:31:40 - LMT 1919 Dec 14 6:00 - 6:00 - NOVT 1930 Jun 21 # Novosibirsk Time - 7:00 Russia NOV%sT 1991 Mar 31 2:00s - 6:00 Russia NOV%sT 1992 Jan 19 2:00s - 7:00 Russia NOV%sT 1993 May 23 # say Shanks & P. 
- 6:00 Russia NOV%sT 2011 Mar 27 2:00s - 7:00 - NOVT 2014 Oct 26 2:00s - 6:00 - NOVT + 6:00 - +06 1930 Jun 21 + 7:00 Russia +07/+08 1991 Mar 31 2:00s + 6:00 Russia +06/+07 1992 Jan 19 2:00s + 7:00 Russia +07/+08 1993 May 23 # say Shanks & P. + 6:00 Russia +06/+07 2011 Mar 27 2:00s + 7:00 - +07 2014 Oct 26 2:00s + 6:00 - +06 2016 Jul 24 2:00s + 7:00 - +07 + +# From Paul Eggert (2016-03-18): +# Asia/Tomsk covers: +# 70 RU-TOM Tomsk Oblast + +# From Stepan Golosunov (2016-03-24): +# Byalokoz listed Tomsk at 5:39:51. + +# From Stanislaw A. Kuzikowski (1994-06-29): +# Tomsk is still 4 hours ahead of Moscow. + +# From Stepan Golosunov (2016-03-19): +# http://pravo.gov.ru/proxy/ips/?docbody=&nd=102075743 +# (fifth time belt being UTC+5+1(decree time) +# / UTC+5+1(decree time)+1(summer time)) ... +# Note that time belts (numbered from 2 (Moscow) to 12 according to their +# GMT/UTC offset and having too many exceptions like regions formally +# belonging to one belt but using time from another) were replaced +# with time zones in 2011 with different numbering (there was a +# 2-hour gap between second and third zones in 2011-2014). + +# From Stepan Golosunov (2016-04-12): +# http://asozd2.duma.gov.ru/main.nsf/(SpravkaNew)?OpenAgent&RN=1006865-6 +# This bill was approved in the first reading today. It moves Tomsk oblast +# from UTC+6 to UTC+7 and is supposed to come into effect on 2016-05-29 at +# 2:00. The bill needs to be approved in the second and the third readings by +# the State Duma, approved by the Federation Council, signed by the President +# and published to become a law. Minor changes in the text are to be expected +# before the second reading (references need to be updated to account for the +# recent changes). 
+# +# Judging by the ultra-short one-day amendments period, recent similar laws, +# the State Duma schedule and the Federation Council schedule +# http://www.duma.gov.ru/legislative/planning/day-shedule/por_vesna_2016/ +# http://council.gov.ru/activity/meetings/schedule/63303 +# I speculate that the final text of the bill will be proposed tomorrow, the +# bill will be approved in the second and the third readings on Friday, +# approved by the Federation Council on 2016-04-20, signed by the President and +# published as a law around 2016-04-26. + +# From Matt Johnson (2016-04-26): +# http://publication.pravo.gov.ru/Document/View/0001201604260048 + +Zone Asia/Tomsk 5:39:51 - LMT 1919 Dec 22 + 6:00 - +06 1930 Jun 21 + 7:00 Russia +07/+08 1991 Mar 31 2:00s + 6:00 Russia +06/+07 1992 Jan 19 2:00s + 7:00 Russia +07/+08 2002 May 1 3:00 + 6:00 Russia +06/+07 2011 Mar 27 2:00s + 7:00 - +07 2014 Oct 26 2:00s + 6:00 - +06 2016 May 29 2:00s + 7:00 - +07 # From Tim Parenti (2014-07-03): @@ -2526,23 +2841,18 @@ Zone Asia/Novosibirsk 5:31:40 - LMT 1919 Dec 14 6:00 # # Thus, when Russia will switch to DST on the night of March 28, 2010 # Kemerovo region (Kemerovo oblast') will not change the clock. -# -# As a result, Kemerovo oblast' will be in the same time zone as -# Novosibirsk, Omsk, Tomsk, Barnaul and Altai Republic. # From Tim Parenti (2014-07-02), per Alexander Krivenyshev (2014-07-02): # The Kemerovo region will remain at UTC+7 through the 2014-10-26 change, thus # realigning itself with KRAT. 
Zone Asia/Novokuznetsk 5:48:48 - LMT 1924 May 1 - 6:00 - KRAT 1930 Jun 21 # Krasnoyarsk Time - 7:00 Russia KRA%sT 1991 Mar 31 2:00s - 6:00 Russia KRA%sT 1992 Jan 19 2:00s - 7:00 Russia KRA%sT 2010 Mar 28 2:00s - 6:00 Russia NOV%sT 2011 Mar 27 2:00s # Novosibirsk - 7:00 - NOVT 2014 Oct 26 2:00s - 7:00 - KRAT # Krasnoyarsk Time - + 6:00 - +06 1930 Jun 21 + 7:00 Russia +07/+08 1991 Mar 31 2:00s + 6:00 Russia +06/+07 1992 Jan 19 2:00s + 7:00 Russia +07/+08 2010 Mar 28 2:00s + 6:00 Russia +06/+07 2011 Mar 27 2:00s + 7:00 - +07 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): # Asia/Krasnoyarsk covers... @@ -2556,12 +2866,12 @@ Zone Asia/Novokuznetsk 5:48:48 - LMT 1924 May 1 # Byalokoz 1919 says Krasnoyarsk was 6:11:26. Zone Asia/Krasnoyarsk 6:11:26 - LMT 1920 Jan 6 - 6:00 - KRAT 1930 Jun 21 # Krasnoyarsk Time - 7:00 Russia KRA%sT 1991 Mar 31 2:00s - 6:00 Russia KRA%sT 1992 Jan 19 2:00s - 7:00 Russia KRA%sT 2011 Mar 27 2:00s - 8:00 - KRAT 2014 Oct 26 2:00s - 7:00 - KRAT + 6:00 - +06 1930 Jun 21 + 7:00 Russia +07/+08 1991 Mar 31 2:00s + 6:00 Russia +06/+07 1992 Jan 19 2:00s + 7:00 Russia +07/+08 2011 Mar 27 2:00s + 8:00 - +08 2014 Oct 26 2:00s + 7:00 - +07 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): @@ -2578,12 +2888,12 @@ Zone Asia/Krasnoyarsk 6:11:26 - LMT 1920 Jan 6 Zone Asia/Irkutsk 6:57:05 - LMT 1880 6:57:05 - IMT 1920 Jan 25 # Irkutsk Mean Time - 7:00 - IRKT 1930 Jun 21 # Irkutsk Time - 8:00 Russia IRK%sT 1991 Mar 31 2:00s - 7:00 Russia IRK%sT 1992 Jan 19 2:00s - 8:00 Russia IRK%sT 2011 Mar 27 2:00s - 9:00 - IRKT 2014 Oct 26 2:00s - 8:00 - IRKT + 7:00 - +07 1930 Jun 21 + 8:00 Russia +08/+09 1991 Mar 31 2:00s + 7:00 Russia +07/+08 1992 Jan 19 2:00s + 8:00 Russia +08/+09 2011 Mar 27 2:00s + 9:00 - +09 2014 Oct 26 2:00s + 8:00 - +08 # From Tim Parenti (2014-07-06): @@ -2593,13 +2903,20 @@ Zone Asia/Irkutsk 6:57:05 - LMT 1880 # Note: Effective 2008-03-01, (75) Chita Oblast and (80) Agin-Buryat # Autonomous Okrug merged to 
form (92, RU-ZAB) Zabaykalsky Krai. +# From Alexander Krivenyshev (2016-01-02): +# [The] time zone in the Trans-Baikal Territory (Zabaykalsky Krai) - +# Asia/Chita [is changing] from UTC+8 to UTC+9. Effective date will +# be March 27, 2016 at 2:00am.... +# http://publication.pravo.gov.ru/Document/View/0001201512300107 + Zone Asia/Chita 7:33:52 - LMT 1919 Dec 15 - 8:00 - YAKT 1930 Jun 21 # Yakutsk Time - 9:00 Russia YAK%sT 1991 Mar 31 2:00s - 8:00 Russia YAK%sT 1992 Jan 19 2:00s - 9:00 Russia YAK%sT 2011 Mar 27 2:00s - 10:00 - YAKT 2014 Oct 26 2:00s - 8:00 - IRKT + 8:00 - +08 1930 Jun 21 + 9:00 Russia +09/+10 1991 Mar 31 2:00s + 8:00 Russia +08/+09 1992 Jan 19 2:00s + 9:00 Russia +09/+10 2011 Mar 27 2:00s + 10:00 - +10 2014 Oct 26 2:00s + 8:00 - +08 2016 Mar 27 2:00 + 9:00 - +09 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2009-11-29): @@ -2639,12 +2956,12 @@ Zone Asia/Chita 7:33:52 - LMT 1919 Dec 15 # Byalokoz 1919 says Yakutsk was 8:38:58. Zone Asia/Yakutsk 8:38:58 - LMT 1919 Dec 15 - 8:00 - YAKT 1930 Jun 21 # Yakutsk Time - 9:00 Russia YAK%sT 1991 Mar 31 2:00s - 8:00 Russia YAK%sT 1992 Jan 19 2:00s - 9:00 Russia YAK%sT 2011 Mar 27 2:00s - 10:00 - YAKT 2014 Oct 26 2:00s - 9:00 - YAKT + 8:00 - +08 1930 Jun 21 + 9:00 Russia +09/+10 1991 Mar 31 2:00s + 8:00 Russia +08/+09 1992 Jan 19 2:00s + 9:00 Russia +09/+10 2011 Mar 27 2:00s + 10:00 - +10 2014 Oct 26 2:00s + 9:00 - +09 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2009-11-29): @@ -2662,12 +2979,12 @@ Zone Asia/Yakutsk 8:38:58 - LMT 1919 Dec 15 # Go with Byalokoz. 
Zone Asia/Vladivostok 8:47:31 - LMT 1922 Nov 15 - 9:00 - VLAT 1930 Jun 21 # Vladivostok Time - 10:00 Russia VLA%sT 1991 Mar 31 2:00s - 9:00 Russia VLA%sT 1992 Jan 19 2:00s - 10:00 Russia VLA%sT 2011 Mar 27 2:00s - 11:00 - VLAT 2014 Oct 26 2:00s - 10:00 - VLAT + 9:00 - +09 1930 Jun 21 + 10:00 Russia +10/+11 1991 Mar 31 2:00s + 9:00 Russia +09/+10 1992 Jan 19 2:00s + 10:00 Russia +10/+11 2011 Mar 27 2:00s + 11:00 - +11 2014 Oct 26 2:00s + 10:00 - +10 # From Tim Parenti (2014-07-03): @@ -2685,14 +3002,14 @@ Zone Asia/Vladivostok 8:47:31 - LMT 1922 Nov 15 # This transition is no doubt wrong, but we have no better info. Zone Asia/Khandyga 9:02:13 - LMT 1919 Dec 15 - 8:00 - YAKT 1930 Jun 21 # Yakutsk Time - 9:00 Russia YAK%sT 1991 Mar 31 2:00s - 8:00 Russia YAK%sT 1992 Jan 19 2:00s - 9:00 Russia YAK%sT 2004 - 10:00 Russia VLA%sT 2011 Mar 27 2:00s - 11:00 - VLAT 2011 Sep 13 0:00s # Decree 725? - 10:00 - YAKT 2014 Oct 26 2:00s - 9:00 - YAKT + 8:00 - +08 1930 Jun 21 + 9:00 Russia +09/+10 1991 Mar 31 2:00s + 8:00 Russia +08/+09 1992 Jan 19 2:00s + 9:00 Russia +09/+10 2004 + 10:00 Russia +10/+11 2011 Mar 27 2:00s + 11:00 - +11 2011 Sep 13 0:00s # Decree 725? + 10:00 - +10 2014 Oct 26 2:00s + 9:00 - +09 # From Tim Parenti (2014-07-03): @@ -2701,16 +3018,21 @@ Zone Asia/Khandyga 9:02:13 - LMT 1919 Dec 15 # ...with the exception of: # 65-11 **** Severo-Kurilsky District (North Kuril Islands) +# From Matt Johnson (2016-02-22): +# Asia/Sakhalin is moving (in entirety) from UTC+10 to UTC+11 ... +# (2016-03-09): +# http://publication.pravo.gov.ru/Document/View/0001201603090044 + # The Zone name should be Asia/Yuzhno-Sakhalinsk, but that's too long. 
Zone Asia/Sakhalin 9:30:48 - LMT 1905 Aug 23 - 9:00 - JCST 1937 Oct 1 - 9:00 - JST 1945 Aug 25 - 11:00 Russia SAK%sT 1991 Mar 31 2:00s # Sakhalin T - 10:00 Russia SAK%sT 1992 Jan 19 2:00s - 11:00 Russia SAK%sT 1997 Mar lastSun 2:00s - 10:00 Russia SAK%sT 2011 Mar 27 2:00s - 11:00 - SAKT 2014 Oct 26 2:00s - 10:00 - SAKT + 9:00 - +09 1945 Aug 25 + 11:00 Russia +11/+12 1991 Mar 31 2:00s # Sakhalin T + 10:00 Russia +10/+11 1992 Jan 19 2:00s + 11:00 Russia +11/+12 1997 Mar lastSun 2:00s + 10:00 Russia +10/+11 2011 Mar 27 2:00s + 11:00 - +11 2014 Oct 26 2:00s + 10:00 - +10 2016 Mar 27 2:00s + 11:00 - +11 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2009-11-29): @@ -2724,13 +3046,22 @@ Zone Asia/Sakhalin 9:30:48 - LMT 1905 Aug 23 # until now by Asia/Magadan, will instead move to UTC+11. These regions will # need their own zone. +# From Alexander Krivenyshev (2016-03-27): +# ... draft bill 948300-6 to change its time zone from UTC+10 to UTC+11 ... +# will take ... effect ... on April 24, 2016 at 2 o'clock +# +# From Matt Johnson (2016-04-05): +# ... signed by the President today ... +# http://publication.pravo.gov.ru/Document/View/0001201604050038 + Zone Asia/Magadan 10:03:12 - LMT 1924 May 2 - 10:00 - MAGT 1930 Jun 21 # Magadan Time - 11:00 Russia MAG%sT 1991 Mar 31 2:00s - 10:00 Russia MAG%sT 1992 Jan 19 2:00s - 11:00 Russia MAG%sT 2011 Mar 27 2:00s - 12:00 - MAGT 2014 Oct 26 2:00s - 10:00 - MAGT + 10:00 - +10 1930 Jun 21 # Magadan Time + 11:00 Russia +11/+12 1991 Mar 31 2:00s + 10:00 Russia +10/+11 1992 Jan 19 2:00s + 11:00 Russia +11/+12 2011 Mar 27 2:00s + 12:00 - +12 2014 Oct 26 2:00s + 10:00 - +10 2016 Apr 24 2:00s + 11:00 - +11 # From Tim Parenti (2014-07-06): @@ -2773,17 +3104,14 @@ Zone Asia/Magadan 10:03:12 - LMT 1924 May 2 # in Russian.) In addition, Srednekolymsk appears to be a much older # settlement and the population of Zyryanka seems to be declining. # Go with Srednekolymsk. 
-# -# Since Magadan Oblast moves to UTC+10 on 2014-10-26, we cannot keep using MAGT -# as the abbreviation. Use SRET instead. Zone Asia/Srednekolymsk 10:14:52 - LMT 1924 May 2 - 10:00 - MAGT 1930 Jun 21 # Magadan Time - 11:00 Russia MAG%sT 1991 Mar 31 2:00s - 10:00 Russia MAG%sT 1992 Jan 19 2:00s - 11:00 Russia MAG%sT 2011 Mar 27 2:00s - 12:00 - MAGT 2014 Oct 26 2:00s - 11:00 - SRET # Srednekolymsk Time + 10:00 - +10 1930 Jun 21 + 11:00 Russia +11/+12 1991 Mar 31 2:00s + 10:00 Russia +10/+11 1992 Jan 19 2:00s + 11:00 Russia +11/+12 2011 Mar 27 2:00s + 12:00 - +12 2014 Oct 26 2:00s + 11:00 - +11 # From Tim Parenti (2014-07-03): @@ -2801,14 +3129,14 @@ Zone Asia/Srednekolymsk 10:14:52 - LMT 1924 May 2 # UTC+12 since at least then, too. Zone Asia/Ust-Nera 9:32:54 - LMT 1919 Dec 15 - 8:00 - YAKT 1930 Jun 21 # Yakutsk Time - 9:00 Russia YAKT 1981 Apr 1 - 11:00 Russia MAG%sT 1991 Mar 31 2:00s - 10:00 Russia MAG%sT 1992 Jan 19 2:00s - 11:00 Russia MAG%sT 2011 Mar 27 2:00s - 12:00 - MAGT 2011 Sep 13 0:00s # Decree 725? - 11:00 - VLAT 2014 Oct 26 2:00s - 10:00 - VLAT + 8:00 - +08 1930 Jun 21 + 9:00 Russia +09/+10 1981 Apr 1 + 11:00 Russia +11/+12 1991 Mar 31 2:00s + 10:00 Russia +10/+11 1992 Jan 19 2:00s + 11:00 Russia +11/+12 2011 Mar 27 2:00s + 12:00 - +12 2011 Sep 13 0:00s # Decree 725? + 11:00 - +11 2014 Oct 26 2:00s + 10:00 - +10 # From Tim Parenti (2014-07-03), per Oscar van Vlijmen (2001-08-25): @@ -2821,12 +3149,12 @@ Zone Asia/Ust-Nera 9:32:54 - LMT 1919 Dec 15 # The Zone name should be Asia/Petropavlovsk-Kamchatski or perhaps # Asia/Petropavlovsk-Kamchatsky, but these are too long. 
Zone Asia/Kamchatka 10:34:36 - LMT 1922 Nov 10 - 11:00 - PETT 1930 Jun 21 # P-K Time - 12:00 Russia PET%sT 1991 Mar 31 2:00s - 11:00 Russia PET%sT 1992 Jan 19 2:00s - 12:00 Russia PET%sT 2010 Mar 28 2:00s - 11:00 Russia PET%sT 2011 Mar 27 2:00s - 12:00 - PETT + 11:00 - +11 1930 Jun 21 + 12:00 Russia +12/+13 1991 Mar 31 2:00s + 11:00 Russia +11/+12 1992 Jan 19 2:00s + 12:00 Russia +12/+13 2010 Mar 28 2:00s + 11:00 Russia +11/+12 2011 Mar 27 2:00s + 12:00 - +12 # From Tim Parenti (2014-07-03): @@ -2834,13 +3162,13 @@ Zone Asia/Kamchatka 10:34:36 - LMT 1922 Nov 10 # 87 RU-CHU Chukotka Autonomous Okrug Zone Asia/Anadyr 11:49:56 - LMT 1924 May 2 - 12:00 - ANAT 1930 Jun 21 # Anadyr Time - 13:00 Russia ANA%sT 1982 Apr 1 0:00s - 12:00 Russia ANA%sT 1991 Mar 31 2:00s - 11:00 Russia ANA%sT 1992 Jan 19 2:00s - 12:00 Russia ANA%sT 2010 Mar 28 2:00s - 11:00 Russia ANA%sT 2011 Mar 27 2:00s - 12:00 - ANAT + 12:00 - +12 1930 Jun 21 + 13:00 Russia +13/+14 1982 Apr 1 0:00s + 12:00 Russia +12/+13 1991 Mar 31 2:00s + 11:00 Russia +11/+12 1992 Jan 19 2:00s + 12:00 Russia +12/+13 2010 Mar 28 2:00s + 11:00 Russia +11/+12 2011 Mar 27 2:00s + 12:00 - +12 # San Marino @@ -3154,6 +3482,20 @@ Zone Europe/Zurich 0:34:08 - LMT 1853 Jul 16 # See above comment. # It's officially announced now by the Ministry of Energy. # Turkey delays winter time to 8th of November 04:00 # http://www.aa.com.tr/tr/turkiye/yaz-saati-uygulamasi-8-kasimda-sona-erecek/362217 +# +# From BBC News (2015-10-25): +# Confused Turks are asking "what's the time?" after automatic clocks defied a +# government decision ... "For the next two weeks #Turkey is on EEST... Erdogan +# Engineered Standard Time," said Twitter user @aysekarahasan. +# http://www.bbc.com/news/world-europe-34631326 + +# From Burak AYDIN (2016-09-08): +# Turkey will stay in Daylight Saving Time even in winter.... 
+# http://www.resmigazete.gov.tr/eskiler/2016/09/20160908-2.pdf +# +# From Paul Eggert (2016-09-07): +# The change is permanent, so this is the new standard time in Turkey. +# It takes effect today, which is not much notice. # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Turkey 1916 only - May 1 0:00 1:00 S @@ -3218,7 +3560,7 @@ Rule Turkey 1996 2006 - Oct lastSun 1:00s 0 - Zone Europe/Istanbul 1:55:52 - LMT 1880 1:56:56 - IMT 1910 Oct # Istanbul Mean Time? 2:00 Turkey EE%sT 1978 Oct 15 - 3:00 Turkey TR%sT 1985 Apr 20 # Turkey Time + 3:00 Turkey +03/+04 1985 Apr 20 2:00 Turkey EE%sT 2007 2:00 EU EE%sT 2011 Mar 27 1:00u 2:00 - EET 2011 Mar 28 1:00u @@ -3226,7 +3568,8 @@ Zone Europe/Istanbul 1:55:52 - LMT 1880 2:00 - EET 2014 Mar 31 1:00u 2:00 EU EE%sT 2015 Oct 25 1:00u 2:00 1:00 EEST 2015 Nov 8 1:00u - 2:00 EU EE%sT + 2:00 EU EE%sT 2016 Sep 7 + 3:00 - +03 Link Europe/Istanbul Asia/Istanbul # Istanbul is in both continents. # Ukraine diff --git a/contrib/tzdata/factory b/contrib/tzdata/factory index 839118d..336b3ef 100644 --- a/contrib/tzdata/factory +++ b/contrib/tzdata/factory @@ -1,9 +1,10 @@ # This file is in the public domain, so clarified as of # 2009-05-17 by Arthur David Olson. -# For companies who don't want to put time zone specification in -# their installation procedures. When users run date, they'll get the message. -# Also useful for the "comp.sources" version. +# For distributors who don't want to put time zone specification in +# their installation procedures. Users that run 'date' will get the +# time zone abbreviation "-00", indicating that the actual time zone +# is unknown. 
# Zone NAME GMTOFF RULES FORMAT Zone Factory 0 - "Local time zone must be set--use tzsetup" diff --git a/contrib/tzdata/leap-seconds.list b/contrib/tzdata/leap-seconds.list index 0a0bacb..22fa785 100644 --- a/contrib/tzdata/leap-seconds.list +++ b/contrib/tzdata/leap-seconds.list @@ -143,7 +143,7 @@ # Boulder, Colorado # Judah.Levine@nist.gov # -# Last Update of leap second values: 5 January 2015 +# Last Update of leap second values: 8 July 2016 # # The following line shows this last update date in NTP timestamp # format. This is the date on which the most recent change to @@ -151,7 +151,7 @@ # be identified by the unique pair of characters in the first two # columns as shown below. # -#$ 3629404800 +#$ 3676924800 # # The NTP timestamps are in units of seconds since the NTP epoch, # which is 1 January 1900, 00:00:00. The Modified Julian Day number @@ -199,10 +199,10 @@ # current -- the update time stamp, the data and the name of the file # will not change. # -# Updated through IERS Bulletin C50 -# File expires on: 28 June 2016 +# Updated through IERS Bulletin C52 +# File expires on: 28 June 2017 # -#@ 3676060800 +#@ 3707596800 # 2272060800 10 # 1 Jan 1972 2287785600 11 # 1 Jul 1972 @@ -231,6 +231,7 @@ 3439756800 34 # 1 Jan 2009 3550089600 35 # 1 Jul 2012 3644697600 36 # 1 Jul 2015 +3692217600 37 # 1 Jan 2017 # # the following special comment contains the # hash value of the data in this file computed @@ -246,4 +247,4 @@ # the hash line is also ignored in the # computation. # -#h 3d037453 3acade76 570bd8f8 be2b8bc9 55ec6fe8 +#h dacf2c42 2c4765d6 3c797af8 2cf630eb 699c8c67 diff --git a/contrib/tzdata/leapseconds b/contrib/tzdata/leapseconds index 70ec6d1..b4411f9 100644 --- a/contrib/tzdata/leapseconds +++ b/contrib/tzdata/leapseconds @@ -6,6 +6,7 @@ # leap-seconds.list file available from most NIST time servers. # If the URL <ftp://time.nist.gov/pub/leap-seconds.list> does not work, # you should be able to pick up leap-seconds.list from a secondary NIST server. 
+# See <http://tf.nist.gov/tf-cgi/servers.cgi> for a list of secondary servers. # For more about leap-seconds.list, please see # The NTP Timescale and Leap Seconds # http://www.eecis.udel.edu/~mills/leap.html @@ -55,6 +56,7 @@ Leap 2005 Dec 31 23:59:60 + S Leap 2008 Dec 31 23:59:60 + S Leap 2012 Jun 30 23:59:60 + S Leap 2015 Jun 30 23:59:60 + S +Leap 2016 Dec 31 23:59:60 + S -# Updated through IERS Bulletin C50 -# File expires on: 28 June 2016 +# Updated through IERS Bulletin C52 +# File expires on: 28 June 2017 diff --git a/contrib/tzdata/northamerica b/contrib/tzdata/northamerica index 7658a45..0bafb00 100644 --- a/contrib/tzdata/northamerica +++ b/contrib/tzdata/northamerica @@ -325,6 +325,16 @@ Zone America/New_York -4:56:02 - LMT 1883 Nov 18 12:03:58 # Statue 175 closer in synch with the US Congress' intent.... # http://www.legis.state.wi.us/2007/data/acts/07Act3.pdf +# From an email administrator of the City of Fort Pierre, SD (2015-12-21): +# Fort Pierre is technically located in the Mountain time zone as is +# the rest of Stanley County. Most of Stanley County and Fort Pierre +# uses the Central time zone due to doing most of their business in +# Pierre so it simplifies schedules. I have lived in Stanley County +# all my life and it has been that way since I can remember. (43 years!) +# +# From Paul Eggert (2015-12-25): +# Assume this practice predates 1970, so Fort Pierre can use America/Chicago. 
+ # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER Rule Chicago 1920 only - Jun 13 2:00 1:00 D Rule Chicago 1920 1921 - Oct lastSun 2:00 0 S @@ -403,11 +413,42 @@ Zone America/Denver -6:59:56 - LMT 1883 Nov 18 12:00:04 # north of the Salmon River, and the towns of Burgdorf and Warren), # Nevada (except West Wendover), Oregon (except the northern 3/4 of # Malheur county), and Washington + +# From Paul Eggert (2016-08-20): +# In early February 1948, in response to California's electricity shortage, +# PG&E changed power frequency from 60 to 59.5 Hz during daylight hours, +# causing electric clocks to lose six minutes per day. (This did not change +# legal time, and is not part of the data here.) See: +# Ross SA. An energy crisis from the past: Northern California in 1948. +# Working Paper No. 8, Institute of Governmental Studies, UC Berkeley, +# 1973-11. http://escholarship.org/uc/item/8x22k30c +# +# In another measure to save electricity, DST was instituted from 1948-03-14 +# at 02:01 to 1949-01-16 at 02:00, with the governor having the option to move +# the fallback transition earlier. See pages 3-4 of: +# http://clerk.assembly.ca.gov/sites/clerk.assembly.ca.gov/files/archive/Statutes/1948/48Vol1_Chapters.pdf +# +# In response: +# +# Governor Warren received a torrent of objecting mail, and it is not too much +# to speculate that the objections to Daylight Saving Time were one important +# factor in the defeat of the Dewey-Warren Presidential ticket in California. +# -- Ross, p 25 +# +# On December 8 the governor exercised the option, setting the date to January 1 +# (LA Times 1948-12-09). The transition time was 02:00 (LA Times 1949-01-01). +# +# Despite the controversy, in 1949 California voters approved Proposition 12, +# which established DST from April's last Sunday at 01:00 until September's +# last Sunday at 02:00. This was amended by 1962's Proposition 6, which changed +# the fall-back date to October's last Sunday. 
See: +# http://repository.uchastings.edu/cgi/viewcontent.cgi?article=1501&context=ca_ballot_props +# http://repository.uchastings.edu/cgi/viewcontent.cgi?article=1636&context=ca_ballot_props # # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER -Rule CA 1948 only - Mar 14 2:00 1:00 D +Rule CA 1948 only - Mar 14 2:01 1:00 D Rule CA 1949 only - Jan 1 2:00 0 S -Rule CA 1950 1966 - Apr lastSun 2:00 1:00 D +Rule CA 1950 1966 - Apr lastSun 1:00 1:00 D Rule CA 1950 1961 - Sep lastSun 2:00 0 S Rule CA 1962 1966 - Oct lastSun 2:00 0 S # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -481,6 +522,12 @@ Zone America/Los_Angeles -7:52:58 - LMT 1883 Nov 18 12:07:02 # For lack of better information, assume that Metlakatla's # abandonment of use of daylight saving resulted from the 1983 vote. +# From Steffen Thorsen (2015-11-09): +# It seems Metlakatla did go off PST on Sunday, November 1, changing +# their time to AKST and are going to follow Alaska's DST, switching +# between AKST and AKDT from now on.... +# http://www.krbd.org/2015/10/30/annette-island-times-they-are-a-changing/ + # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone America/Juneau 15:02:19 - LMT 1867 Oct 18 -8:57:41 - LMT 1900 Aug 20 12:00 @@ -506,7 +553,8 @@ Zone America/Metlakatla 15:13:42 - LMT 1867 Oct 18 -8:00 US P%sT 1946 -8:00 - PST 1969 -8:00 US P%sT 1983 Oct 30 2:00 - -8:00 - PST + -8:00 - PST 2015 Nov 1 2:00 + -9:00 US AK%sT Zone America/Yakutat 14:41:05 - LMT 1867 Oct 18 -9:18:55 - LMT 1900 Aug 20 12:00 -9:00 - YST 1942 @@ -2174,39 +2222,39 @@ Rule NT_YK 1980 2006 - Oct lastSun 2:00 0 S Rule NT_YK 1987 2006 - Apr Sun>=1 2:00 1:00 D # Zone NAME GMTOFF RULES FORMAT [UNTIL] # aka Panniqtuuq -Zone America/Pangnirtung 0 - zzz 1921 # trading post est. +Zone America/Pangnirtung 0 - -00 1921 # trading post est. -4:00 NT_YK A%sT 1995 Apr Sun>=1 2:00 -5:00 Canada E%sT 1999 Oct 31 2:00 -6:00 Canada C%sT 2000 Oct 29 2:00 -5:00 Canada E%sT # formerly Frobisher Bay -Zone America/Iqaluit 0 - zzz 1942 Aug # Frobisher Bay est. 
+Zone America/Iqaluit 0 - -00 1942 Aug # Frobisher Bay est. -5:00 NT_YK E%sT 1999 Oct 31 2:00 -6:00 Canada C%sT 2000 Oct 29 2:00 -5:00 Canada E%sT # aka Qausuittuq -Zone America/Resolute 0 - zzz 1947 Aug 31 # Resolute founded +Zone America/Resolute 0 - -00 1947 Aug 31 # Resolute founded -6:00 NT_YK C%sT 2000 Oct 29 2:00 -5:00 - EST 2001 Apr 1 3:00 -6:00 Canada C%sT 2006 Oct 29 2:00 -5:00 - EST 2007 Mar 11 3:00 -6:00 Canada C%sT # aka Kangiqiniq -Zone America/Rankin_Inlet 0 - zzz 1957 # Rankin Inlet founded +Zone America/Rankin_Inlet 0 - -00 1957 # Rankin Inlet founded -6:00 NT_YK C%sT 2000 Oct 29 2:00 -5:00 - EST 2001 Apr 1 3:00 -6:00 Canada C%sT # aka Iqaluktuuttiaq -Zone America/Cambridge_Bay 0 - zzz 1920 # trading post est.? +Zone America/Cambridge_Bay 0 - -00 1920 # trading post est.? -7:00 NT_YK M%sT 1999 Oct 31 2:00 -6:00 Canada C%sT 2000 Oct 29 2:00 -5:00 - EST 2000 Nov 5 0:00 -6:00 - CST 2001 Apr 1 3:00 -7:00 Canada M%sT -Zone America/Yellowknife 0 - zzz 1935 # Yellowknife founded? +Zone America/Yellowknife 0 - -00 1935 # Yellowknife founded? -7:00 NT_YK M%sT 1980 -7:00 Canada M%sT -Zone America/Inuvik 0 - zzz 1953 # Inuvik founded +Zone America/Inuvik 0 - -00 1953 # Inuvik founded -8:00 NT_YK P%sT 1979 Apr lastSun 2:00 -7:00 NT_YK M%sT 1980 -7:00 Canada M%sT @@ -2458,13 +2506,22 @@ Zone America/Merida -5:58:28 - LMT 1922 Jan 1 0:01:32 -6:00 - CST 1981 Dec 23 -5:00 - EST 1982 Dec 2 -6:00 Mexico C%sT -# Coahuila, Durango, Nuevo León, Tamaulipas (near US border) +# Coahuila, Nuevo León, Tamaulipas (near US border) +# This includes the following municipalities: +# in Coahuila: Ocampo, Acuña, Zaragoza, Jiménez, Piedras Negras, Nava, +# Guerrero, Hidalgo. +# in Nuevo León: Anáhuac, Los Aldama. +# in Tamaulipas: Nuevo Laredo, Guerrero, Mier, Miguel Alemán, Camargo, +# Gustavo Díaz Ordaz, Reynosa, Río Bravo, Valle Hermoso, Matamoros. 
+# See: Inicia mañana Horario de Verano en zona fronteriza, El Universal, +# 2016-03-12 +# http://www.eluniversal.com.mx/articulo/estados/2016/03/12/inicia-manana-horario-de-verano-en-zona-fronteriza Zone America/Matamoros -6:40:00 - LMT 1921 Dec 31 23:20:00 -6:00 - CST 1988 -6:00 US C%sT 1989 -6:00 Mexico C%sT 2010 -6:00 US C%sT -# Coahuila, Durango, Nuevo León, Tamaulipas (away from US border) +# Durango; Coahuila, Nuevo León, Tamaulipas (away from US border) Zone America/Monterrey -6:41:16 - LMT 1921 Dec 31 23:18:44 -6:00 - CST 1988 -6:00 US C%sT 1989 @@ -2480,6 +2537,9 @@ Zone America/Mexico_City -6:36:36 - LMT 1922 Jan 1 0:23:24 -6:00 - CST 2002 Feb 20 -6:00 Mexico C%sT # Chihuahua (near US border) +# This includes the municipalities of Janos, Ascensión, Juárez, Guadalupe, +# Práxedis G Guerrero, Coyame del Sotol, Ojinaga, and Manuel Benavides. +# (See the 2016-03-12 El Universal source mentioned above.) Zone America/Ojinaga -6:57:40 - LMT 1922 Jan 1 0:02:20 -7:00 - MST 1927 Jun 10 23:00 -6:00 - CST 1930 Nov 15 @@ -2567,7 +2627,7 @@ Zone America/Bahia_Banderas -7:01:00 - LMT 1921 Dec 31 23:59:00 -7:00 Mexico M%sT 2010 Apr 4 2:00 -6:00 Mexico C%sT -# Baja California (near US border) +# Baja California Zone America/Tijuana -7:48:04 - LMT 1922 Jan 1 0:11:56 -7:00 - MST 1924 -8:00 - PST 1927 Jun 10 23:00 @@ -2587,25 +2647,6 @@ Zone America/Tijuana -7:48:04 - LMT 1922 Jan 1 0:11:56 -8:00 US P%sT 2002 Feb 20 -8:00 Mexico P%sT 2010 -8:00 US P%sT -# Baja California (away from US border) -Zone America/Santa_Isabel -7:39:28 - LMT 1922 Jan 1 0:20:32 - -7:00 - MST 1924 - -8:00 - PST 1927 Jun 10 23:00 - -7:00 - MST 1930 Nov 15 - -8:00 - PST 1931 Apr 1 - -8:00 1:00 PDT 1931 Sep 30 - -8:00 - PST 1942 Apr 24 - -8:00 1:00 PWT 1945 Aug 14 23:00u - -8:00 1:00 PPT 1945 Nov 12 # Peace - -8:00 - PST 1948 Apr 5 - -8:00 1:00 PDT 1949 Jan 14 - -8:00 - PST 1954 - -8:00 CA P%sT 1961 - -8:00 - PST 1976 - -8:00 US P%sT 1996 - -8:00 Mexico P%sT 2001 - -8:00 US P%sT 2002 Feb 20 - -8:00 
Mexico P%sT # From Paul Eggert (2006-03-22): # Formerly there was an America/Ensenada zone, which differed from # America/Tijuana only in that it did not observe DST from 1976 @@ -2618,6 +2659,13 @@ Zone America/Santa_Isabel -7:39:28 - LMT 1922 Jan 1 0:20:32 # other than America/Tijuana for Baja, but it's not clear yet what its # name or contents should be. # +# From Paul Eggert (2015-10-08): +# Formerly there was an America/Santa_Isabel zone, but this appears to +# have come from a misreading of +# http://dof.gob.mx/nota_detalle.php?codigo=5127480&fecha=06/01/2010 +# It has been moved to the 'backward' file. +# +# # Revillagigedo Is # no information @@ -2692,17 +2740,7 @@ Zone Atlantic/Bermuda -4:19:18 - LMT 1930 Jan 1 2:00 # Hamilton -4:00 US A%sT # Cayman Is - -# From Paul Eggert (2015-05-15): -# The Cayman government has decided to introduce DST in 2016, the idea being -# to keep in sync with New York. The legislation hasn't passed but the change -# seems quite likely. See: Meade B. Cayman 27. -# http://www.cayman27.com.ky/2015/05/15/clock-ticks-toward-daylight-saving-time-in-cayman - -Zone America/Cayman -5:25:32 - LMT 1890 # Georgetown - -5:07:11 - KMT 1912 Feb # Kingston Mean Time - -5:00 - EST 2016 - -5:00 US E%sT +# See America/Panama. # Costa Rica @@ -3065,6 +3103,13 @@ Zone America/Guatemala -6:02:04 - LMT 1918 Oct 5 # http://radiovision2000haiti.net/public/haiti-avis-changement-dheure-dimanche/ # http://www.canalplushaiti.net/?p=6714 +# From Steffen Thorsen (2016-03-12): +# Jean Antoine, editor of www.haiti-reference.com informed us that Haiti +# are not going on DST this year. Several other resources confirm this: ... 
+# http://www.radiotelevisioncaraibes.com/presse/heure_d_t_pas_de_changement_d_heure_pr_vu_pour_cet_ann_e.html +# http://www.vantbefinfo.com/changement-dheure-pas-pour-haiti/ +# http://news.anmwe.com/haiti-lheure-nationale-ne-sera-ni-avancee-ni-reculee-cette-annee/ + # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Haiti 1983 only - May 8 0:00 1:00 D Rule Haiti 1984 1987 - Apr lastSun 0:00 1:00 D @@ -3075,8 +3120,8 @@ Rule Haiti 1988 1997 - Apr Sun>=1 1:00s 1:00 D Rule Haiti 1988 1997 - Oct lastSun 1:00s 0 S Rule Haiti 2005 2006 - Apr Sun>=1 0:00 1:00 D Rule Haiti 2005 2006 - Oct lastSun 0:00 0 S -Rule Haiti 2012 max - Mar Sun>=8 2:00 1:00 D -Rule Haiti 2012 max - Nov Sun>=1 2:00 0 S +Rule Haiti 2012 2015 - Mar Sun>=8 2:00 1:00 D +Rule Haiti 2012 2015 - Nov Sun>=1 2:00 0 S # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone America/Port-au-Prince -4:49:20 - LMT 1890 -4:49 - PPMT 1917 Jan 24 12:00 # P-a-P MT @@ -3225,6 +3270,7 @@ Zone America/Managua -5:45:08 - LMT 1890 Zone America/Panama -5:18:08 - LMT 1890 -5:19:36 - CMT 1908 Apr 22 # Colón Mean Time -5:00 - EST +Link America/Panama America/Cayman # Puerto Rico # There are too many San Juans elsewhere, so we'll use 'Puerto_Rico'. @@ -3266,7 +3312,7 @@ Zone America/Miquelon -3:44:40 - LMT 1911 May 15 # St Pierre # indicating that the normal ET rules are followed. # # From Paul Eggert (2014-08-19): -# The 2014-08-13 Cabinet meeting decided to stay on UTC-4 year-round. See: +# The 2014-08-13 Cabinet meeting decided to stay on UT -04 year-round. See: # http://tcweeklynews.com/daylight-savings-time-to-be-maintained-p5353-127.htm # Model this as a switch from EST/EDT to AST ... # From Chris Walton (2014-11-04): diff --git a/contrib/tzdata/southamerica b/contrib/tzdata/southamerica index 50d118e..5321451 100644 --- a/contrib/tzdata/southamerica +++ b/contrib/tzdata/southamerica @@ -410,9 +410,9 @@ Rule Arg 2008 only - Oct Sun>=15 0:00 1:00 S # stuck on Summer daylight savings time even though the summer is over. 
# From Paul Eggert (2013-09-05): -# Perhaps San Luis operates on the legal fiction that it is at UTC-4 +# Perhaps San Luis operates on the legal fiction that it is at -04 # with perpetual summer time, but ordinary usage typically seems to -# just say it's at UTC-3; see, for example, +# just say it's at -03; see, for example, # http://es.wikipedia.org/wiki/Hora_oficial_argentina # We've documented similar situations as being plain changes to # standard time, so let's do that here too. This does not change UTC @@ -1221,6 +1221,20 @@ Zone America/Rio_Branco -4:31:12 - LMT 1914 # From Paul Eggert (2015-03-03): # For now, assume that the extension will persist indefinitely. +# From Juan Correa (2016-03-18): +# The decree regarding DST has been published in today's Official Gazette: +# http://www.diariooficial.interior.gob.cl/versiones-anteriores/do/20160318/ +# http://www.leychile.cl/Navegar?idNorma=1088502 +# It does consider the second Saturday of May and August as the dates +# for the transition; and it lists DST dates until 2019, but I think +# this scheme will stick. +# +# From Paul Eggert (2016-03-18): +# For now, assume the pattern holds for the indefinite future. +# The decree says transitions occur at 24:00; in practice this appears +# to mean 24:00 mainland time, not 24:00 local time, so that Easter +# Island is always two hours behind the mainland. 
+ # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Chile 1927 1931 - Sep 1 0:00 1:00 S Rule Chile 1928 1932 - Apr 1 0:00 0 - @@ -1252,8 +1266,10 @@ Rule Chile 2009 only - Mar Sun>=9 3:00u 0 - Rule Chile 2010 only - Apr Sun>=1 3:00u 0 - Rule Chile 2011 only - May Sun>=2 3:00u 0 - Rule Chile 2011 only - Aug Sun>=16 4:00u 1:00 S -Rule Chile 2012 2015 - Apr Sun>=23 3:00u 0 - +Rule Chile 2012 2014 - Apr Sun>=23 3:00u 0 - Rule Chile 2012 2014 - Sep Sun>=2 4:00u 1:00 S +Rule Chile 2016 max - May Sun>=9 3:00u 0 - +Rule Chile 2016 max - Aug Sun>=9 4:00u 1:00 S # IATA SSIM anomalies: (1992-02) says 1992-03-14; # (1996-09) says 1998-03-08. Ignore these. # Zone NAME GMTOFF RULES FORMAT [UNTIL] @@ -1270,13 +1286,11 @@ Zone America/Santiago -4:42:46 - LMT 1890 -4:00 1:00 CLST 1946 Sep 1 # central Chile -4:00 - CLT 1947 Apr 1 -5:00 - CLT 1947 May 21 23:00 - -4:00 Chile CL%sT 2015 Apr 26 3:00u - -3:00 - CLT + -4:00 Chile CL%sT Zone Pacific/Easter -7:17:28 - LMT 1890 -7:17:28 - EMT 1932 Sep # Easter Mean Time -7:00 Chile EAS%sT 1982 Mar 14 3:00u # Easter Time - -6:00 Chile EAS%sT 2015 Apr 26 3:00u - -5:00 - EAST + -6:00 Chile EAS%sT # # Salas y Gómez Island is uninhabited. # Other Chilean locations, including Juan Fernández Is, Desventuradas Is, @@ -1295,11 +1309,10 @@ Zone Pacific/Easter -7:17:28 - LMT 1890 # Palmer used to be supplied from Argentina. # # Zone NAME GMTOFF RULES FORMAT [UNTIL] -Zone Antarctica/Palmer 0 - zzz 1965 +Zone Antarctica/Palmer 0 - -00 1965 -4:00 Arg AR%sT 1969 Oct 5 -3:00 Arg AR%sT 1982 May - -4:00 Chile CL%sT 2015 Apr 26 3:00u - -3:00 - CLT + -4:00 Chile CL%sT # Colombia @@ -1742,9 +1755,25 @@ Zone America/Montevideo -3:44:44 - LMT 1898 Jun 28 # resolution publication) # http://www.globovision.com/news.php?nid=72208 +# From Alexander Krivenyshev (2016-04-15): +# https://actualidad.rt.com/actualidad/204758-venezuela-modificar-huso-horario-sequia-elnino +# +# From Paul Eggert (2016-04-15): +# Clocks advance 30 minutes on 2016-05-01 at 02:30.... 
+# "'Venezuela's new time-zone: hours without light, hours without water, +# hours of presidential broadcasts, hours of lines,' quipped comedian +# Jean Mary Curró ...". See: Cawthorne A, Kai D. Venezuela scraps +# half-hour time difference set by Chavez. Reuters 2016-04-15 14:50 -0400 +# http://www.reuters.com/article/us-venezuela-timezone-idUSKCN0XC2BE +# +# From Matt Johnson (2016-04-20): +# ... published in the official Gazette [2016-04-18], here: +# http://historico.tsj.gob.ve/gaceta_ext/abril/1842016/E-1842016-4551.pdf + # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone America/Caracas -4:27:44 - LMT 1890 -4:27:40 - CMT 1912 Feb 12 # Caracas Mean Time? -4:30 - VET 1965 Jan 1 0:00 # Venezuela T. -4:00 - VET 2007 Dec 9 3:00 - -4:30 - VET + -4:30 - VET 2016 May 1 2:30 + -4:00 - VET diff --git a/contrib/tzdata/zone.tab b/contrib/tzdata/zone.tab index 935143f..cf774b5 100644 --- a/contrib/tzdata/zone.tab +++ b/contrib/tzdata/zone.tab @@ -30,22 +30,22 @@ AI +1812-06304 America/Anguilla AL +4120+01950 Europe/Tirane AM +4011+04430 Asia/Yerevan AO -0848+01314 Africa/Luanda -AQ -7750+16636 Antarctica/McMurdo McMurdo, South Pole, Scott (New Zealand time) -AQ -6734-06808 Antarctica/Rothera Rothera Station, Adelaide Island -AQ -6448-06406 Antarctica/Palmer Palmer Station, Anvers Island -AQ -6736+06253 Antarctica/Mawson Mawson Station, Holme Bay -AQ -6835+07758 Antarctica/Davis Davis Station, Vestfold Hills -AQ -6617+11031 Antarctica/Casey Casey Station, Bailey Peninsula -AQ -7824+10654 Antarctica/Vostok Vostok Station, Lake Vostok -AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville Station, Adelie Land -AQ -690022+0393524 Antarctica/Syowa Syowa Station, E Ongul I -AQ -720041+0023206 Antarctica/Troll Troll Station, Queen Maud Land +AQ -7750+16636 Antarctica/McMurdo New Zealand time - McMurdo, South Pole +AQ -6617+11031 Antarctica/Casey Casey +AQ -6835+07758 Antarctica/Davis Davis +AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville +AQ -6736+06253 Antarctica/Mawson 
Mawson +AQ -6448-06406 Antarctica/Palmer Palmer +AQ -6734-06808 Antarctica/Rothera Rothera +AQ -690022+0393524 Antarctica/Syowa Syowa +AQ -720041+0023206 Antarctica/Troll Troll +AQ -7824+10654 Antarctica/Vostok Vostok AR -3436-05827 America/Argentina/Buenos_Aires Buenos Aires (BA, CF) -AR -3124-06411 America/Argentina/Cordoba most locations (CB, CC, CN, ER, FM, MN, SE, SF) -AR -2447-06525 America/Argentina/Salta (SA, LP, NQ, RN) +AR -3124-06411 America/Argentina/Cordoba Argentina (most areas: CB, CC, CN, ER, FM, MN, SE, SF) +AR -2447-06525 America/Argentina/Salta Salta (SA, LP, NQ, RN) AR -2411-06518 America/Argentina/Jujuy Jujuy (JY) AR -2649-06513 America/Argentina/Tucuman Tucuman (TM) -AR -2828-06547 America/Argentina/Catamarca Catamarca (CT), Chubut (CH) +AR -2828-06547 America/Argentina/Catamarca Catamarca (CT); Chubut (CH) AR -2926-06651 America/Argentina/La_Rioja La Rioja (LR) AR -3132-06831 America/Argentina/San_Juan San Juan (SJ) AR -3253-06849 America/Argentina/Mendoza Mendoza (MZ) @@ -56,17 +56,17 @@ AS -1416-17042 Pacific/Pago_Pago AT +4813+01620 Europe/Vienna AU -3133+15905 Australia/Lord_Howe Lord Howe Island AU -5430+15857 Antarctica/Macquarie Macquarie Island -AU -4253+14719 Australia/Hobart Tasmania - most locations -AU -3956+14352 Australia/Currie Tasmania - King Island +AU -4253+14719 Australia/Hobart Tasmania (most areas) +AU -3956+14352 Australia/Currie Tasmania (King Island) AU -3749+14458 Australia/Melbourne Victoria -AU -3352+15113 Australia/Sydney New South Wales - most locations -AU -3157+14127 Australia/Broken_Hill New South Wales - Yancowinna -AU -2728+15302 Australia/Brisbane Queensland - most locations -AU -2016+14900 Australia/Lindeman Queensland - Holiday Islands +AU -3352+15113 Australia/Sydney New South Wales (most areas) +AU -3157+14127 Australia/Broken_Hill New South Wales (Yancowinna) +AU -2728+15302 Australia/Brisbane Queensland (most areas) +AU -2016+14900 Australia/Lindeman Queensland (Whitsunday Islands) AU -3455+13835 
Australia/Adelaide South Australia AU -1228+13050 Australia/Darwin Northern Territory -AU -3157+11551 Australia/Perth Western Australia - most locations -AU -3143+12852 Australia/Eucla Western Australia - Eucla area +AU -3157+11551 Australia/Perth Western Australia (most areas) +AU -3143+12852 Australia/Eucla Western Australia (Eucla) AW +1230-06958 America/Aruba AX +6006+01957 Europe/Mariehamn AZ +4023+04951 Asia/Baku @@ -85,63 +85,63 @@ BN +0456+11455 Asia/Brunei BO -1630-06809 America/La_Paz BQ +120903-0681636 America/Kralendijk BR -0351-03225 America/Noronha Atlantic islands -BR -0127-04829 America/Belem Amapa, E Para -BR -0343-03830 America/Fortaleza NE Brazil (MA, PI, CE, RN, PB) +BR -0127-04829 America/Belem Para (east); Amapa +BR -0343-03830 America/Fortaleza Brazil (northeast: MA, PI, CE, RN, PB) BR -0803-03454 America/Recife Pernambuco BR -0712-04812 America/Araguaina Tocantins BR -0940-03543 America/Maceio Alagoas, Sergipe BR -1259-03831 America/Bahia Bahia -BR -2332-04637 America/Sao_Paulo S & SE Brazil (GO, DF, MG, ES, RJ, SP, PR, SC, RS) +BR -2332-04637 America/Sao_Paulo Brazil (southeast: GO, DF, MG, ES, RJ, SP, PR, SC, RS) BR -2027-05437 America/Campo_Grande Mato Grosso do Sul BR -1535-05605 America/Cuiaba Mato Grosso -BR -0226-05452 America/Santarem W Para +BR -0226-05452 America/Santarem Para (west) BR -0846-06354 America/Porto_Velho Rondonia BR +0249-06040 America/Boa_Vista Roraima -BR -0308-06001 America/Manaus E Amazonas -BR -0640-06952 America/Eirunepe W Amazonas +BR -0308-06001 America/Manaus Amazonas (east) +BR -0640-06952 America/Eirunepe Amazonas (west) BR -0958-06748 America/Rio_Branco Acre BS +2505-07721 America/Nassau BT +2728+08939 Asia/Thimphu BW -2439+02555 Africa/Gaborone BY +5354+02734 Europe/Minsk BZ +1730-08812 America/Belize -CA +4734-05243 America/St_Johns Newfoundland Time, including SE Labrador -CA +4439-06336 America/Halifax Atlantic Time - Nova Scotia (peninsula), PEI -CA +4612-05957 America/Glace_Bay Atlantic Time - Nova 
Scotia (Cape Breton) -CA +4606-06447 America/Moncton Atlantic Time - New Brunswick -CA +5320-06025 America/Goose_Bay Atlantic Time - Labrador - most locations -CA +5125-05707 America/Blanc-Sablon Atlantic Standard Time - Quebec - Lower North Shore -CA +4339-07923 America/Toronto Eastern Time - Ontario & Quebec - most locations -CA +4901-08816 America/Nipigon Eastern Time - Ontario & Quebec - places that did not observe DST 1967-1973 -CA +4823-08915 America/Thunder_Bay Eastern Time - Thunder Bay, Ontario -CA +6344-06828 America/Iqaluit Eastern Time - east Nunavut - most locations -CA +6608-06544 America/Pangnirtung Eastern Time - Pangnirtung, Nunavut -CA +744144-0944945 America/Resolute Central Time - Resolute, Nunavut -CA +484531-0913718 America/Atikokan Eastern Standard Time - Atikokan, Ontario and Southampton I, Nunavut -CA +624900-0920459 America/Rankin_Inlet Central Time - central Nunavut -CA +4953-09709 America/Winnipeg Central Time - Manitoba & west Ontario -CA +4843-09434 America/Rainy_River Central Time - Rainy River & Fort Frances, Ontario -CA +5024-10439 America/Regina Central Standard Time - Saskatchewan - most locations -CA +5017-10750 America/Swift_Current Central Standard Time - Saskatchewan - midwest -CA +5333-11328 America/Edmonton Mountain Time - Alberta, east British Columbia & west Saskatchewan -CA +690650-1050310 America/Cambridge_Bay Mountain Time - west Nunavut -CA +6227-11421 America/Yellowknife Mountain Time - central Northwest Territories -CA +682059-1334300 America/Inuvik Mountain Time - west Northwest Territories -CA +4906-11631 America/Creston Mountain Standard Time - Creston, British Columbia -CA +5946-12014 America/Dawson_Creek Mountain Standard Time - Dawson Creek & Fort Saint John, British Columbia -CA +5848-12242 America/Fort_Nelson Mountain Standard Time - Fort Nelson, British Columbia -CA +4916-12307 America/Vancouver Pacific Time - west British Columbia -CA +6043-13503 America/Whitehorse Pacific Time - south Yukon -CA +6404-13925 
America/Dawson Pacific Time - north Yukon +CA +4734-05243 America/St_Johns Newfoundland; Labrador (southeast) +CA +4439-06336 America/Halifax Atlantic - NS (most areas); PE +CA +4612-05957 America/Glace_Bay Atlantic - NS (Cape Breton) +CA +4606-06447 America/Moncton Atlantic - New Brunswick +CA +5320-06025 America/Goose_Bay Atlantic - Labrador (most areas) +CA +5125-05707 America/Blanc-Sablon AST - QC (Lower North Shore) +CA +4339-07923 America/Toronto Eastern - ON, QC (most areas) +CA +4901-08816 America/Nipigon Eastern - ON, QC (no DST 1967-73) +CA +4823-08915 America/Thunder_Bay Eastern - ON (Thunder Bay) +CA +6344-06828 America/Iqaluit Eastern - NU (most east areas) +CA +6608-06544 America/Pangnirtung Eastern - NU (Pangnirtung) +CA +484531-0913718 America/Atikokan EST - ON (Atikokan); NU (Coral H) +CA +4953-09709 America/Winnipeg Central - ON (west); Manitoba +CA +4843-09434 America/Rainy_River Central - ON (Rainy R, Ft Frances) +CA +744144-0944945 America/Resolute Central - NU (Resolute) +CA +624900-0920459 America/Rankin_Inlet Central - NU (central) +CA +5024-10439 America/Regina CST - SK (most areas) +CA +5017-10750 America/Swift_Current CST - SK (midwest) +CA +5333-11328 America/Edmonton Mountain - AB; BC (E); SK (W) +CA +690650-1050310 America/Cambridge_Bay Mountain - NU (west) +CA +6227-11421 America/Yellowknife Mountain - NT (central) +CA +682059-1334300 America/Inuvik Mountain - NT (west) +CA +4906-11631 America/Creston MST - BC (Creston) +CA +5946-12014 America/Dawson_Creek MST - BC (Dawson Cr, Ft St John) +CA +5848-12242 America/Fort_Nelson MST - BC (Ft Nelson) +CA +4916-12307 America/Vancouver Pacific - BC (most areas) +CA +6043-13503 America/Whitehorse Pacific - Yukon (south) +CA +6404-13925 America/Dawson Pacific - Yukon (north) CC -1210+09655 Indian/Cocos -CD -0418+01518 Africa/Kinshasa west Dem. Rep. of Congo -CD -1140+02728 Africa/Lubumbashi east Dem. Rep. of Congo +CD -0418+01518 Africa/Kinshasa Dem. Rep. 
of Congo (west) +CD -1140+02728 Africa/Lubumbashi Dem. Rep. of Congo (east) CF +0422+01835 Africa/Bangui CG -0416+01517 Africa/Brazzaville CH +4723+00832 Europe/Zurich CI +0519-00402 Africa/Abidjan CK -2114-15946 Pacific/Rarotonga -CL -3327-07040 America/Santiago most locations +CL -3327-07040 America/Santiago Chile (most areas) CL -2709-10926 Pacific/Easter Easter Island CM +0403+00942 Africa/Douala CN +3114+12128 Asia/Shanghai Beijing Time @@ -154,28 +154,28 @@ CW +1211-06900 America/Curacao CX -1025+10543 Indian/Christmas CY +3510+03322 Asia/Nicosia CZ +5005+01426 Europe/Prague -DE +5230+01322 Europe/Berlin most locations +DE +5230+01322 Europe/Berlin Germany (most areas) DE +4742+00841 Europe/Busingen Busingen DJ +1136+04309 Africa/Djibouti DK +5540+01235 Europe/Copenhagen DM +1518-06124 America/Dominica DO +1828-06954 America/Santo_Domingo DZ +3647+00303 Africa/Algiers -EC -0210-07950 America/Guayaquil mainland +EC -0210-07950 America/Guayaquil Ecuador (mainland) EC -0054-08936 Pacific/Galapagos Galapagos Islands EE +5925+02445 Europe/Tallinn EG +3003+03115 Africa/Cairo EH +2709-01312 Africa/El_Aaiun ER +1520+03853 Africa/Asmara -ES +4024-00341 Europe/Madrid mainland -ES +3553-00519 Africa/Ceuta Ceuta & Melilla +ES +4024-00341 Europe/Madrid Spain (mainland) +ES +3553-00519 Africa/Ceuta Ceuta, Melilla ES +2806-01524 Atlantic/Canary Canary Islands ET +0902+03842 Africa/Addis_Ababa FI +6010+02458 Europe/Helsinki FJ -1808+17825 Pacific/Fiji FK -5142-05751 Atlantic/Stanley -FM +0725+15147 Pacific/Chuuk Chuuk (Truk) and Yap -FM +0658+15813 Pacific/Pohnpei Pohnpei (Ponape) +FM +0725+15147 Pacific/Chuuk Chuuk/Truk, Yap +FM +0658+15813 Pacific/Pohnpei Pohnpei/Ponape FM +0519+16259 Pacific/Kosrae Kosrae FO +6201-00646 Atlantic/Faroe FR +4852+00220 Europe/Paris @@ -187,10 +187,10 @@ GF +0456-05220 America/Cayenne GG +4927-00232 Europe/Guernsey GH +0533-00013 Africa/Accra GI +3608-00521 Europe/Gibraltar -GL +6411-05144 America/Godthab most locations -GL +7646-01840 
America/Danmarkshavn east coast, north of Scoresbysund -GL +7029-02158 America/Scoresbysund Scoresbysund / Ittoqqortoormiit -GL +7634-06847 America/Thule Thule / Pituffik +GL +6411-05144 America/Godthab Greenland (most areas) +GL +7646-01840 America/Danmarkshavn National Park (east coast) +GL +7029-02158 America/Scoresbysund Scoresbysund/Ittoqqortoormiit +GL +7634-06847 America/Thule Thule/Pituffik GM +1328-01639 Africa/Banjul GN +0931-01343 Africa/Conakry GP +1614-06132 America/Guadeloupe @@ -206,10 +206,10 @@ HN +1406-08713 America/Tegucigalpa HR +4548+01558 Europe/Zagreb HT +1832-07220 America/Port-au-Prince HU +4730+01905 Europe/Budapest -ID -0610+10648 Asia/Jakarta Java & Sumatra -ID -0002+10920 Asia/Pontianak west & central Borneo -ID -0507+11924 Asia/Makassar east & south Borneo, Sulawesi (Celebes), Bali, Nusa Tengarra, west Timor -ID -0232+14042 Asia/Jayapura west New Guinea (Irian Jaya) & Malukus (Moluccas) +ID -0610+10648 Asia/Jakarta Java, Sumatra +ID -0002+10920 Asia/Pontianak Borneo (west, central) +ID -0507+11924 Asia/Makassar Borneo (east, south); Sulawesi/Celebes, Bali, Nusa Tengarra; Timor (west) +ID -0232+14042 Asia/Jayapura New Guinea (West Papua / Irian Jaya); Malukus/Moluccas IE +5320-00615 Europe/Dublin IL +314650+0351326 Asia/Jerusalem IM +5409-00428 Europe/Isle_of_Man @@ -235,10 +235,10 @@ KP +3901+12545 Asia/Pyongyang KR +3733+12658 Asia/Seoul KW +2920+04759 Asia/Kuwait KY +1918-08123 America/Cayman -KZ +4315+07657 Asia/Almaty most locations -KZ +4448+06528 Asia/Qyzylorda Qyzylorda (Kyzylorda, Kzyl-Orda) -KZ +5017+05710 Asia/Aqtobe Aqtobe (Aktobe) -KZ +4431+05016 Asia/Aqtau Atyrau (Atirau, Gur'yev), Mangghystau (Mankistau) +KZ +4315+07657 Asia/Almaty Kazakhstan (most areas) +KZ +4448+06528 Asia/Qyzylorda Qyzylorda/Kyzylorda/Kzyl-Orda +KZ +5017+05710 Asia/Aqtobe Aqtobe/Aktobe +KZ +4431+05016 Asia/Aqtau Atyrau/Atirau/Gur'yev, Mangghystau/Mankistau KZ +5113+05121 Asia/Oral West Kazakhstan LA +1758+10236 Asia/Vientiane LB +3353+03530 
Asia/Beirut @@ -257,12 +257,12 @@ MD +4700+02850 Europe/Chisinau ME +4226+01916 Europe/Podgorica MF +1804-06305 America/Marigot MG -1855+04731 Indian/Antananarivo -MH +0709+17112 Pacific/Majuro most locations +MH +0709+17112 Pacific/Majuro Marshall Islands (most areas) MH +0905+16720 Pacific/Kwajalein Kwajalein MK +4159+02126 Europe/Skopje ML +1239-00800 Africa/Bamako -MM +1647+09610 Asia/Rangoon -MN +4755+10653 Asia/Ulaanbaatar most locations +MM +1647+09610 Asia/Yangon +MN +4755+10653 Asia/Ulaanbaatar Mongolia (most areas) MN +4801+09139 Asia/Hovd Bayan-Olgiy, Govi-Altai, Hovd, Uvs, Zavkhan MN +4804+11430 Asia/Choibalsan Dornod, Sukhbaatar MO +2214+11335 Asia/Macau @@ -274,20 +274,19 @@ MT +3554+01431 Europe/Malta MU -2010+05730 Indian/Mauritius MV +0410+07330 Indian/Maldives MW -1547+03500 Africa/Blantyre -MX +1924-09909 America/Mexico_City Central Time - most locations +MX +1924-09909 America/Mexico_City Central Time MX +2105-08646 America/Cancun Eastern Standard Time - Quintana Roo MX +2058-08937 America/Merida Central Time - Campeche, Yucatan -MX +2540-10019 America/Monterrey Mexican Central Time - Coahuila, Durango, Nuevo Leon, Tamaulipas away from US border -MX +2550-09730 America/Matamoros US Central Time - Coahuila, Durango, Nuevo Leon, Tamaulipas near US border -MX +2313-10625 America/Mazatlan Mountain Time - S Baja, Nayarit, Sinaloa -MX +2838-10605 America/Chihuahua Mexican Mountain Time - Chihuahua away from US border -MX +2934-10425 America/Ojinaga US Mountain Time - Chihuahua near US border +MX +2540-10019 America/Monterrey Central Time - Durango; Coahuila, Nuevo Leon, Tamaulipas (most areas) +MX +2550-09730 America/Matamoros Central Time US - Coahuila, Nuevo Leon, Tamaulipas (US border) +MX +2313-10625 America/Mazatlan Mountain Time - Baja California Sur, Nayarit, Sinaloa +MX +2838-10605 America/Chihuahua Mountain Time - Chihuahua (most areas) +MX +2934-10425 America/Ojinaga Mountain Time US - Chihuahua (US border) MX +2904-11058 America/Hermosillo 
Mountain Standard Time - Sonora -MX +3232-11701 America/Tijuana US Pacific Time - Baja California near US border -MX +3018-11452 America/Santa_Isabel Mexican Pacific Time - Baja California away from US border -MX +2048-10515 America/Bahia_Banderas Mexican Central Time - Bahia de Banderas -MY +0310+10142 Asia/Kuala_Lumpur peninsular Malaysia -MY +0133+11020 Asia/Kuching Sabah & Sarawak +MX +3232-11701 America/Tijuana Pacific Time US - Baja California +MX +2048-10515 America/Bahia_Banderas Central Time - Bahia de Banderas +MY +0310+10142 Asia/Kuala_Lumpur Malaysia (peninsula) +MY +0133+11020 Asia/Kuching Sabah, Sarawak MZ -2558+03235 Africa/Maputo NA -2234+01706 Africa/Windhoek NC -2216+16627 Pacific/Noumea @@ -300,7 +299,7 @@ NO +5955+01045 Europe/Oslo NP +2743+08519 Asia/Kathmandu NR -0031+16655 Pacific/Nauru NU -1901-16955 Pacific/Niue -NZ -3652+17446 Pacific/Auckland most locations +NZ -3652+17446 Pacific/Auckland New Zealand (most areas) NZ -4357-17633 Pacific/Chatham Chatham Islands OM +2336+05835 Asia/Muscat PA +0858-07932 America/Panama @@ -308,7 +307,7 @@ PE -1203-07703 America/Lima PF -1732-14934 Pacific/Tahiti Society Islands PF -0900-13930 Pacific/Marquesas Marquesas Islands PF -2308-13457 Pacific/Gambier Gambier Islands -PG -0930+14710 Pacific/Port_Moresby most locations +PG -0930+14710 Pacific/Port_Moresby Papua New Guinea (most areas) PG -0613+15534 Pacific/Bougainville Bougainville PH +1435+12100 Asia/Manila PK +2452+06703 Asia/Karachi @@ -318,7 +317,7 @@ PN -2504-13005 Pacific/Pitcairn PR +182806-0660622 America/Puerto_Rico PS +3130+03428 Asia/Gaza Gaza Strip PS +313200+0350542 Asia/Hebron West Bank -PT +3843-00908 Europe/Lisbon mainland +PT +3843-00908 Europe/Lisbon Portugal (mainland) PT +3238-01654 Atlantic/Madeira Madeira Islands PT +3744-02540 Atlantic/Azores Azores PW +0720+13429 Pacific/Palau @@ -327,27 +326,32 @@ QA +2517+05132 Asia/Qatar RE -2052+05528 Indian/Reunion RO +4426+02606 Europe/Bucharest RS +4450+02030 Europe/Belgrade -RU 
+5443+02030 Europe/Kaliningrad Moscow-01 - Kaliningrad -RU +554521+0373704 Europe/Moscow Moscow+00 - west Russia -RU +4457+03406 Europe/Simferopol Moscow+00 - Crimea -RU +4844+04425 Europe/Volgograd Moscow+00 - Caspian Sea -RU +5312+05009 Europe/Samara Moscow+00 (Moscow+01 after 2014-10-26) - Samara, Udmurtia -RU +5651+06036 Asia/Yekaterinburg Moscow+02 - Urals -RU +5500+07324 Asia/Omsk Moscow+03 - west Siberia -RU +5502+08255 Asia/Novosibirsk Moscow+03 - Novosibirsk -RU +5345+08707 Asia/Novokuznetsk Moscow+03 (Moscow+04 after 2014-10-26) - Kemerovo -RU +5601+09250 Asia/Krasnoyarsk Moscow+04 - Yenisei River -RU +5216+10420 Asia/Irkutsk Moscow+05 - Lake Baikal -RU +5203+11328 Asia/Chita Moscow+06 (Moscow+05 after 2014-10-26) - Zabaykalsky -RU +6200+12940 Asia/Yakutsk Moscow+06 - Lena River -RU +623923+1353314 Asia/Khandyga Moscow+06 - Tomponsky, Ust-Maysky -RU +4310+13156 Asia/Vladivostok Moscow+07 - Amur River -RU +4658+14242 Asia/Sakhalin Moscow+07 - Sakhalin Island -RU +643337+1431336 Asia/Ust-Nera Moscow+07 - Oymyakonsky -RU +5934+15048 Asia/Magadan Moscow+08 (Moscow+07 after 2014-10-26) - Magadan -RU +6728+15343 Asia/Srednekolymsk Moscow+08 - E Sakha, N Kuril Is -RU +5301+15839 Asia/Kamchatka Moscow+08 (Moscow+09 after 2014-10-26) - Kamchatka -RU +6445+17729 Asia/Anadyr Moscow+08 (Moscow+09 after 2014-10-26) - Bering Sea +RU +5443+02030 Europe/Kaliningrad MSK-01 - Kaliningrad +RU +554521+0373704 Europe/Moscow MSK+00 - Moscow area +RU +4457+03406 Europe/Simferopol MSK+00 - Crimea +RU +4844+04425 Europe/Volgograd MSK+00 - Volgograd, Saratov +RU +5836+04939 Europe/Kirov MSK+00 - Kirov +RU +4621+04803 Europe/Astrakhan MSK+01 - Astrakhan +RU +5312+05009 Europe/Samara MSK+01 - Samara, Udmurtia +RU +5420+04824 Europe/Ulyanovsk MSK+01 - Ulyanovsk +RU +5651+06036 Asia/Yekaterinburg MSK+02 - Urals +RU +5500+07324 Asia/Omsk MSK+03 - Omsk +RU +5502+08255 Asia/Novosibirsk MSK+03 - Novosibirsk +RU +5322+08345 Asia/Barnaul MSK+04 - Altai +RU +5630+08458 Asia/Tomsk MSK+04 - 
Tomsk +RU +5345+08707 Asia/Novokuznetsk MSK+04 - Kemerovo +RU +5601+09250 Asia/Krasnoyarsk MSK+04 - Krasnoyarsk area +RU +5216+10420 Asia/Irkutsk MSK+05 - Irkutsk, Buryatia +RU +5203+11328 Asia/Chita MSK+06 - Zabaykalsky +RU +6200+12940 Asia/Yakutsk MSK+06 - Lena River +RU +623923+1353314 Asia/Khandyga MSK+06 - Tomponsky, Ust-Maysky +RU +4310+13156 Asia/Vladivostok MSK+07 - Amur River +RU +643337+1431336 Asia/Ust-Nera MSK+07 - Oymyakonsky +RU +5934+15048 Asia/Magadan MSK+08 - Magadan +RU +4658+14242 Asia/Sakhalin MSK+08 - Sakhalin Island +RU +6728+15343 Asia/Srednekolymsk MSK+08 - Sakha (E); North Kuril Is +RU +5301+15839 Asia/Kamchatka MSK+09 - Kamchatka +RU +6445+17729 Asia/Anadyr MSK+09 - Bering Sea RW -0157+03004 Africa/Kigali SA +2438+04643 Asia/Riyadh SB -0932+16012 Pacific/Guadalcanal @@ -386,45 +390,45 @@ TT +1039-06131 America/Port_of_Spain TV -0831+17913 Pacific/Funafuti TW +2503+12130 Asia/Taipei TZ -0648+03917 Africa/Dar_es_Salaam -UA +5026+03031 Europe/Kiev most locations +UA +5026+03031 Europe/Kiev Ukraine (most areas) UA +4837+02218 Europe/Uzhgorod Ruthenia -UA +4750+03510 Europe/Zaporozhye Zaporozh'ye, E Lugansk / Zaporizhia, E Luhansk +UA +4750+03510 Europe/Zaporozhye Zaporozh'ye/Zaporizhia; Lugansk/Luhansk (east) UG +0019+03225 Africa/Kampala UM +1645-16931 Pacific/Johnston Johnston Atoll UM +2813-17722 Pacific/Midway Midway Islands UM +1917+16637 Pacific/Wake Wake Island -US +404251-0740023 America/New_York Eastern Time -US +421953-0830245 America/Detroit Eastern Time - Michigan - most locations -US +381515-0854534 America/Kentucky/Louisville Eastern Time - Kentucky - Louisville area -US +364947-0845057 America/Kentucky/Monticello Eastern Time - Kentucky - Wayne County -US +394606-0860929 America/Indiana/Indianapolis Eastern Time - Indiana - most locations -US +384038-0873143 America/Indiana/Vincennes Eastern Time - Indiana - Daviess, Dubois, Knox & Martin Counties -US +410305-0863611 America/Indiana/Winamac Eastern Time - Indiana - Pulaski 
County -US +382232-0862041 America/Indiana/Marengo Eastern Time - Indiana - Crawford County -US +382931-0871643 America/Indiana/Petersburg Eastern Time - Indiana - Pike County -US +384452-0850402 America/Indiana/Vevay Eastern Time - Indiana - Switzerland County -US +415100-0873900 America/Chicago Central Time -US +375711-0864541 America/Indiana/Tell_City Central Time - Indiana - Perry County -US +411745-0863730 America/Indiana/Knox Central Time - Indiana - Starke County -US +450628-0873651 America/Menominee Central Time - Michigan - Dickinson, Gogebic, Iron & Menominee Counties -US +470659-1011757 America/North_Dakota/Center Central Time - North Dakota - Oliver County -US +465042-1012439 America/North_Dakota/New_Salem Central Time - North Dakota - Morton County (except Mandan area) -US +471551-1014640 America/North_Dakota/Beulah Central Time - North Dakota - Mercer County -US +394421-1045903 America/Denver Mountain Time -US +433649-1161209 America/Boise Mountain Time - south Idaho & east Oregon -US +332654-1120424 America/Phoenix Mountain Standard Time - Arizona (except Navajo) -US +340308-1181434 America/Los_Angeles Pacific Time -US +550737-1313435 America/Metlakatla Pacific Standard Time - Annette Island, Alaska -US +611305-1495401 America/Anchorage Alaska Time -US +581807-1342511 America/Juneau Alaska Time - Alaska panhandle -US +571035-1351807 America/Sitka Alaska Time - southeast Alaska panhandle -US +593249-1394338 America/Yakutat Alaska Time - Alaska panhandle neck -US +643004-1652423 America/Nome Alaska Time - west Alaska +US +404251-0740023 America/New_York Eastern (most areas) +US +421953-0830245 America/Detroit Eastern - MI (most areas) +US +381515-0854534 America/Kentucky/Louisville Eastern - KY (Louisville area) +US +364947-0845057 America/Kentucky/Monticello Eastern - KY (Wayne) +US +394606-0860929 America/Indiana/Indianapolis Eastern - IN (most areas) +US +384038-0873143 America/Indiana/Vincennes Eastern - IN (Da, Du, K, Mn) +US +410305-0863611 
America/Indiana/Winamac Eastern - IN (Pulaski) +US +382232-0862041 America/Indiana/Marengo Eastern - IN (Crawford) +US +382931-0871643 America/Indiana/Petersburg Eastern - IN (Pike) +US +384452-0850402 America/Indiana/Vevay Eastern - IN (Switzerland) +US +415100-0873900 America/Chicago Central (most areas) +US +375711-0864541 America/Indiana/Tell_City Central - IN (Perry) +US +411745-0863730 America/Indiana/Knox Central - IN (Starke) +US +450628-0873651 America/Menominee Central - MI (Wisconsin border) +US +470659-1011757 America/North_Dakota/Center Central - ND (Oliver) +US +465042-1012439 America/North_Dakota/New_Salem Central - ND (Morton rural) +US +471551-1014640 America/North_Dakota/Beulah Central - ND (Mercer) +US +394421-1045903 America/Denver Mountain (most areas) +US +433649-1161209 America/Boise Mountain - ID (south); OR (east) +US +332654-1120424 America/Phoenix MST - Arizona (except Navajo) +US +340308-1181434 America/Los_Angeles Pacific +US +611305-1495401 America/Anchorage Alaska (most areas) +US +581807-1342511 America/Juneau Alaska - Juneau area +US +571035-1351807 America/Sitka Alaska - Sitka area +US +550737-1313435 America/Metlakatla Alaska - Annette Island +US +593249-1394338 America/Yakutat Alaska - Yakutat +US +643004-1652423 America/Nome Alaska (west) US +515248-1763929 America/Adak Aleutian Islands US +211825-1575130 Pacific/Honolulu Hawaii UY -3453-05611 America/Montevideo -UZ +3940+06648 Asia/Samarkand west Uzbekistan -UZ +4120+06918 Asia/Tashkent east Uzbekistan +UZ +3940+06648 Asia/Samarkand Uzbekistan (west) +UZ +4120+06918 Asia/Tashkent Uzbekistan (east) VA +415408+0122711 Europe/Vatican VC +1309-06114 America/St_Vincent VE +1030-06656 America/Caracas diff --git a/contrib/tzdata/zone1970.tab b/contrib/tzdata/zone1970.tab index a66f0f6..8286303 100644 --- a/contrib/tzdata/zone1970.tab +++ b/contrib/tzdata/zone1970.tab @@ -39,21 +39,21 @@ AE,OM +2518+05518 Asia/Dubai AF +3431+06912 Asia/Kabul AL +4120+01950 Europe/Tirane AM +4011+04430 
Asia/Yerevan -AQ -6734-06808 Antarctica/Rothera Rothera Station, Adelaide Island -AQ -6448-06406 Antarctica/Palmer Palmer Station, Anvers Island -AQ -6736+06253 Antarctica/Mawson Mawson Station, Holme Bay -AQ -6835+07758 Antarctica/Davis Davis Station, Vestfold Hills -AQ -6617+11031 Antarctica/Casey Casey Station, Bailey Peninsula -AQ -7824+10654 Antarctica/Vostok Vostok Station, Lake Vostok -AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville Station, Adélie Land -AQ -690022+0393524 Antarctica/Syowa Syowa Station, E Ongul I -AQ -720041+0023206 Antarctica/Troll Troll Station, Queen Maud Land +AQ -6617+11031 Antarctica/Casey Casey +AQ -6835+07758 Antarctica/Davis Davis +AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville +AQ -6736+06253 Antarctica/Mawson Mawson +AQ -6448-06406 Antarctica/Palmer Palmer +AQ -6734-06808 Antarctica/Rothera Rothera +AQ -690022+0393524 Antarctica/Syowa Syowa +AQ -720041+0023206 Antarctica/Troll Troll +AQ -7824+10654 Antarctica/Vostok Vostok AR -3436-05827 America/Argentina/Buenos_Aires Buenos Aires (BA, CF) -AR -3124-06411 America/Argentina/Cordoba most locations (CB, CC, CN, ER, FM, MN, SE, SF) -AR -2447-06525 America/Argentina/Salta (SA, LP, NQ, RN) +AR -3124-06411 America/Argentina/Cordoba Argentina (most areas: CB, CC, CN, ER, FM, MN, SE, SF) +AR -2447-06525 America/Argentina/Salta Salta (SA, LP, NQ, RN) AR -2411-06518 America/Argentina/Jujuy Jujuy (JY) AR -2649-06513 America/Argentina/Tucuman Tucumán (TM) -AR -2828-06547 America/Argentina/Catamarca Catamarca (CT), Chubut (CH) +AR -2828-06547 America/Argentina/Catamarca Catamarca (CT); Chubut (CH) AR -2926-06651 America/Argentina/La_Rioja La Rioja (LR) AR -3132-06831 America/Argentina/San_Juan San Juan (SJ) AR -3253-06849 America/Argentina/Mendoza Mendoza (MZ) @@ -64,17 +64,17 @@ AS,UM -1416-17042 Pacific/Pago_Pago Samoa, Midway AT +4813+01620 Europe/Vienna AU -3133+15905 Australia/Lord_Howe Lord Howe Island AU -5430+15857 Antarctica/Macquarie Macquarie Island -AU 
-4253+14719 Australia/Hobart Tasmania - most locations -AU -3956+14352 Australia/Currie Tasmania - King Island +AU -4253+14719 Australia/Hobart Tasmania (most areas) +AU -3956+14352 Australia/Currie Tasmania (King Island) AU -3749+14458 Australia/Melbourne Victoria -AU -3352+15113 Australia/Sydney New South Wales - most locations -AU -3157+14127 Australia/Broken_Hill New South Wales - Yancowinna -AU -2728+15302 Australia/Brisbane Queensland - most locations -AU -2016+14900 Australia/Lindeman Queensland - Holiday Islands +AU -3352+15113 Australia/Sydney New South Wales (most areas) +AU -3157+14127 Australia/Broken_Hill New South Wales (Yancowinna) +AU -2728+15302 Australia/Brisbane Queensland (most areas) +AU -2016+14900 Australia/Lindeman Queensland (Whitsunday Islands) AU -3455+13835 Australia/Adelaide South Australia AU -1228+13050 Australia/Darwin Northern Territory -AU -3157+11551 Australia/Perth Western Australia - most locations -AU -3143+12852 Australia/Eucla Western Australia - Eucla area +AU -3157+11551 Australia/Perth Western Australia (most areas) +AU -3143+12852 Australia/Eucla Western Australia (Eucla) AZ +4023+04951 Asia/Baku BB +1306-05937 America/Barbados BD +2343+09025 Asia/Dhaka @@ -84,58 +84,58 @@ BM +3217-06446 Atlantic/Bermuda BN +0456+11455 Asia/Brunei BO -1630-06809 America/La_Paz BR -0351-03225 America/Noronha Atlantic islands -BR -0127-04829 America/Belem Amapá, E Pará -BR -0343-03830 America/Fortaleza NE Brazil (MA, PI, CE, RN, PB) +BR -0127-04829 America/Belem Pará (east); Amapá +BR -0343-03830 America/Fortaleza Brazil (northeast: MA, PI, CE, RN, PB) BR -0803-03454 America/Recife Pernambuco BR -0712-04812 America/Araguaina Tocantins BR -0940-03543 America/Maceio Alagoas, Sergipe BR -1259-03831 America/Bahia Bahia -BR -2332-04637 America/Sao_Paulo S & SE Brazil (GO, DF, MG, ES, RJ, SP, PR, SC, RS) +BR -2332-04637 America/Sao_Paulo Brazil (southeast: GO, DF, MG, ES, RJ, SP, PR, SC, RS) BR -2027-05437 America/Campo_Grande Mato Grosso do Sul 
BR -1535-05605 America/Cuiaba Mato Grosso -BR -0226-05452 America/Santarem W Pará +BR -0226-05452 America/Santarem Pará (west) BR -0846-06354 America/Porto_Velho Rondônia BR +0249-06040 America/Boa_Vista Roraima -BR -0308-06001 America/Manaus E Amazonas -BR -0640-06952 America/Eirunepe W Amazonas +BR -0308-06001 America/Manaus Amazonas (east) +BR -0640-06952 America/Eirunepe Amazonas (west) BR -0958-06748 America/Rio_Branco Acre BS +2505-07721 America/Nassau BT +2728+08939 Asia/Thimphu BY +5354+02734 Europe/Minsk BZ +1730-08812 America/Belize -CA +4734-05243 America/St_Johns Newfoundland Time, including SE Labrador -CA +4439-06336 America/Halifax Atlantic Time - Nova Scotia (peninsula), PEI -CA +4612-05957 America/Glace_Bay Atlantic Time - Nova Scotia (Cape Breton) -CA +4606-06447 America/Moncton Atlantic Time - New Brunswick -CA +5320-06025 America/Goose_Bay Atlantic Time - Labrador - most locations -CA +5125-05707 America/Blanc-Sablon Atlantic Standard Time - Quebec - Lower North Shore -CA +4339-07923 America/Toronto Eastern Time - Ontario & Quebec - most locations -CA +4901-08816 America/Nipigon Eastern Time - Ontario & Quebec - places that did not observe DST 1967-1973 -CA +4823-08915 America/Thunder_Bay Eastern Time - Thunder Bay, Ontario -CA +6344-06828 America/Iqaluit Eastern Time - east Nunavut - most locations -CA +6608-06544 America/Pangnirtung Eastern Time - Pangnirtung, Nunavut -CA +744144-0944945 America/Resolute Central Time - Resolute, Nunavut -CA +484531-0913718 America/Atikokan Eastern Standard Time - Atikokan, Ontario and Southampton I, Nunavut -CA +624900-0920459 America/Rankin_Inlet Central Time - central Nunavut -CA +4953-09709 America/Winnipeg Central Time - Manitoba & west Ontario -CA +4843-09434 America/Rainy_River Central Time - Rainy River & Fort Frances, Ontario -CA +5024-10439 America/Regina Central Standard Time - Saskatchewan - most locations -CA +5017-10750 America/Swift_Current Central Standard Time - Saskatchewan - midwest -CA 
+5333-11328 America/Edmonton Mountain Time - Alberta, east British Columbia & west Saskatchewan -CA +690650-1050310 America/Cambridge_Bay Mountain Time - west Nunavut -CA +6227-11421 America/Yellowknife Mountain Time - central Northwest Territories -CA +682059-1334300 America/Inuvik Mountain Time - west Northwest Territories -CA +4906-11631 America/Creston Mountain Standard Time - Creston, British Columbia -CA +5946-12014 America/Dawson_Creek Mountain Standard Time - Dawson Creek & Fort Saint John, British Columbia -CA +5848-12242 America/Fort_Nelson Mountain Standard Time - Fort Nelson, British Columbia -CA +4916-12307 America/Vancouver Pacific Time - west British Columbia -CA +6043-13503 America/Whitehorse Pacific Time - south Yukon -CA +6404-13925 America/Dawson Pacific Time - north Yukon +CA +4734-05243 America/St_Johns Newfoundland; Labrador (southeast) +CA +4439-06336 America/Halifax Atlantic - NS (most areas); PE +CA +4612-05957 America/Glace_Bay Atlantic - NS (Cape Breton) +CA +4606-06447 America/Moncton Atlantic - New Brunswick +CA +5320-06025 America/Goose_Bay Atlantic - Labrador (most areas) +CA +5125-05707 America/Blanc-Sablon AST - QC (Lower North Shore) +CA +4339-07923 America/Toronto Eastern - ON, QC (most areas) +CA +4901-08816 America/Nipigon Eastern - ON, QC (no DST 1967-73) +CA +4823-08915 America/Thunder_Bay Eastern - ON (Thunder Bay) +CA +6344-06828 America/Iqaluit Eastern - NU (most east areas) +CA +6608-06544 America/Pangnirtung Eastern - NU (Pangnirtung) +CA +484531-0913718 America/Atikokan EST - ON (Atikokan); NU (Coral H) +CA +4953-09709 America/Winnipeg Central - ON (west); Manitoba +CA +4843-09434 America/Rainy_River Central - ON (Rainy R, Ft Frances) +CA +744144-0944945 America/Resolute Central - NU (Resolute) +CA +624900-0920459 America/Rankin_Inlet Central - NU (central) +CA +5024-10439 America/Regina CST - SK (most areas) +CA +5017-10750 America/Swift_Current CST - SK (midwest) +CA +5333-11328 America/Edmonton Mountain - AB; BC (E); 
SK (W) +CA +690650-1050310 America/Cambridge_Bay Mountain - NU (west) +CA +6227-11421 America/Yellowknife Mountain - NT (central) +CA +682059-1334300 America/Inuvik Mountain - NT (west) +CA +4906-11631 America/Creston MST - BC (Creston) +CA +5946-12014 America/Dawson_Creek MST - BC (Dawson Cr, Ft St John) +CA +5848-12242 America/Fort_Nelson MST - BC (Ft Nelson) +CA +4916-12307 America/Vancouver Pacific - BC (most areas) +CA +6043-13503 America/Whitehorse Pacific - Yukon (south) +CA +6404-13925 America/Dawson Pacific - Yukon (north) CC -1210+09655 Indian/Cocos CH,DE,LI +4723+00832 Europe/Zurich Swiss time CI,BF,GM,GN,ML,MR,SH,SL,SN,ST,TG +0519-00402 Africa/Abidjan CK -2114-15946 Pacific/Rarotonga -CL -3327-07040 America/Santiago most locations +CL -3327-07040 America/Santiago Chile (most areas) CL -2709-10926 Pacific/Easter Easter Island CN +3114+12128 Asia/Shanghai Beijing Time CN +4348+08735 Asia/Urumqi Xinjiang Time @@ -147,23 +147,23 @@ CW,AW,BQ,SX +1211-06900 America/Curacao CX -1025+10543 Indian/Christmas CY +3510+03322 Asia/Nicosia CZ,SK +5005+01426 Europe/Prague -DE +5230+01322 Europe/Berlin Berlin time +DE +5230+01322 Europe/Berlin Germany (most areas) DK +5540+01235 Europe/Copenhagen DO +1828-06954 America/Santo_Domingo DZ +3647+00303 Africa/Algiers -EC -0210-07950 America/Guayaquil mainland +EC -0210-07950 America/Guayaquil Ecuador (mainland) EC -0054-08936 Pacific/Galapagos Galápagos Islands EE +5925+02445 Europe/Tallinn EG +3003+03115 Africa/Cairo EH +2709-01312 Africa/El_Aaiun -ES +4024-00341 Europe/Madrid mainland -ES +3553-00519 Africa/Ceuta Ceuta & Melilla +ES +4024-00341 Europe/Madrid Spain (mainland) +ES +3553-00519 Africa/Ceuta Ceuta, Melilla ES +2806-01524 Atlantic/Canary Canary Islands FI,AX +6010+02458 Europe/Helsinki FJ -1808+17825 Pacific/Fiji FK -5142-05751 Atlantic/Stanley -FM +0725+15147 Pacific/Chuuk Chuuk (Truk) and Yap -FM +0658+15813 Pacific/Pohnpei Pohnpei (Ponape) +FM +0725+15147 Pacific/Chuuk Chuuk/Truk, Yap +FM +0658+15813 
Pacific/Pohnpei Pohnpei/Ponape FM +0519+16259 Pacific/Kosrae Kosrae FO +6201-00646 Atlantic/Faroe FR +4852+00220 Europe/Paris @@ -172,10 +172,10 @@ GE +4143+04449 Asia/Tbilisi GF +0456-05220 America/Cayenne GH +0533-00013 Africa/Accra GI +3608-00521 Europe/Gibraltar -GL +6411-05144 America/Godthab most locations -GL +7646-01840 America/Danmarkshavn east coast, north of Scoresbysund -GL +7029-02158 America/Scoresbysund Scoresbysund / Ittoqqortoormiit -GL +7634-06847 America/Thule Thule / Pituffik +GL +6411-05144 America/Godthab Greenland (most areas) +GL +7646-01840 America/Danmarkshavn National Park (east coast) +GL +7029-02158 America/Scoresbysund Scoresbysund/Ittoqqortoormiit +GL +7634-06847 America/Thule Thule/Pituffik GR +3758+02343 Europe/Athens GS -5416-03632 Atlantic/South_Georgia GT +1438-09031 America/Guatemala @@ -186,10 +186,10 @@ HK +2217+11409 Asia/Hong_Kong HN +1406-08713 America/Tegucigalpa HT +1832-07220 America/Port-au-Prince HU +4730+01905 Europe/Budapest -ID -0610+10648 Asia/Jakarta Java & Sumatra -ID -0002+10920 Asia/Pontianak west & central Borneo -ID -0507+11924 Asia/Makassar east & south Borneo, Sulawesi (Celebes), Bali, Nusa Tengarra, west Timor -ID -0232+14042 Asia/Jayapura west New Guinea (Irian Jaya) & Malukus (Moluccas) +ID -0610+10648 Asia/Jakarta Java, Sumatra +ID -0002+10920 Asia/Pontianak Borneo (west, central) +ID -0507+11924 Asia/Makassar Borneo (east, south); Sulawesi/Celebes, Bali, Nusa Tengarra; Timor (west) +ID -0232+14042 Asia/Jayapura New Guinea (West Papua / Irian Jaya); Malukus/Moluccas IE +5320-00615 Europe/Dublin IL +314650+0351326 Asia/Jerusalem IN +2232+08822 Asia/Kolkata @@ -208,11 +208,10 @@ KI -0308-17105 Pacific/Enderbury Phoenix Islands KI +0152-15720 Pacific/Kiritimati Line Islands KP +3901+12545 Asia/Pyongyang KR +3733+12658 Asia/Seoul -KY +1918-08123 America/Cayman -KZ +4315+07657 Asia/Almaty most locations -KZ +4448+06528 Asia/Qyzylorda Qyzylorda (Kyzylorda, Kzyl-Orda) -KZ +5017+05710 Asia/Aqtobe Aqtobe 
(Aktobe) -KZ +4431+05016 Asia/Aqtau Atyrau (Atirau, Gur'yev), Mangghystau (Mankistau) +KZ +4315+07657 Asia/Almaty Kazakhstan (most areas) +KZ +4448+06528 Asia/Qyzylorda Qyzylorda/Kyzylorda/Kzyl-Orda +KZ +5017+05710 Asia/Aqtobe Aqtobe/Aktobe +KZ +4431+05016 Asia/Aqtau Atyrau/Atirau/Gur'yev, Mangghystau/Mankistau KZ +5113+05121 Asia/Oral West Kazakhstan LB +3353+03530 Asia/Beirut LK +0656+07951 Asia/Colombo @@ -224,10 +223,10 @@ LY +3254+01311 Africa/Tripoli MA +3339-00735 Africa/Casablanca MC +4342+00723 Europe/Monaco MD +4700+02850 Europe/Chisinau -MH +0709+17112 Pacific/Majuro most locations +MH +0709+17112 Pacific/Majuro Marshall Islands (most areas) MH +0905+16720 Pacific/Kwajalein Kwajalein -MM +1647+09610 Asia/Rangoon -MN +4755+10653 Asia/Ulaanbaatar most locations +MM +1647+09610 Asia/Yangon +MN +4755+10653 Asia/Ulaanbaatar Mongolia (most areas) MN +4801+09139 Asia/Hovd Bayan-Ölgii, Govi-Altai, Hovd, Uvs, Zavkhan MN +4804+11430 Asia/Choibalsan Dornod, Sükhbaatar MO +2214+11335 Asia/Macau @@ -235,25 +234,24 @@ MQ +1436-06105 America/Martinique MT +3554+01431 Europe/Malta MU -2010+05730 Indian/Mauritius MV +0410+07330 Indian/Maldives -MX +1924-09909 America/Mexico_City Central Time - most locations +MX +1924-09909 America/Mexico_City Central Time MX +2105-08646 America/Cancun Eastern Standard Time - Quintana Roo MX +2058-08937 America/Merida Central Time - Campeche, Yucatán -MX +2540-10019 America/Monterrey Mexican Central Time - Coahuila, Durango, Nuevo León, Tamaulipas away from US border -MX +2550-09730 America/Matamoros US Central Time - Coahuila, Durango, Nuevo León, Tamaulipas near US border -MX +2313-10625 America/Mazatlan Mountain Time - S Baja, Nayarit, Sinaloa -MX +2838-10605 America/Chihuahua Mexican Mountain Time - Chihuahua away from US border -MX +2934-10425 America/Ojinaga US Mountain Time - Chihuahua near US border +MX +2540-10019 America/Monterrey Central Time - Durango; Coahuila, Nuevo León, Tamaulipas (most areas) +MX +2550-09730 
America/Matamoros Central Time US - Coahuila, Nuevo León, Tamaulipas (US border) +MX +2313-10625 America/Mazatlan Mountain Time - Baja California Sur, Nayarit, Sinaloa +MX +2838-10605 America/Chihuahua Mountain Time - Chihuahua (most areas) +MX +2934-10425 America/Ojinaga Mountain Time US - Chihuahua (US border) MX +2904-11058 America/Hermosillo Mountain Standard Time - Sonora -MX +3232-11701 America/Tijuana US Pacific Time - Baja California near US border -MX +3018-11452 America/Santa_Isabel Mexican Pacific Time - Baja California away from US border -MX +2048-10515 America/Bahia_Banderas Mexican Central Time - Bahía de Banderas -MY +0310+10142 Asia/Kuala_Lumpur peninsular Malaysia -MY +0133+11020 Asia/Kuching Sabah & Sarawak -MZ,BI,BW,CD,MW,RW,ZM,ZW -2558+03235 Africa/Maputo Central Africa Time (UTC+2) +MX +3232-11701 America/Tijuana Pacific Time US - Baja California +MX +2048-10515 America/Bahia_Banderas Central Time - Bahía de Banderas +MY +0310+10142 Asia/Kuala_Lumpur Malaysia (peninsula) +MY +0133+11020 Asia/Kuching Sabah, Sarawak +MZ,BI,BW,CD,MW,RW,ZM,ZW -2558+03235 Africa/Maputo Central Africa Time NA -2234+01706 Africa/Windhoek NC -2216+16627 Pacific/Noumea NF -2903+16758 Pacific/Norfolk -NG,AO,BJ,CD,CF,CG,CM,GA,GQ,NE +0627+00324 Africa/Lagos West Africa Time (UTC+1) +NG,AO,BJ,CD,CF,CG,CM,GA,GQ,NE +0627+00324 Africa/Lagos West Africa Time NI +1209-08617 America/Managua NL +5222+00454 Europe/Amsterdam NO,SJ +5955+01045 Europe/Oslo @@ -262,12 +260,12 @@ NR -0031+16655 Pacific/Nauru NU -1901-16955 Pacific/Niue NZ,AQ -3652+17446 Pacific/Auckland New Zealand time NZ -4357-17633 Pacific/Chatham Chatham Islands -PA +0858-07932 America/Panama +PA,KY +0858-07932 America/Panama PE -1203-07703 America/Lima PF -1732-14934 Pacific/Tahiti Society Islands PF -0900-13930 Pacific/Marquesas Marquesas Islands PF -2308-13457 Pacific/Gambier Gambier Islands -PG -0930+14710 Pacific/Port_Moresby most locations +PG -0930+14710 Pacific/Port_Moresby Papua New Guinea (most areas) PG 
-0613+15534 Pacific/Bougainville Bougainville PH +1435+12100 Asia/Manila PK +2452+06703 Asia/Karachi @@ -277,36 +275,41 @@ PN -2504-13005 Pacific/Pitcairn PR +182806-0660622 America/Puerto_Rico PS +3130+03428 Asia/Gaza Gaza Strip PS +313200+0350542 Asia/Hebron West Bank -PT +3843-00908 Europe/Lisbon mainland +PT +3843-00908 Europe/Lisbon Portugal (mainland) PT +3238-01654 Atlantic/Madeira Madeira Islands PT +3744-02540 Atlantic/Azores Azores PW +0720+13429 Pacific/Palau PY -2516-05740 America/Asuncion QA,BH +2517+05132 Asia/Qatar -RE,TF -2052+05528 Indian/Reunion Réunion, Crozet Is, Scattered Is +RE,TF -2052+05528 Indian/Reunion Réunion, Crozet, Scattered Islands RO +4426+02606 Europe/Bucharest RS,BA,HR,ME,MK,SI +4450+02030 Europe/Belgrade -RU +5443+02030 Europe/Kaliningrad Moscow-01 - Kaliningrad -RU +554521+0373704 Europe/Moscow Moscow+00 - west Russia -RU +4457+03406 Europe/Simferopol Moscow+00 - Crimea -RU +4844+04425 Europe/Volgograd Moscow+00 - Caspian Sea -RU +5312+05009 Europe/Samara Moscow+00 (Moscow+01 after 2014-10-26) - Samara, Udmurtia -RU +5651+06036 Asia/Yekaterinburg Moscow+02 - Urals -RU +5500+07324 Asia/Omsk Moscow+03 - west Siberia -RU +5502+08255 Asia/Novosibirsk Moscow+03 - Novosibirsk -RU +5345+08707 Asia/Novokuznetsk Moscow+03 (Moscow+04 after 2014-10-26) - Kemerovo -RU +5601+09250 Asia/Krasnoyarsk Moscow+04 - Yenisei River -RU +5216+10420 Asia/Irkutsk Moscow+05 - Lake Baikal -RU +5203+11328 Asia/Chita Moscow+06 (Moscow+05 after 2014-10-26) - Zabaykalsky -RU +6200+12940 Asia/Yakutsk Moscow+06 - Lena River -RU +623923+1353314 Asia/Khandyga Moscow+06 - Tomponsky, Ust-Maysky -RU +4310+13156 Asia/Vladivostok Moscow+07 - Amur River -RU +4658+14242 Asia/Sakhalin Moscow+07 - Sakhalin Island -RU +643337+1431336 Asia/Ust-Nera Moscow+07 - Oymyakonsky -RU +5934+15048 Asia/Magadan Moscow+08 (Moscow+07 after 2014-10-26) - Magadan -RU +6728+15343 Asia/Srednekolymsk Moscow+08 - E Sakha, N Kuril Is -RU +5301+15839 Asia/Kamchatka Moscow+08 (Moscow+09 after 
2014-10-26) - Kamchatka -RU +6445+17729 Asia/Anadyr Moscow+08 (Moscow+09 after 2014-10-26) - Bering Sea +RU +5443+02030 Europe/Kaliningrad MSK-01 - Kaliningrad +RU +554521+0373704 Europe/Moscow MSK+00 - Moscow area +RU +4457+03406 Europe/Simferopol MSK+00 - Crimea +RU +4844+04425 Europe/Volgograd MSK+00 - Volgograd, Saratov +RU +5836+04939 Europe/Kirov MSK+00 - Kirov +RU +4621+04803 Europe/Astrakhan MSK+01 - Astrakhan +RU +5312+05009 Europe/Samara MSK+01 - Samara, Udmurtia +RU +5420+04824 Europe/Ulyanovsk MSK+01 - Ulyanovsk +RU +5651+06036 Asia/Yekaterinburg MSK+02 - Urals +RU +5500+07324 Asia/Omsk MSK+03 - Omsk +RU +5502+08255 Asia/Novosibirsk MSK+03 - Novosibirsk +RU +5322+08345 Asia/Barnaul MSK+04 - Altai +RU +5630+08458 Asia/Tomsk MSK+04 - Tomsk +RU +5345+08707 Asia/Novokuznetsk MSK+04 - Kemerovo +RU +5601+09250 Asia/Krasnoyarsk MSK+04 - Krasnoyarsk area +RU +5216+10420 Asia/Irkutsk MSK+05 - Irkutsk, Buryatia +RU +5203+11328 Asia/Chita MSK+06 - Zabaykalsky +RU +6200+12940 Asia/Yakutsk MSK+06 - Lena River +RU +623923+1353314 Asia/Khandyga MSK+06 - Tomponsky, Ust-Maysky +RU +4310+13156 Asia/Vladivostok MSK+07 - Amur River +RU +643337+1431336 Asia/Ust-Nera MSK+07 - Oymyakonsky +RU +5934+15048 Asia/Magadan MSK+08 - Magadan +RU +4658+14242 Asia/Sakhalin MSK+08 - Sakhalin Island +RU +6728+15343 Asia/Srednekolymsk MSK+08 - Sakha (E); North Kuril Is +RU +5301+15839 Asia/Kamchatka MSK+09 - Kamchatka +RU +6445+17729 Asia/Anadyr MSK+09 - Bering Sea SA,KW,YE +2438+04643 Asia/Riyadh SB -0932+16012 Pacific/Guadalcanal SC -0440+05528 Indian/Mahe @@ -318,8 +321,8 @@ SV +1342-08912 America/El_Salvador SY +3330+03618 Asia/Damascus TC +2128-07108 America/Grand_Turk TD +1207+01503 Africa/Ndjamena -TF -492110+0701303 Indian/Kerguelen Kerguelen, St Paul I, Amsterdam I -TH,KH,LA,VN +1345+10031 Asia/Bangkok most of Indochina +TF -492110+0701303 Indian/Kerguelen Kerguelen, St Paul Island, Amsterdam Island +TH,KH,LA,VN +1345+10031 Asia/Bangkok Indochina (most areas) TJ +3835+06848 
Asia/Dushanbe TK -0922-17114 Pacific/Fakaofo TL -0833+12535 Asia/Dili @@ -330,44 +333,44 @@ TR +4101+02858 Europe/Istanbul TT,AG,AI,BL,DM,GD,GP,KN,LC,MF,MS,VC,VG,VI +1039-06131 America/Port_of_Spain TV -0831+17913 Pacific/Funafuti TW +2503+12130 Asia/Taipei -UA +5026+03031 Europe/Kiev most locations +UA +5026+03031 Europe/Kiev Ukraine (most areas) UA +4837+02218 Europe/Uzhgorod Ruthenia -UA +4750+03510 Europe/Zaporozhye Zaporozh'ye, E Lugansk / Zaporizhia, E Luhansk +UA +4750+03510 Europe/Zaporozhye Zaporozh'ye/Zaporizhia; Lugansk/Luhansk (east) UM +1917+16637 Pacific/Wake Wake Island -US +404251-0740023 America/New_York Eastern Time -US +421953-0830245 America/Detroit Eastern Time - Michigan - most locations -US +381515-0854534 America/Kentucky/Louisville Eastern Time - Kentucky - Louisville area -US +364947-0845057 America/Kentucky/Monticello Eastern Time - Kentucky - Wayne County -US +394606-0860929 America/Indiana/Indianapolis Eastern Time - Indiana - most locations -US +384038-0873143 America/Indiana/Vincennes Eastern Time - Indiana - Daviess, Dubois, Knox & Martin Counties -US +410305-0863611 America/Indiana/Winamac Eastern Time - Indiana - Pulaski County -US +382232-0862041 America/Indiana/Marengo Eastern Time - Indiana - Crawford County -US +382931-0871643 America/Indiana/Petersburg Eastern Time - Indiana - Pike County -US +384452-0850402 America/Indiana/Vevay Eastern Time - Indiana - Switzerland County -US +415100-0873900 America/Chicago Central Time -US +375711-0864541 America/Indiana/Tell_City Central Time - Indiana - Perry County -US +411745-0863730 America/Indiana/Knox Central Time - Indiana - Starke County -US +450628-0873651 America/Menominee Central Time - Michigan - Dickinson, Gogebic, Iron & Menominee Counties -US +470659-1011757 America/North_Dakota/Center Central Time - North Dakota - Oliver County -US +465042-1012439 America/North_Dakota/New_Salem Central Time - North Dakota - Morton County (except Mandan area) -US +471551-1014640 
America/North_Dakota/Beulah Central Time - North Dakota - Mercer County -US +394421-1045903 America/Denver Mountain Time -US +433649-1161209 America/Boise Mountain Time - south Idaho & east Oregon -US +332654-1120424 America/Phoenix Mountain Standard Time - Arizona (except Navajo) -US +340308-1181434 America/Los_Angeles Pacific Time -US +550737-1313435 America/Metlakatla Pacific Standard Time - Annette Island, Alaska -US +611305-1495401 America/Anchorage Alaska Time -US +581807-1342511 America/Juneau Alaska Time - Alaska panhandle -US +571035-1351807 America/Sitka Alaska Time - southeast Alaska panhandle -US +593249-1394338 America/Yakutat Alaska Time - Alaska panhandle neck -US +643004-1652423 America/Nome Alaska Time - west Alaska +US +404251-0740023 America/New_York Eastern (most areas) +US +421953-0830245 America/Detroit Eastern - MI (most areas) +US +381515-0854534 America/Kentucky/Louisville Eastern - KY (Louisville area) +US +364947-0845057 America/Kentucky/Monticello Eastern - KY (Wayne) +US +394606-0860929 America/Indiana/Indianapolis Eastern - IN (most areas) +US +384038-0873143 America/Indiana/Vincennes Eastern - IN (Da, Du, K, Mn) +US +410305-0863611 America/Indiana/Winamac Eastern - IN (Pulaski) +US +382232-0862041 America/Indiana/Marengo Eastern - IN (Crawford) +US +382931-0871643 America/Indiana/Petersburg Eastern - IN (Pike) +US +384452-0850402 America/Indiana/Vevay Eastern - IN (Switzerland) +US +415100-0873900 America/Chicago Central (most areas) +US +375711-0864541 America/Indiana/Tell_City Central - IN (Perry) +US +411745-0863730 America/Indiana/Knox Central - IN (Starke) +US +450628-0873651 America/Menominee Central - MI (Wisconsin border) +US +470659-1011757 America/North_Dakota/Center Central - ND (Oliver) +US +465042-1012439 America/North_Dakota/New_Salem Central - ND (Morton rural) +US +471551-1014640 America/North_Dakota/Beulah Central - ND (Mercer) +US +394421-1045903 America/Denver Mountain (most areas) +US +433649-1161209 America/Boise 
Mountain - ID (south); OR (east) +US +332654-1120424 America/Phoenix MST - Arizona (except Navajo) +US +340308-1181434 America/Los_Angeles Pacific +US +611305-1495401 America/Anchorage Alaska (most areas) +US +581807-1342511 America/Juneau Alaska - Juneau area +US +571035-1351807 America/Sitka Alaska - Sitka area +US +550737-1313435 America/Metlakatla Alaska - Annette Island +US +593249-1394338 America/Yakutat Alaska - Yakutat +US +643004-1652423 America/Nome Alaska (west) US +515248-1763929 America/Adak Aleutian Islands -US,UM +211825-1575130 Pacific/Honolulu Hawaii time +US,UM +211825-1575130 Pacific/Honolulu Hawaii UY -3453-05611 America/Montevideo -UZ +3940+06648 Asia/Samarkand west Uzbekistan -UZ +4120+06918 Asia/Tashkent east Uzbekistan +UZ +3940+06648 Asia/Samarkand Uzbekistan (west) +UZ +4120+06918 Asia/Tashkent Uzbekistan (east) VE +1030-06656 America/Caracas -VN +1045+10640 Asia/Ho_Chi_Minh south Vietnam +VN +1045+10640 Asia/Ho_Chi_Minh Vietnam (south) VU -1740+16825 Pacific/Efate WF -1318-17610 Pacific/Wallis WS -1350-17144 Pacific/Apia diff --git a/games/morse/morse.6 b/games/morse/morse.6 index f0d30d7..94645bb 100644 --- a/games/morse/morse.6 +++ b/games/morse/morse.6 @@ -177,7 +177,7 @@ device file Sound support for .Nm added by -.An Lyndon Nerenberg (VE6BBM) Aq lyndon@orthanc.ca . +.An Lyndon Nerenberg (VE6BBM) Aq Mt lyndon@orthanc.ca . .Pp Ability to key an external device added by .An J\(:org Wunsch diff --git a/games/random/random.6 b/games/random/random.6 index 5481576..bd38ba6 100644 --- a/games/random/random.6 +++ b/games/random/random.6 @@ -117,7 +117,7 @@ instead of newlines. .Sh HISTORY The functionality to randomizing lines and words was added in 2003 by -.An "Sean Chittenden" Aq seanc@FreeBSD.org . +.An Sean Chittenden Aq Mt seanc@FreeBSD.org . .Sh BUGS No index is used when printing out tokens from the list which makes it rather slow for large files (10MB+). 
diff --git a/lib/libc/sys/_exit.2 b/lib/libc/sys/_exit.2 index b35e7c4..120ef1c 100644 --- a/lib/libc/sys/_exit.2 +++ b/lib/libc/sys/_exit.2 @@ -28,7 +28,7 @@ .\" @(#)_exit.2 8.1 (Berkeley) 6/4/93 .\" $FreeBSD$ .\" -.Dd June 4, 1993 +.Dd September 8, 2016 .Dt EXIT 2 .Os .Sh NAME @@ -64,9 +64,11 @@ is set as defined by .Xr wait 2 . .It The parent process-ID of all of the calling process's existing child -processes are set to 1; the initialization process +processes are set to the process-ID of the calling process's reaper; +the reaper (normally the initialization process) inherits each of these processes (see +.Xr procctl 2 , .Xr init 8 and the .Sx DEFINITIONS diff --git a/lib/libc/sys/intro.2 b/lib/libc/sys/intro.2 index bc10b1d..fb25195 100644 --- a/lib/libc/sys/intro.2 +++ b/lib/libc/sys/intro.2 @@ -28,7 +28,7 @@ .\" @(#)intro.2 8.5 (Berkeley) 2/27/95 .\" $FreeBSD$ .\" -.Dd May 4, 2013 +.Dd September 8, 2016 .Dt INTRO 2 .Os .Sh NAME @@ -486,7 +486,10 @@ A new process is created by a currently active process (see .Xr fork 2 ) . The parent process ID of a process is initially the process ID of its creator. If the creating process exits, -the parent process ID of each child is set to the ID of a system process, +the parent process ID of each child is set to the ID of the calling process's +reaper (see +.Xr procctl 2 ) , +normally .Xr init 8 . .It Process Group Each active process is a member of a process group that is identified by @@ -535,7 +538,7 @@ when none of its members has a parent process that is in the same session as the group, but is in a different process group. Note that when a process exits, the parent process for its children -is changed to be +is normally changed to be .Xr init 8 , which is in a separate session. 
Not all members of an orphaned process group are necessarily orphaned diff --git a/lib/libstand/nfs.c b/lib/libstand/nfs.c index a0b726c..16d6233 100644 --- a/lib/libstand/nfs.c +++ b/lib/libstand/nfs.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <sys/stat.h> #include <string.h> +#include <stddef.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -50,7 +51,8 @@ __FBSDID("$FreeBSD$"); #define NFS_DEBUGxx -#define NFSREAD_SIZE 1024 +#define NFSREAD_MIN_SIZE 1024 +#define NFSREAD_MAX_SIZE 4096 /* Define our own NFS attributes without NQNFS stuff. */ #ifdef OLD_NFSV2 @@ -83,7 +85,7 @@ struct nfs_read_repl { n_long errno; struct nfsv2_fattrs fa; n_long count; - u_char data[NFSREAD_SIZE]; + u_char data[NFSREAD_MAX_SIZE]; }; #ifndef NFS_NOSYMLINK @@ -210,6 +212,8 @@ struct fs_ops nfs_fsops = { nfs_readdir }; +static int nfs_read_size = NFSREAD_MIN_SIZE; + #ifdef OLD_NFSV2 /* * Fetch the root file handle (call mount daemon) @@ -264,6 +268,17 @@ nfs_getrootfh(struct iodesc *d, char *path, u_char *fhp) if (repl->errno) return (ntohl(repl->errno)); bcopy(repl->fh, fhp, sizeof(repl->fh)); + + /* + * Improve boot performance over NFS + */ + if (getenv("nfs.read_size") != NULL) + nfs_read_size = strtol(getenv("nfs.read_size"), NULL, 0); + if (nfs_read_size < NFSREAD_MIN_SIZE) + nfs_read_size = NFSREAD_MIN_SIZE; + if (nfs_read_size > NFSREAD_MAX_SIZE) + nfs_read_size = NFSREAD_MAX_SIZE; + return (0); } @@ -401,11 +416,11 @@ nfs_readdata(struct nfs_iodesc *d, off_t off, void *addr, size_t len) bcopy(d->fh, args->fh, NFS_FHSIZE); args->off = htonl((n_long)off); - if (len > NFSREAD_SIZE) - len = NFSREAD_SIZE; + if (len > nfs_read_size) + len = nfs_read_size; args->len = htonl((n_long)len); args->xxx = htonl((n_long)0); - hlen = sizeof(*repl) - NFSREAD_SIZE; + hlen = offsetof(struct nfs_read_rpl, data[0]); cc = rpc_call(d->iodesc, NFS_PROG, NFS_VER2, NFSPROC_READ, args, sizeof(*args), @@ -1022,7 +1037,7 @@ nfs_readdata(struct nfs_iodesc *d, 
off_t off, void *addr, size_t len) uint32_t count; uint32_t eof; uint32_t len; - u_char data[NFSREAD_SIZE]; + u_char data[NFSREAD_MAX_SIZE]; } *repl; struct { uint32_t h[RPC_HEADER_WORDS]; @@ -1045,10 +1060,10 @@ nfs_readdata(struct nfs_iodesc *d, off_t off, void *addr, size_t len) pos = roundup(d->fhsize, sizeof(uint32_t)) / sizeof(uint32_t); args->fhoffcnt[pos++] = 0; args->fhoffcnt[pos++] = htonl((uint32_t)off); - if (len > NFSREAD_SIZE) - len = NFSREAD_SIZE; + if (len > nfs_read_size) + len = nfs_read_size; args->fhoffcnt[pos] = htonl((uint32_t)len); - hlen = sizeof(*repl) - NFSREAD_SIZE; + hlen = offsetof(struct repl, data[0]); cc = rpc_call(d->iodesc, NFS_PROG, NFS_VER3, NFSPROCV3_READ, args, 4 * sizeof(uint32_t) + roundup(d->fhsize, sizeof(uint32_t)), diff --git a/sbin/adjkerntz/adjkerntz.8 b/sbin/adjkerntz/adjkerntz.8 index 6f0f15e..1bd1391 100644 --- a/sbin/adjkerntz/adjkerntz.8 +++ b/sbin/adjkerntz/adjkerntz.8 @@ -190,4 +190,4 @@ The utility appeared in .Fx 1.0 . .Sh AUTHORS -.An Andrey A. Chernov Aq ache@astral.msk.su +.An Andrey A. Chernov Aq Mt ache@astral.msk.su diff --git a/sbin/atm/atmconfig/atmconfig.8 b/sbin/atm/atmconfig/atmconfig.8 index 1ff76a1..7975ea2 100644 --- a/sbin/atm/atmconfig/atmconfig.8 +++ b/sbin/atm/atmconfig/atmconfig.8 @@ -1,7 +1,7 @@ .\" .\" Copyright (c) 2001-2003 .\" Fraunhofer Institute for Open Communication Systems (FhG Fokus). -.\" All rights reserved. +.\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions @@ -28,7 +28,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 11, 2003 +.Dd October 5, 2016 .Dt ATMCONFIG 8 .Os .Sh NAME @@ -315,5 +315,10 @@ List all NATM routes. .Xr natm 4 , .Xr natmip 4 , .Xr atm 8 +.Sh HISTORY +An +.Nm +command appeared in +.Fx 3.0 . 
.Sh AUTHORS -.An Hartmut Brandt Aq harti@FreeBSD.org +.An Hartmut Brandt Aq Mt harti@FreeBSD.org diff --git a/sbin/bsdlabel/bsdlabel.8 b/sbin/bsdlabel/bsdlabel.8 index 8137171..cfc3ac6 100644 --- a/sbin/bsdlabel/bsdlabel.8 +++ b/sbin/bsdlabel/bsdlabel.8 @@ -31,7 +31,7 @@ .\" @(#)disklabel.8 8.2 (Berkeley) 4/19/94 .\" $FreeBSD$ .\" -.Dd October 1, 2013 +.Dd October 5, 2016 .Dt BSDLABEL 8 .Os .Sh NAME @@ -466,7 +466,7 @@ which could be used as a source file for 8 partitions: # size offset fstype [fsize bsize bps/cpg] - a: 400M 16 4.2BSD 4096 16384 75 # (Cyl. 0 - 812*) + a: 400M 16 4.2BSD 4096 16384 75 # (Cyl. 0 - 812*) b: 1G * swap c: * * unused e: 204800 * 4.2BSD @@ -500,3 +500,8 @@ are not generally compatible. .Xr boot0cfg 8 , .Xr gpart 8 , .Xr newfs 8 +.Sh HISTORY +The +.Nm disklabel +utility appeared in +.Bx 4.3 Tahoe . diff --git a/sbin/camcontrol/camcontrol.8 b/sbin/camcontrol/camcontrol.8 index 40a821a..9f2211c 100644 --- a/sbin/camcontrol/camcontrol.8 +++ b/sbin/camcontrol/camcontrol.8 @@ -2366,7 +2366,7 @@ and first appeared in in .Fx 2.0.5 . .Sh AUTHORS -.An Kenneth Merry Aq ken@FreeBSD.org +.An Kenneth Merry Aq Mt ken@FreeBSD.org .Sh BUGS The code that parses the generic command line arguments does not know that some of the subcommands take multiple arguments. diff --git a/sbin/clri/clri.8 b/sbin/clri/clri.8 index c3c3796..504d431 100644 --- a/sbin/clri/clri.8 +++ b/sbin/clri/clri.8 @@ -28,7 +28,7 @@ .\" @(#)clri.8 8.2 (Berkeley) 4/19/94 .\" $FreeBSD$ .\" -.Dd April 19, 1994 +.Dd October 5, 2016 .Dt CLRI 8 .Os .Sh NAME @@ -70,6 +70,11 @@ will be able to clean up the resulting mess. .Sh SEE ALSO .Xr fsck 8 , .Xr fsdb 8 +.Sh HISTORY +The +.Nm +utility first appeared in +.At v6 . 
.Sh BUGS If the file is open, the work of .Nm diff --git a/sbin/devd/devd.8 b/sbin/devd/devd.8 index 12a92d9..7faf188 100644 --- a/sbin/devd/devd.8 +++ b/sbin/devd/devd.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 14, 2014 +.Dd October 5, 2016 .Dt DEVD 8 .Os .Sh NAME @@ -62,7 +62,8 @@ The default connection limit is 10. Do not process all pending events before becoming a daemon. Instead, call daemon right away. .It Fl q -Quiet mode. Only log messages at priority LOG_WARNING or above. +Quiet mode. +Only log messages at priority LOG_WARNING or above. .El .Sh IMPLEMENTATION NOTES The @@ -153,5 +154,10 @@ A deprecated socket retained for use with old clients. .Sh SEE ALSO .Xr devctl 4 , .Xr devd.conf 5 +.Sh HISTORY +The +.Nm +utility first appeared in +.Fx 5.0 . .Sh AUTHORS .An M. Warner Losh diff --git a/sbin/devd/devd.conf.5 b/sbin/devd/devd.conf.5 index 978485d..39aabae 100644 --- a/sbin/devd/devd.conf.5 +++ b/sbin/devd/devd.conf.5 @@ -41,7 +41,7 @@ .\" ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS .\" SOFTWARE. .\" -.Dd April 14, 2016 +.Dd October 8, 2016 .Dt DEVD.CONF 5 .Os .Sh NAME @@ -637,8 +637,8 @@ The installed .Pa /etc/devd.conf has many additional examples. .Sh SEE ALSO +.Xr cam 4 , .Xr coretemp 4 , .Xr devfs 5 , .Xr re_format 7 , -.Xr devd 8 , -.Xr cam 4 +.Xr devd 8 diff --git a/sbin/devfs/devfs.8 b/sbin/devfs/devfs.8 index 8bbdfcc..01aaf29 100644 --- a/sbin/devfs/devfs.8 +++ b/sbin/devfs/devfs.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd July 12, 2013 +.Dd October 5, 2016 .Dt DEVFS 8 .Os .Sh NAME @@ -249,7 +249,8 @@ configuration file. .It Pa /etc/devfs.rules Local .Nm -configuration file. Rulesets in here override those in +configuration file. +Rulesets in here override those in .Pa /etc/defaults/devfs.rules with the same ruleset number, otherwise the two files are effectively merged. 
.It Pa /etc/devfs.conf @@ -374,5 +375,10 @@ this feature can be used to copy rulesets: .Xr chown 8 , .Xr jail 8 , .Xr mknod 8 +.Sh HISTORY +The +.Nm +utility first appeared in +.Fx 5.0 . .Sh AUTHORS .An Dima Dorfman diff --git a/sbin/dhclient/dhclient-script.8 b/sbin/dhclient/dhclient-script.8 index fdb46a7..2fae9f6 100644 --- a/sbin/dhclient/dhclient-script.8 +++ b/sbin/dhclient/dhclient-script.8 @@ -273,7 +273,7 @@ but current scripts do not do this. The original version of .Nm was written for the Internet Software Consortium by -.An Ted Lemon Aq mellon@fugue.com +.An Ted Lemon Aq Mt mellon@fugue.com in cooperation with Vixie Enterprises. .Pp The @@ -281,7 +281,7 @@ The implementation of .Nm was written by -.An Kenneth R. Westerback Aq krw@openbsd.org . +.An Kenneth R. Westerback Aq Mt krw@openbsd.org . .Sh BUGS If more than one interface is being used, there is no obvious way to avoid clashes between server-supplied configuration parameters - for diff --git a/sbin/dhclient/dhclient.8 b/sbin/dhclient/dhclient.8 index d2b8f437..c6940da 100644 --- a/sbin/dhclient/dhclient.8 +++ b/sbin/dhclient/dhclient.8 @@ -188,9 +188,9 @@ The .Nm utility was written by -.An Ted Lemon Aq mellon@fugue.com +.An Ted Lemon Aq Mt mellon@fugue.com and -.An Elliot Poger Aq elliot@poger.com . +.An Elliot Poger Aq Mt elliot@poger.com . .Pp The current implementation was reworked by -.An Henning Brauer Aq henning@openbsd.org . +.An Henning Brauer Aq Mt henning@openbsd.org . diff --git a/sbin/dhclient/dhclient.conf.5 b/sbin/dhclient/dhclient.conf.5 index 167239e..3b6ae04 100644 --- a/sbin/dhclient/dhclient.conf.5 +++ b/sbin/dhclient/dhclient.conf.5 @@ -537,8 +537,8 @@ The .Xr dhclient 8 utility was written by -.An Ted Lemon Aq mellon@vix.com +.An Ted Lemon Aq Mt mellon@vix.com under a contract with Vixie Labs. .Pp The current implementation was reworked by -.An Henning Brauer Aq henning@openbsd.org . +.An Henning Brauer Aq Mt henning@openbsd.org . 
diff --git a/sbin/dhclient/dhclient.leases.5 b/sbin/dhclient/dhclient.leases.5 index b1f0f3d..f48b106 100644 --- a/sbin/dhclient/dhclient.leases.5 +++ b/sbin/dhclient/dhclient.leases.5 @@ -88,8 +88,8 @@ The .Xr dhclient 8 utility was written by -.An Ted Lemon Aq mellon@vix.com +.An Ted Lemon Aq Mt mellon@vix.com under a contract with Vixie Labs. .Pp The current implementation was reworked by -.An Henning Brauer Aq henning@openbsd.org . +.An Henning Brauer Aq Mt henning@openbsd.org . diff --git a/sbin/dhclient/dhcp-options.5 b/sbin/dhclient/dhcp-options.5 index 1405839..4b65fa7 100644 --- a/sbin/dhclient/dhcp-options.5 +++ b/sbin/dhclient/dhcp-options.5 @@ -603,8 +603,8 @@ The .Xr dhcpd 8 utility was written by -.An Ted Lemon Aq mellon@vix.com +.An Ted Lemon Aq Mt mellon@vix.com under a contract with Vixie Labs. .Pp The current implementation was reworked by -.An Henning Brauer Aq henning@openbsd.org . +.An Henning Brauer Aq Mt henning@openbsd.org . diff --git a/sbin/fdisk/fdisk.8 b/sbin/fdisk/fdisk.8 index 6894ab9..87862ec 100644 --- a/sbin/fdisk/fdisk.8 +++ b/sbin/fdisk/fdisk.8 @@ -1,6 +1,6 @@ .\" $FreeBSD$ .\" -.Dd October 1, 2013 +.Dd October 5, 2016 .Dt FDISK 8 .Os .Sh NAME @@ -177,19 +177,19 @@ An example follows: Information from DOS bootblock is: The data for partition 1 is: sysid 165,(FreeBSD/NetBSD/386BSD) - start 495, size 380160 (185 Meg), flag 0 + start 495, size 380160 (185 Meg), flag 0 beg: cyl 1/ sector 1/ head 0; end: cyl 768/ sector 33/ head 14 The data for partition 2 is: sysid 164,(unknown) - start 378180, size 2475 (1 Meg), flag 0 + start 378180, size 2475 (1 Meg), flag 0 beg: cyl 764/ sector 1/ head 0; end: cyl 768/ sector 33/ head 14 The data for partition 3 is: <UNUSED> The data for partition 4 is: sysid 99,(ISC UNIX, other System V/386, GNU HURD or Mach) - start 380656, size 224234 (109 Meg), flag 80 + start 380656, size 224234 (109 Meg), flag 80 beg: cyl 769/ sector 2/ head 0; end: cyl 197/ sector 33/ head 14 .Ed @@ -485,6 +485,21 @@ The 
default boot code. .Xr bsdlabel 8 , .Xr gpart 8 , .Xr newfs 8 +.Sh HISTORY +A version of +.Nm +first appeared in the Mach Operating System. +It was subsequently ported to +.Bx 386 . +.Sh AUTHORS +.An -nosplit +.Nm +for Mach Operating System was written by +.An Robert Baron Aq Mt rvb@cs.cmu.edu . +It was ported to +.Bx 386 +by +.An Julian Elischer Aq Mt julian@tfs.com . .Sh BUGS The default boot code will not necessarily handle all slice types correctly, in particular those introduced since diff --git a/sbin/fdisk_pc98/fdisk.8 b/sbin/fdisk_pc98/fdisk.8 index aa3a6c4..eea5559 100644 --- a/sbin/fdisk_pc98/fdisk.8 +++ b/sbin/fdisk_pc98/fdisk.8 @@ -1,6 +1,6 @@ .\" $FreeBSD$ .\" -.Dd April 30, 2007 +.Dd October 5, 2016 .Dt FDISK 8 .Os .Sh NAME @@ -448,6 +448,21 @@ Example: to make slice 1 the active slice: .Xr bsdlabel 8 , .Xr gpart 8 , .Xr newfs 8 +.Sh HISTORY +A version of +.Nm +first appeared in the Mach Operating System. +It was subsequently ported to +.Bx 386 . +.Sh AUTHORS +.An -nosplit +.Nm +for Mach Operating System was written by +.An Robert Baron Aq Mt rvb@cs.cmu.edu . +It was ported to +.Bx 386 +by +.An Julian Elischer Aq Mt julian@tfs.com . .Sh BUGS The default boot code will not necessarily handle all slice types correctly, in particular those introduced since diff --git a/sbin/ffsinfo/ffsinfo.8 b/sbin/ffsinfo/ffsinfo.8 index 0c114bd..4bf5cdd 100644 --- a/sbin/ffsinfo/ffsinfo.8 +++ b/sbin/ffsinfo/ffsinfo.8 @@ -134,9 +134,9 @@ The utility first appeared in .Fx 4.4 . .Sh AUTHORS -.An Christoph Herrmann Aq chm@FreeBSD.org -.An Thomas-Henning von Kamptz Aq tomsoft@FreeBSD.org -.An The GROWFS team Aq growfs@Tomsoft.COM +.An Christoph Herrmann Aq Mt chm@FreeBSD.org +.An Thomas-Henning von Kamptz Aq Mt tomsoft@FreeBSD.org +.An The GROWFS team Aq Mt growfs@Tomsoft.COM .Sh BUGS Snapshots are handled like plain files. 
They should get their own level to provide for independent control of the diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h index 57e0773..27f0889 100644 --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -305,7 +305,7 @@ extern u_int real_dev_bsize; /* actual disk sector size, not overridden */ extern char nflag; /* assume a no response */ extern char yflag; /* assume a yes response */ extern int bkgrdflag; /* use a snapshot to run on an active system */ -extern int bflag; /* location of alternate super block */ +extern ufs2_daddr_t bflag; /* location of alternate super block */ extern int debug; /* output debugging info */ extern int Eflag; /* delete empty data blocks */ extern int Zflag; /* zero empty data blocks */ diff --git a/sbin/fsck_ffs/fsck_ffs.8 b/sbin/fsck_ffs/fsck_ffs.8 index 828df27..f11b5d4 100644 --- a/sbin/fsck_ffs/fsck_ffs.8 +++ b/sbin/fsck_ffs/fsck_ffs.8 @@ -29,7 +29,7 @@ .\" @(#)fsck.8 8.4 (Berkeley) 5/9/95 .\" $FreeBSD$ .\" -.Dd July 30, 2013 +.Dd October 5, 2016 .Dt FSCK_FFS 8 .Os .Sh NAME @@ -268,9 +268,9 @@ do not open the file system for writing. Preen file systems (see above). .It Fl R Instruct fsck_ffs to restart itself if it encounters certain errors that -warrant another run. It will limit itself to a maximum of 10 restarts -in a given run in order to avoid an endless loop with extremely corrupted -filesystems. +warrant another run. +It will limit itself to a maximum of 10 restarts in a given run in order +to avoid an endless loop with extremely corrupted filesystems. .It Fl r Free up excess unused inodes. Decreasing the number of preallocated inodes reduces the @@ -393,3 +393,14 @@ are fully enumerated and explained in Appendix A of .Xr fsdb 8 , .Xr newfs 8 , .Xr reboot 8 +.Sh HISTORY +A +.Nm fsck +utility appeared in +.Bx 4.0 . +It became +.Nm +in +.Fx 5.0 +with the introduction of the filesystem independent wrapper as +.Nm fsck . 
diff --git a/sbin/fsck_ffs/globs.c b/sbin/fsck_ffs/globs.c index e910bc9..8f1e5d8 100644 --- a/sbin/fsck_ffs/globs.c +++ b/sbin/fsck_ffs/globs.c @@ -77,7 +77,7 @@ u_int real_dev_bsize; /* actual disk sector size, not overridden */ char nflag; /* assume a no response */ char yflag; /* assume a yes response */ int bkgrdflag; /* use a snapshot to run on an active system */ -int bflag; /* location of alternate super block */ +ufs2_daddr_t bflag; /* location of alternate super block */ int debug; /* output debugging info */ int Eflag; /* delete empty data blocks */ int Zflag; /* zero empty data blocks */ diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c index c5b3b7a..7e79629 100644 --- a/sbin/fsck_ffs/main.c +++ b/sbin/fsck_ffs/main.c @@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$"); #include <errno.h> #include <fstab.h> #include <grp.h> +#include <inttypes.h> #include <mntopts.h> #include <paths.h> #include <stdint.h> @@ -68,7 +69,7 @@ __FBSDID("$FreeBSD$"); int restarts; static void usage(void) __dead2; -static int argtoi(int flag, const char *req, const char *str, int base); +static intmax_t argtoimax(int flag, const char *req, const char *str, int base); static int checkfilesys(char *filesys); static int chkdoreload(struct statfs *mntp); static struct statfs *getmntpt(const char *); @@ -88,8 +89,8 @@ main(int argc, char *argv[]) switch (ch) { case 'b': skipclean = 0; - bflag = argtoi('b', "number", optarg, 10); - printf("Alternate super block location: %d\n", bflag); + bflag = argtoimax('b', "number", optarg, 10); + printf("Alternate super block location: %jd\n", bflag); break; case 'B': @@ -98,7 +99,8 @@ main(int argc, char *argv[]) case 'c': skipclean = 0; - cvtlevel = argtoi('c', "conversion level", optarg, 10); + cvtlevel = argtoimax('c', "conversion level", optarg, + 10); if (cvtlevel < 3) errx(EEXIT, "cannot do level %d conversion", cvtlevel); @@ -121,7 +123,7 @@ main(int argc, char *argv[]) break; case 'm': - lfmode = argtoi('m', "mode", optarg, 8); + lfmode = 
argtoimax('m', "mode", optarg, 8); if (lfmode &~ 07777) errx(EEXIT, "bad mode to -m: %o", lfmode); printf("** lost+found creation mode %o\n", lfmode); @@ -203,13 +205,13 @@ main(int argc, char *argv[]) exit(ret); } -static int -argtoi(int flag, const char *req, const char *str, int base) +static intmax_t +argtoimax(int flag, const char *req, const char *str, int base) { char *cp; - int ret; + intmax_t ret; - ret = (int)strtol(str, &cp, base); + ret = strtoimax(str, &cp, base); if (cp == str || *cp) errx(EEXIT, "-%c flag requires a %s", flag, req); return (ret); diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c index 85e9548..7bcaf13 100644 --- a/sbin/fsck_ffs/setup.c +++ b/sbin/fsck_ffs/setup.c @@ -198,7 +198,7 @@ setup(char *dev) bflag = 0; return(0); } - pwarn("USING ALTERNATE SUPERBLOCK AT %d\n", bflag); + pwarn("USING ALTERNATE SUPERBLOCK AT %jd\n", bflag); bflag = 0; } if (skipclean && ckclean && sblock.fs_clean) { @@ -332,7 +332,7 @@ readsb(int listerr) } if (sblock.fs_magic != FS_UFS1_MAGIC && sblock.fs_magic != FS_UFS2_MAGIC) { - fprintf(stderr, "%d is not a file system superblock\n", + fprintf(stderr, "%jd is not a file system superblock\n", bflag); return (0); } diff --git a/sbin/fsirand/fsirand.8 b/sbin/fsirand/fsirand.8 index 3156787..f3fbd8d 100644 --- a/sbin/fsirand/fsirand.8 +++ b/sbin/fsirand/fsirand.8 @@ -107,7 +107,7 @@ A version first appeared in .Fx 2.2.5 . .Sh AUTHORS -.An Todd C. Miller Aq Todd.Miller@courtesan.com +.An Todd C. 
Miller Aq Mt Todd.Miller@courtesan.com .Sh CAVEATS Since .Nm diff --git a/sbin/gbde/gbde.8 b/sbin/gbde/gbde.8 index aafc6e0..3780cd8 100644 --- a/sbin/gbde/gbde.8 +++ b/sbin/gbde/gbde.8 @@ -243,7 +243,7 @@ To destroy all copies of the masterkey: This software was developed for the .Fx Project by -.An "Poul-Henning Kamp" +.An Poul-Henning Kamp and NAI Labs, the Security Research Division of Network Associates, Inc.\& under DARPA/SPAWAR contract N66001-01-C-8035 .Pq Dq CBOSS , @@ -253,7 +253,7 @@ DARPA CHATS research program. first appeared in .Fx 5.0 . .Sh AUTHORS -.An "Poul-Henning Kamp" Aq phk@FreeBSD.org +.An Poul-Henning Kamp Aq Mt phk@FreeBSD.org .Sh BUGS The cryptographic algorithms and the overall design have not been attacked mercilessly for over 10 years by a gang of cryptoanalysts. diff --git a/sbin/geom/class/cache/gcache.8 b/sbin/geom/class/cache/gcache.8 index b9f03bd..b0f1c7a 100644 --- a/sbin/geom/class/cache/gcache.8 +++ b/sbin/geom/class/cache/gcache.8 @@ -189,4 +189,4 @@ The utility appeared in .Fx 7.0 . .Sh AUTHORS -.An Ruslan Ermilov Aq ru@FreeBSD.org +.An Ruslan Ermilov Aq Mt ru@FreeBSD.org diff --git a/sbin/geom/class/concat/gconcat.8 b/sbin/geom/class/concat/gconcat.8 index b797a1c..d874b08 100644 --- a/sbin/geom/class/concat/gconcat.8 +++ b/sbin/geom/class/concat/gconcat.8 @@ -194,4 +194,4 @@ The utility appeared in .Fx 5.3 . 
.Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/eli/geli.8 b/sbin/geom/class/eli/geli.8 index 431b615..c435859 100644 --- a/sbin/geom/class/eli/geli.8 +++ b/sbin/geom/class/eli/geli.8 @@ -1057,4 +1057,4 @@ metadata version supported by the given FreeBSD version: .It Li 10.0 Ta 7 .El .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/journal/gjournal.8 b/sbin/geom/class/journal/gjournal.8 index edef902..6eb8cde 100644 --- a/sbin/geom/class/journal/gjournal.8 +++ b/sbin/geom/class/journal/gjournal.8 @@ -343,4 +343,4 @@ The utility appeared in .Fx 7.0 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/label/glabel.8 b/sbin/geom/class/label/glabel.8 index c1c7bc4..8111ca5 100644 --- a/sbin/geom/class/label/glabel.8 +++ b/sbin/geom/class/label/glabel.8 @@ -241,4 +241,4 @@ The utility appeared in .Fx 5.3 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/mirror/gmirror.8 b/sbin/geom/class/mirror/gmirror.8 index bb48eca..5bd9bab 100644 --- a/sbin/geom/class/mirror/gmirror.8 +++ b/sbin/geom/class/mirror/gmirror.8 @@ -378,7 +378,7 @@ The utility appeared in .Fx 5.3 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org .Sh BUGS There should be a way to change a component's priority inside a running mirror. .Pp diff --git a/sbin/geom/class/mountver/gmountver.8 b/sbin/geom/class/mountver/gmountver.8 index 8f271a0..44736d5 100644 --- a/sbin/geom/class/mountver/gmountver.8 +++ b/sbin/geom/class/mountver/gmountver.8 @@ -127,4 +127,4 @@ The utility appeared in .Fx 9.0 . 
.Sh AUTHORS -.An Edward Tomasz Napierala Aq trasz@FreeBSD.org +.An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org diff --git a/sbin/geom/class/multipath/gmultipath.8 b/sbin/geom/class/multipath/gmultipath.8 index fa8cd2a..cdb2e8a 100644 --- a/sbin/geom/class/multipath/gmultipath.8 +++ b/sbin/geom/class/multipath/gmultipath.8 @@ -360,6 +360,6 @@ GEOM_MULTIPATH: da2 added to FRED .Xr mount 8 , .Xr newfs 8 , .Xr sysctl 8 -.Sh AUTHOR -.An Matthew Jacob Aq mjacob@FreeBSD.org -.An Alexander Motin Aq mav@FreeBSD.org +.Sh AUTHORS +.An Matthew Jacob Aq Mt mjacob@FreeBSD.org +.An Alexander Motin Aq Mt mav@FreeBSD.org diff --git a/sbin/geom/class/nop/gnop.8 b/sbin/geom/class/nop/gnop.8 index ae350d0..fc7732d 100644 --- a/sbin/geom/class/nop/gnop.8 +++ b/sbin/geom/class/nop/gnop.8 @@ -183,4 +183,4 @@ The utility appeared in .Fx 5.3 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/part/gpart.8 b/sbin/geom/class/part/gpart.8 index ba579bf..31c6cf7 100644 --- a/sbin/geom/class/part/gpart.8 +++ b/sbin/geom/class/part/gpart.8 @@ -1331,4 +1331,4 @@ The utility appeared in .Fx 7.0 . .Sh AUTHORS -.An Marcel Moolenaar Aq marcel@FreeBSD.org +.An Marcel Moolenaar Aq Mt marcel@FreeBSD.org diff --git a/sbin/geom/class/raid/graid.8 b/sbin/geom/class/raid/graid.8 index 47d7fa2..496e44e 100644 --- a/sbin/geom/class/raid/graid.8 +++ b/sbin/geom/class/raid/graid.8 @@ -320,5 +320,5 @@ The utility appeared in .Fx 9.0 . .Sh AUTHORS -.An Alexander Motin Aq mav@FreeBSD.org -.An M. Warner Losh Aq imp@FreeBSD.org +.An Alexander Motin Aq Mt mav@FreeBSD.org +.An M. Warner Losh Aq Mt imp@FreeBSD.org diff --git a/sbin/geom/class/raid3/graid3.8 b/sbin/geom/class/raid3/graid3.8 index a82d388..426c94d 100644 --- a/sbin/geom/class/raid3/graid3.8 +++ b/sbin/geom/class/raid3/graid3.8 @@ -248,7 +248,7 @@ The utility appeared in .Fx 5.3 . 
.Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org .Sh BUGS There should be a section with an implementation description. .Pp diff --git a/sbin/geom/class/sched/gsched.8 b/sbin/geom/class/sched/gsched.8 index ae04865..facb5c1 100644 --- a/sbin/geom/class/sched/gsched.8 +++ b/sbin/geom/class/sched/gsched.8 @@ -158,5 +158,5 @@ The utility first appeared in .Fx 8.1 . .Sh AUTHORS -.An Fabio Checconi Aq fabio@FreeBSD.org -.An Luigi Rizzo Aq luigi@FreeBSD.org +.An Fabio Checconi Aq Mt fabio@FreeBSD.org +.An Luigi Rizzo Aq Mt luigi@FreeBSD.org diff --git a/sbin/geom/class/shsec/gshsec.8 b/sbin/geom/class/shsec/gshsec.8 index f72c31c..dcfd2b3 100644 --- a/sbin/geom/class/shsec/gshsec.8 +++ b/sbin/geom/class/shsec/gshsec.8 @@ -127,4 +127,4 @@ The utility appeared in .Fx 5.4 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/stripe/gstripe.8 b/sbin/geom/class/stripe/gstripe.8 index 33ef30b..f1f34fe 100644 --- a/sbin/geom/class/stripe/gstripe.8 +++ b/sbin/geom/class/stripe/gstripe.8 @@ -240,4 +240,4 @@ The utility appeared in .Fx 5.3 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/geom/class/virstor/gvirstor.8 b/sbin/geom/class/virstor/gvirstor.8 index 0273cb4..bf04c1b 100644 --- a/sbin/geom/class/virstor/gvirstor.8 +++ b/sbin/geom/class/virstor/gvirstor.8 @@ -293,7 +293,7 @@ and all their structures will be physically allocated at the start of the first virstor component. This could have a significant impact on file system performance .Pq which can in some rare cases be even positive . -.Sh AUTHOR -.An Ivan Voras Aq ivoras@FreeBSD.org +.Sh AUTHORS +.An Ivan Voras Aq Mt ivoras@FreeBSD.org .Pp Sponsored by Google Summer of Code 2006. 
diff --git a/sbin/geom/core/geom.8 b/sbin/geom/core/geom.8 index 6e79880..ab960ff 100644 --- a/sbin/geom/core/geom.8 +++ b/sbin/geom/core/geom.8 @@ -203,4 +203,4 @@ The utility appeared in .Fx 5.3 . .Sh AUTHORS -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org diff --git a/sbin/ggate/ggatec/ggatec.8 b/sbin/ggate/ggatec/ggatec.8 index 12a1c53..8545baf 100644 --- a/sbin/ggate/ggatec/ggatec.8 +++ b/sbin/ggate/ggatec/ggatec.8 @@ -177,4 +177,4 @@ client# mount_cd9660 /dev/ggate0 /cdrom The .Nm utility as well as this manual page was written by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org . +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org . diff --git a/sbin/ggate/ggated/ggated.8 b/sbin/ggate/ggated/ggated.8 index 04b0513..7b44c86 100644 --- a/sbin/ggate/ggated/ggated.8 +++ b/sbin/ggate/ggated/ggated.8 @@ -120,4 +120,4 @@ Export CD-ROM device and a file: The .Nm utility as well as this manual page was written by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org . +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org . diff --git a/sbin/ggate/ggatel/ggatel.8 b/sbin/ggate/ggatel/ggatel.8 index 6bfcedf..37e11cd 100644 --- a/sbin/ggate/ggatel/ggatel.8 +++ b/sbin/ggate/ggatel/ggatel.8 @@ -154,4 +154,4 @@ ggatel destroy -u 5 The .Nm utility as well as this manual page was written by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org . +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org . diff --git a/sbin/growfs/growfs.8 b/sbin/growfs/growfs.8 index d29a4dc..42dc530 100644 --- a/sbin/growfs/growfs.8 +++ b/sbin/growfs/growfs.8 @@ -120,10 +120,10 @@ utility first appeared in The ability to resize mounted file systems was added in .Fx 10.0 . 
.Sh AUTHORS -.An Christoph Herrmann Aq chm@FreeBSD.org -.An Thomas-Henning von Kamptz Aq tomsoft@FreeBSD.org -.An The GROWFS team Aq growfs@Tomsoft.COM -.An Edward Tomasz Napierala Aq trasz@FreeBSD.org +.An Christoph Herrmann Aq Mt chm@FreeBSD.org +.An Thomas-Henning von Kamptz Aq Mt tomsoft@FreeBSD.org +.An The GROWFS team Aq Mt growfs@Tomsoft.COM +.An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org .Sh CAVEATS When expanding a file system mounted read-write, any writes to that file system will be temporarily suspended until the expansion is finished. diff --git a/sbin/gvinum/gvinum.8 b/sbin/gvinum/gvinum.8 index 703a810..a8a448c 100644 --- a/sbin/gvinum/gvinum.8 +++ b/sbin/gvinum/gvinum.8 @@ -395,9 +395,9 @@ and through the 2007 Google Summer of Code program. The documentation have been updated to reflect the new functionality. .Sh AUTHORS -.An Lukas Ertl Aq le@FreeBSD.org -.An Chris Jones Aq soc-cjones@FreeBSD.org -.An Ulf Lilleengen Aq lulf@FreeBSD.org +.An Lukas Ertl Aq Mt le@FreeBSD.org +.An Chris Jones Aq Mt soc-cjones@FreeBSD.org +.An Ulf Lilleengen Aq Mt lulf@FreeBSD.org .Sh BUGS Currently, .Nm diff --git a/sbin/hastctl/hastctl.8 b/sbin/hastctl/hastctl.8 index bdca80d..397d4cf 100644 --- a/sbin/hastctl/hastctl.8 +++ b/sbin/hastctl/hastctl.8 @@ -224,5 +224,5 @@ nodeB# application_start The .Nm was developed by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5 index 3d921e4..c7e2e33 100644 --- a/sbin/hastd/hast.conf.5 +++ b/sbin/hastd/hast.conf.5 @@ -445,5 +445,5 @@ resource tank { The .Nm was written by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org under sponsorship of the FreeBSD Foundation. 
diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8 index 68c98cb..e30a11a 100644 --- a/sbin/hastd/hastd.8 +++ b/sbin/hastd/hastd.8 @@ -228,5 +228,5 @@ nodeA# mount -o noatime /dev/hast/shared /shared The .Nm was developed by -.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +.An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org under sponsorship of the FreeBSD Foundation. diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index 9ebd5d9..ba79c8c 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -3381,7 +3381,7 @@ options have been added by various developer over the years. .Pp .An -nosplit In-kernel NAT support written by -.An Paolo Pisati Aq piso@FreeBSD.org +.An Paolo Pisati Aq Mt piso@FreeBSD.org as part of a Summer of Code 2005 project. .Pp SCTP diff --git a/sbin/kldconfig/kldconfig.8 b/sbin/kldconfig/kldconfig.8 index 18fbaf6..3cc288f 100644 --- a/sbin/kldconfig/kldconfig.8 +++ b/sbin/kldconfig/kldconfig.8 @@ -105,4 +105,4 @@ The utility first appeared in .Fx 4.4 . .Sh AUTHORS -.An Peter Pentchev Aq roam@FreeBSD.org +.An Peter Pentchev Aq Mt roam@FreeBSD.org diff --git a/sbin/kldload/kldload.8 b/sbin/kldload/kldload.8 index ee913ed..b84b863 100644 --- a/sbin/kldload/kldload.8 +++ b/sbin/kldload/kldload.8 @@ -126,4 +126,4 @@ replacing the .Nm lkm interface. .Sh AUTHORS -.An Doug Rabson Aq dfr@FreeBSD.org +.An Doug Rabson Aq Mt dfr@FreeBSD.org diff --git a/sbin/kldstat/kldstat.8 b/sbin/kldstat/kldstat.8 index 63f07a6..266b8e5 100644 --- a/sbin/kldstat/kldstat.8 +++ b/sbin/kldstat/kldstat.8 @@ -78,4 +78,4 @@ replacing the .Nm lkm interface. .Sh AUTHORS -.An Doug Rabson Aq dfr@FreeBSD.org +.An Doug Rabson Aq Mt dfr@FreeBSD.org diff --git a/sbin/kldunload/kldunload.8 b/sbin/kldunload/kldunload.8 index 0a7937c..5e371e6 100644 --- a/sbin/kldunload/kldunload.8 +++ b/sbin/kldunload/kldunload.8 @@ -78,4 +78,4 @@ replacing the .Nm lkm interface. 
.Sh AUTHORS -.An Doug Rabson Aq dfr@FreeBSD.org +.An Doug Rabson Aq Mt dfr@FreeBSD.org diff --git a/sbin/md5/md5.1 b/sbin/md5/md5.1 index 77c40a1..e191cd1 100644 --- a/sbin/md5/md5.1 +++ b/sbin/md5/md5.1 @@ -154,4 +154,4 @@ This program is placed in the public domain for free general use by RSA Data Security. .Pp Support for SHA-1 and RIPEMD-160 has been added by -.An Oliver Eikemeier Aq eik@FreeBSD.org . +.An Oliver Eikemeier Aq Mt eik@FreeBSD.org . diff --git a/sbin/mdconfig/mdconfig.8 b/sbin/mdconfig/mdconfig.8 index d273128..2fd6063 100644 --- a/sbin/mdconfig/mdconfig.8 +++ b/sbin/mdconfig/mdconfig.8 @@ -321,5 +321,4 @@ combo. The .Nm utility was written by -.An Poul-Henning Kamp -.Aq phk@FreeBSD.org . +.An Poul-Henning Kamp Aq Mt phk@FreeBSD.org . diff --git a/sbin/mount/mount.conf.8 b/sbin/mount/mount.conf.8 index c3296c3..45b8257 100644 --- a/sbin/mount/mount.conf.8 +++ b/sbin/mount/mount.conf.8 @@ -247,6 +247,6 @@ The root mount logic in the kernel which parses .Pa /.mount.conf was written by -.An Marcel Moolenaar Aq marcel@FreeBSD.org . +.An Marcel Moolenaar Aq Mt marcel@FreeBSD.org . This man page was written by -.An Craig Rodrigues Aq rodrigc@FreeBSD.org . +.An Craig Rodrigues Aq Mt rodrigc@FreeBSD.org . diff --git a/sbin/mount_cd9660/mount_cd9660.8 b/sbin/mount_cd9660/mount_cd9660.8 index b471686..974ab3b 100644 --- a/sbin/mount_cd9660/mount_cd9660.8 +++ b/sbin/mount_cd9660/mount_cd9660.8 @@ -147,7 +147,7 @@ utility first appeared in .Bx 4.4 . .Pp The Unicode conversion routine was added by -.An Ryuichiro Imura Aq imura@ryu16.org +.An Ryuichiro Imura Aq Mt imura@ryu16.org in 2003. .Sh BUGS POSIX device node mapping is currently not supported. 
diff --git a/sbin/mount_msdosfs/mount_msdosfs.8 b/sbin/mount_msdosfs/mount_msdosfs.8 index e7bc764..3e4b08d 100644 --- a/sbin/mount_msdosfs/mount_msdosfs.8 +++ b/sbin/mount_msdosfs/mount_msdosfs.8 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd October 1, 2013 +.Dd October 3, 2016 .Dt MOUNT_MSDOSFS 8 .Os .Sh NAME @@ -200,21 +200,32 @@ To mount a Japanese MS-DOS file system located in List of Localized MS Operating Systems: .Pa http://www.microsoft.com/globaldev/reference/oslocversion.mspx . .Sh HISTORY -The -.Nm -utility first appeared in +The predecessor to +.Nm mount_msdos +utility named +.Nm mount_pcfs +appeared in +.Bx 386 . +It was rewritten in +.Nx 1.0 +and first appeared in .Fx 2.0 . -Its predecessor, the +.Nm mount_msdos +was renamed to the more aptly-named +.Nm +in +.Fx 5.0 . +The character code conversion routine was added in 2003. +.Sh AUTHORS +Initial implementation as .Nm mount_pcfs -utility appeared in -.Fx 1.0 , -and was abandoned in favor -of the more aptly-named -.Nm . -.Pp +was written by +.An -nosplit +.An Paul Popelka Aq Mt paulp@uts.amdahl.com . +It was rewritten by +.An Christopher G. Demetriou Aq Mt cgd@NetBSD.org . The character code conversion routine was added by -.An Ryuichiro Imura Aq imura@ryu16.org -at 2003. +.An Ryuichiro Imura Aq Mt imura@ryu16.org . .Sh CAVEATS The use of the .Fl 9 diff --git a/sbin/mount_nullfs/mount_nullfs.8 b/sbin/mount_nullfs/mount_nullfs.8 index 177ecdd..da4586a 100644 --- a/sbin/mount_nullfs/mount_nullfs.8 +++ b/sbin/mount_nullfs/mount_nullfs.8 @@ -33,7 +33,7 @@ .\" @(#)mount_null.8 8.6 (Berkeley) 5/1/95 .\" $FreeBSD$ .\" -.Dd May 1, 1995 +.Dd October 3, 2016 .Dt MOUNT_NULLFS 8 .Os .Sh NAME @@ -240,6 +240,10 @@ UCLA Technical Report CSD-910056, .Em "Stackable Layers: an Architecture for File System Development" . .Sh HISTORY The -.Nm +.Nm mount_null utility first appeared in .Bx 4.4 . +It was renamed to +.Nm +in +.Fx 5.0 . 
diff --git a/sbin/mount_unionfs/mount_unionfs.8 b/sbin/mount_unionfs/mount_unionfs.8 index b9290db..faab704 100644 --- a/sbin/mount_unionfs/mount_unionfs.8 +++ b/sbin/mount_unionfs/mount_unionfs.8 @@ -31,7 +31,7 @@ .\" @(#)mount_union.8 8.6 (Berkeley) 3/27/94 .\" $FreeBSD$ .\" -.Dd November 30, 2006 +.Dd October 3, 2016 .Dt MOUNT_UNIONFS 8 .Os .Sh NAME @@ -328,9 +328,13 @@ accessible via .Xr mount_nullfs 8 .Sh HISTORY The -.Nm +.Nm mount_union utility first appeared in .Bx 4.4 . +It was renamed to +.Nm +in +.Fx 5.0 . .Pp The .Fl r @@ -342,9 +346,9 @@ because this is identical to using .An -nosplit In .Fx 7.0 , -.An Masanori OZAWA Aq ozawa@ongs.co.jp +.An Masanori OZAWA Aq Mt ozawa@ongs.co.jp reimplemented handling of locking, whiteout, and file mode bits, and -.An Hiroki Sato Aq hrs@FreeBSD.org +.An Hiroki Sato Aq Mt hrs@FreeBSD.org wrote about the changes in this manual page. .Sh BUGS THIS FILE SYSTEM TYPE IS NOT YET FULLY SUPPORTED (READ: IT DOESN'T WORK) @@ -357,7 +361,7 @@ BATTERIES NOT INCLUDED. .Pp This code also needs an owner in order to be less dangerous - serious hackers can apply by sending mail to -.Aq freebsd-fs@FreeBSD.org +.Aq Mt freebsd-fs@FreeBSD.org and announcing their intent to take it over. .Pp diff --git a/sbin/natd/natd.8 b/sbin/natd/natd.8 index b4b37ed..71f971a 100644 --- a/sbin/natd/natd.8 +++ b/sbin/natd/natd.8 @@ -1,5 +1,5 @@ .\" $FreeBSD$ -.Dd June 23, 2008 +.Dd October 5, 2016 .Dt NATD 8 .Os .Sh NAME @@ -426,7 +426,7 @@ Options can be divided to several sections. Each section applies to own .Nm instance. -This ability allows to configure one +This ability allows the configuration of one .Nm process for several NAT instances. The first instance that always exists is a "default" instance. @@ -809,23 +809,28 @@ are forwarded to the appropriate router on that interface. .Xr init 8 , .Xr ipfw 8 , .Xr ppp 8 +.Sh HISTORY +The +.Nm +utility appeared in +.Fx 3.0 . 
.Sh AUTHORS This program is the result of the efforts of many people at different times: .Pp -.An Archie Cobbs Aq archie@FreeBSD.org +.An Archie Cobbs Aq Mt archie@FreeBSD.org (divert sockets) -.An Charles Mott Aq cm@linktel.net +.An Charles Mott Aq Mt cm@linktel.net (packet aliasing) -.An Eivind Eklund Aq perhaps@yes.no +.An Eivind Eklund Aq Mt perhaps@yes.no (IRC support & misc additions) -.An Ari Suutari Aq suutari@iki.fi +.An Ari Suutari Aq Mt suutari@iki.fi (natd) -.An Dru Nelson Aq dnelson@redwoodsoft.com +.An Dru Nelson Aq Mt dnelson@redwoodsoft.com (early PPTP support) -.An Brian Somers Aq brian@awfulhak.org +.An Brian Somers Aq Mt brian@awfulhak.org (glue) -.An Ruslan Ermilov Aq ru@FreeBSD.org +.An Ruslan Ermilov Aq Mt ru@FreeBSD.org (natd, packet aliasing, glue) -.An Poul-Henning Kamp Aq phk@FreeBSD.org +.An Poul-Henning Kamp Aq Mt phk@FreeBSD.org (multiple instances) diff --git a/sbin/newfs_msdos/newfs_msdos.8 b/sbin/newfs_msdos/newfs_msdos.8 index 8901673..967e151 100644 --- a/sbin/newfs_msdos/newfs_msdos.8 +++ b/sbin/newfs_msdos/newfs_msdos.8 @@ -236,4 +236,4 @@ The utility first appeared in .Fx 3.0 . .Sh AUTHORS -.An Robert Nordier Aq rnordier@FreeBSD.org . +.An Robert Nordier Aq Mt rnordier@FreeBSD.org diff --git a/sbin/newfs_nandfs/newfs_nandfs.8 b/sbin/newfs_nandfs/newfs_nandfs.8 index b357547..fe32163 100644 --- a/sbin/newfs_nandfs/newfs_nandfs.8 +++ b/sbin/newfs_nandfs/newfs_nandfs.8 @@ -70,5 +70,5 @@ The .Nm utility first appeared in .Fx 10.0 . -.Sh AUTHOR +.Sh AUTHORS .An Grzegorz Bernacki diff --git a/sbin/nos-tun/nos-tun.8 b/sbin/nos-tun/nos-tun.8 index 405d430..ca235d7 100644 --- a/sbin/nos-tun/nos-tun.8 +++ b/sbin/nos-tun/nos-tun.8 @@ -8,7 +8,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 11, 1998 +.Dd October 5, 2016 .Dt NOS-TUN 8 .Os .Sh NAME @@ -80,13 +80,18 @@ tunnel mode nos tunnel destination 192.168.59.34 tunnel source 192.168.56.45 .Ed +.Sh HISTORY +The +.Nm +utility appeared in +.Fx 3.0 . .Sh AUTHORS .An -nosplit -.An Nickolay N. 
Dudorov Aq nnd@itfs.nsk.su +.An Nickolay N. Dudorov Aq Mt nnd@itfs.nsk.su wrote the program, -.An Poul-Henning Kamp Aq phk@FreeBSD.org +.An Poul-Henning Kamp Aq Mt phk@FreeBSD.org wrote the man-page. -.An Isao SEKI Aq iseki@gongon.com +.An Isao SEKI Aq Mt iseki@gongon.com added a new flag, IP protocol number. .Sh BUGS We do not allow for setting our source address for multihomed machines. diff --git a/sbin/nvmecontrol/nvmecontrol.8 b/sbin/nvmecontrol/nvmecontrol.8 index 010f106..3b4b5c2 100644 --- a/sbin/nvmecontrol/nvmecontrol.8 +++ b/sbin/nvmecontrol/nvmecontrol.8 @@ -124,7 +124,7 @@ nvme0 controller and activate it on the next reset. .An -nosplit .Nm was developed by Intel and originally written by -.An Jim Harris Aq jimharris@FreeBSD.org . +.An Jim Harris Aq Mt jimharris@FreeBSD.org . .Pp This man page was written by -.An Jim Harris Aq jimharris@FreeBSD.org . +.An Jim Harris Aq Mt jimharris@FreeBSD.org . diff --git a/sbin/pfctl/pfctl.8 b/sbin/pfctl/pfctl.8 index 5c0e7b3..92da91e 100644 --- a/sbin/pfctl/pfctl.8 +++ b/sbin/pfctl/pfctl.8 @@ -26,7 +26,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 21, 2011 +.Dd October 3, 2016 .Dt PFCTL 8 .Os .Sh NAME @@ -391,7 +391,7 @@ Note that the optimization done automatically by the kernel will skip evaluation of rules where possible. Packets passed statefully are counted in the rule that created the state -(even though the rule isn't evaluated more than once for the entire +(even though the rule is not evaluated more than once for the entire connection). .It Fl s Cm Anchors Show the currently loaded anchors directly attached to the main ruleset. @@ -594,7 +594,7 @@ counters are incremented instead of the .Dq Pass counters when a .Dq stateful -packet is passed but doesn't match the table anymore. +packet is passed but does not match the table anymore. This will happen in our example if someone flushes the table while the .Xr ping 8 command is running. 
@@ -610,7 +610,7 @@ The flags are defined as follows: For constant tables, which cannot be altered outside .Xr pf.conf 5 . .It p -For persistent tables, which don't get automatically killed when no rules +For persistent tables, which do not get automatically killed when no rules refer to them. .It a For tables which are part of the @@ -649,7 +649,7 @@ Set the debug .Pp .Bl -tag -width xxxxxxxxxxxx -compact .It Fl x Cm none -Don't generate debug messages. +Do not generate debug messages. .It Fl x Cm urgent Generate debug messages only for serious errors. .It Fl x Cm misc @@ -683,5 +683,9 @@ The .Nm program and the .Xr pf 4 -filter mechanism first appeared in +filter mechanism appeared in .Ox 3.0 . +They first appeared in +.Fx 5.3 +ported from the version in +.Ox 3.5 . diff --git a/sbin/rcorder/rcorder.8 b/sbin/rcorder/rcorder.8 index b43383a..995ef68 100644 --- a/sbin/rcorder/rcorder.8 +++ b/sbin/rcorder/rcorder.8 @@ -165,9 +165,9 @@ utility first appeared in .Sh AUTHORS .An -nosplit Written by -.An Perry E. Metzger Aq perry@piermont.com +.An Perry E. Metzger Aq Mt perry@piermont.com and -.An Matthew R. Green Aq mrg@eterna.com.au . +.An Matthew R. Green Aq Mt mrg@eterna.com.au . .Sh BUGS The .Dq Li REQUIRE diff --git a/sbin/reboot/nextboot.8 b/sbin/reboot/nextboot.8 index 0d72398..14573e2 100644 --- a/sbin/reboot/nextboot.8 +++ b/sbin/reboot/nextboot.8 @@ -118,7 +118,7 @@ appeared in .Fx 5.0 . .Sh AUTHORS This manual page was written by -.An Gordon Tetlow Aq gordon@FreeBSD.org . +.An Gordon Tetlow Aq Mt gordon@FreeBSD.org . 
.Sh BUGS The .Nm diff --git a/sbin/recoverdisk/recoverdisk.1 b/sbin/recoverdisk/recoverdisk.1 index b3924c5..fd42f65 100644 --- a/sbin/recoverdisk/recoverdisk.1 +++ b/sbin/recoverdisk/recoverdisk.1 @@ -137,9 +137,9 @@ utility first appeared in .Sh AUTHORS .An -nosplit The original implementation was done by -.An Poul-Henning Kamp Aq phk@FreeBSD.org +.An Poul-Henning Kamp Aq Mt phk@FreeBSD.org with minor improvements from -.An Ulrich Sp\(:orlein Aq uqs@FreeBSD.org . +.An Ulrich Sp\(:orlein Aq Mt uqs@FreeBSD.org . .Pp This manual page was written by .An Ulrich Sp\(:orlein . diff --git a/sbin/sconfig/sconfig.8 b/sbin/sconfig/sconfig.8 index 3fe7abc..713f76d 100644 --- a/sbin/sconfig/sconfig.8 +++ b/sbin/sconfig/sconfig.8 @@ -11,7 +11,7 @@ .\" works or modified versions. .\" .\" $FreeBSD$ -.Dd May 19, 2004 +.Dd October 3, 2016 .Dt SCONFIG 8 i386 .Os .Sh NAME @@ -575,7 +575,13 @@ Test error (G.703 only). .Xr spppcontrol 8 .\"-------------------------------------------------------------- .Sh HISTORY -This utility is a replacement for the +The +.Nm +utility appeared in +.Fx 5.2 . +The +.Nm +utility is a replacement for the .Nm cxconfig and .Nm ctconfig @@ -595,7 +601,7 @@ versions of the utility are not fully compatible. .\"-------------------------------------------------------------- .Sh AUTHORS -.An Cronyx Engineering Aq info@cronyx.ru +.An Cronyx Engineering Aq Mt info@cronyx.ru .Pp .Pa http://www.cronyx.ru .\"-------------------------------------------------------------- diff --git a/share/examples/mdoc/example.1 b/share/examples/mdoc/example.1 index a578502..dde029e 100644 --- a/share/examples/mdoc/example.1 +++ b/share/examples/mdoc/example.1 @@ -150,6 +150,6 @@ manual page example first appeared in .Sh AUTHORS This manual page was written by -.An Mike Pritchard Aq mpp@FreeBSD.org . +.An Mike Pritchard Aq Mt mpp@FreeBSD.org . .Sh BUGS The actual code for this command is vaporware. 
diff --git a/share/examples/mdoc/example.3 b/share/examples/mdoc/example.3 index bbdd068..fc3b0fd 100644 --- a/share/examples/mdoc/example.3 +++ b/share/examples/mdoc/example.3 @@ -330,6 +330,6 @@ manual page example first appeared in .Sh AUTHORS This manual page was written by -.An Mike Pritchard Aq mpp@FreeBSD.org . +.An Mike Pritchard Aq Mt mpp@FreeBSD.org . .Sh BUGS The actual code for this function is vaporware. diff --git a/share/examples/mdoc/example.4 b/share/examples/mdoc/example.4 index 7185715..946dd3a 100644 --- a/share/examples/mdoc/example.4 +++ b/share/examples/mdoc/example.4 @@ -118,6 +118,6 @@ manual page example first appeared in .At v6 . .Sh AUTHORS This manual page was written by -.An Mike Pritchard Aq mpp@FreeBSD.org . +.An Mike Pritchard Aq Mt mpp@FreeBSD.org . .Sh BUGS The actual code for this device driver is vaporware. diff --git a/share/examples/mdoc/example.9 b/share/examples/mdoc/example.9 index 0fb5a10..97b57ff 100644 --- a/share/examples/mdoc/example.9 +++ b/share/examples/mdoc/example.9 @@ -338,6 +338,6 @@ manual page example first appeared in .Sh AUTHORS This manual page was written by -.An Giorgos Keramidas Aq keramida@FreeBSD.org . +.An Giorgos Keramidas Aq Mt keramida@FreeBSD.org . .Sh BUGS The actual code for this function is vaporware. diff --git a/share/examples/scsi_target/scsi_target.8 b/share/examples/scsi_target/scsi_target.8 index 677e2b4..4cb6442 100644 --- a/share/examples/scsi_target/scsi_target.8 +++ b/share/examples/scsi_target/scsi_target.8 @@ -155,4 +155,4 @@ and was written by It was rewritten for .Fx 5.0 by -.An Nate Lawson Aq nate@root.org . +.An Nate Lawson Aq Mt nate@root.org . 
diff --git a/share/man/man3/queue.3 b/share/man/man3/queue.3 index 2776c34..9908acf 100644 --- a/share/man/man3/queue.3 +++ b/share/man/man3/queue.3 @@ -32,12 +32,13 @@ .\" @(#)queue.3 8.2 (Berkeley) 1/24/94 .\" $FreeBSD$ .\" -.Dd June 24, 2015 +.Dd August 15, 2016 .Dt QUEUE 3 .Os .Sh NAME .Nm SLIST_CLASS_ENTRY , .Nm SLIST_CLASS_HEAD , +.Nm SLIST_CONCAT , .Nm SLIST_EMPTY , .Nm SLIST_ENTRY , .Nm SLIST_FIRST , @@ -79,6 +80,7 @@ .Nm STAILQ_SWAP , .Nm LIST_CLASS_ENTRY , .Nm LIST_CLASS_HEAD , +.Nm LIST_CONCAT , .Nm LIST_EMPTY , .Nm LIST_ENTRY , .Nm LIST_FIRST , @@ -129,6 +131,7 @@ lists and tail queues .\" .Fn SLIST_CLASS_ENTRY "CLASSTYPE" .Fn SLIST_CLASS_HEAD "HEADNAME" "CLASSTYPE" +.Fn SLIST_CONCAT "SLIST_HEAD *head1" "SLIST_HEAD *head2" "TYPE" "SLIST_ENTRY NAME" .Fn SLIST_EMPTY "SLIST_HEAD *head" .Fn SLIST_ENTRY "TYPE" .Fn SLIST_FIRST "SLIST_HEAD *head" @@ -172,6 +175,7 @@ lists and tail queues .\" .Fn LIST_CLASS_ENTRY "CLASSTYPE" .Fn LIST_CLASS_HEAD "HEADNAME" "CLASSTYPE" +.Fn LIST_CONCAT "LIST_HEAD *head1" "LIST_HEAD *head2" "TYPE" "LIST_ENTRY NAME" .Fn LIST_EMPTY "LIST_HEAD *head" .Fn LIST_ENTRY "TYPE" .Fn LIST_FIRST "LIST_HEAD *head" @@ -253,6 +257,8 @@ Singly-linked lists add the following functionality: .Bl -enum -compact -offset indent .It O(n) removal of any entry in the list. +.It +O(n) concatenation of two lists. .El .Pp Singly-linked tail queues add the following functionality: @@ -300,6 +306,8 @@ Linked lists are the simplest of the doubly linked data structures. They add the following functionality over the above: .Bl -enum -compact -offset indent .It +O(n) concatenation of two lists. +.It They may be traversed backwards. .El However: @@ -405,6 +413,19 @@ evaluates to an initializer for the list .Fa head . .Pp The macro +.Nm SLIST_CONCAT +concatenates the list headed by +.Fa head2 +onto the end of the one headed by +.Fa head1 +removing all entries from the former. +Use of this macro should be avoided as it traverses the entirety of the +.Fa head1 +list. 
+A singly-linked tail queue should be used if this macro is needed in +high-usage code paths or to operate on long lists. +.Pp +The macro .Nm SLIST_EMPTY evaluates to true if there are no elements in the list. .Pp @@ -512,6 +533,9 @@ The macro removes the element .Fa elm from the list. +Use of this macro should be avoided as it traverses the entire list. +A doubly-linked list should be used if this macro is needed in +high-usage code paths or to operate on long lists. .Pp The macro .Nm SLIST_SWAP @@ -728,6 +752,9 @@ The macro removes the element .Fa elm from the tail queue. +Use of this macro should be avoided as it traverses the entire list. +A doubly-linked tail queue should be used if this macro is needed in +high-usage code paths or to operate on long tail queues. .Pp The macro .Nm STAILQ_SWAP @@ -827,6 +854,19 @@ evaluates to an initializer for the list .Fa head . .Pp The macro +.Nm LIST_CONCAT +concatenates the list headed by +.Fa head2 +onto the end of the one headed by +.Fa head1 +removing all entries from the former. +Use of this macro should be avoided as it traverses the entirety of the +.Fa head1 +list. +A tail queue should be used if this macro is needed in +high-usage code paths or to operate on long lists. +.Pp +The macro .Nm LIST_EMPTY evaluates to true if there are no elements in the list. .Pp diff --git a/share/misc/pci_vendors b/share/misc/pci_vendors index a4a8b76..2bc34d3 100644 --- a/share/misc/pci_vendors +++ b/share/misc/pci_vendors @@ -3,11 +3,11 @@ # # List of PCI ID's # -# Version: 2015.07.31 -# Date: 2015-07-31 03:15:02 +# Version: 2016.10.03 +# Date: 2016-10-03 03:15:01 # -# Maintained by Martin Mares <mj@ucw.cz> and other volunteers from the -# PCI ID Project at http://pci-ids.ucw.cz/. +# Maintained by Albert Pool, Martin Mares, and other volunteers from +# the PCI ID Project at http://pci-ids.ucw.cz/. # # New data are always welcome, especially if they are accurate. 
If you have # anything to contribute, please follow the instructions at the web site. @@ -23,6 +23,7 @@ # device device_name <-- single tab # subvendor subdevice subsystem_name <-- two tabs +0001 SafeNet (wrong ID) 0010 Allied Telesis, Inc (Wrong ID) # This is a relabelled RTL-8139 8139 AT-2500TX V3 Ethernet @@ -243,6 +244,19 @@ 1000 1000 LSI53C895A PCI to Ultra2 SCSI Controller 0013 53c875a 1000 1000 LSI53C875A PCI to Ultra SCSI Controller + 0014 MegaRAID Tri-Mode SAS3516 + 1d49 0602 ThinkSystem RAID 930-16i 4GB Flash PCIe 12Gb Adapter + 0016 MegaRAID Tri-Mode SAS3508 + 1d49 0601 ThinkSystem RAID 930-8i 2GB Flash PCIe 12Gb Adapter + 1d49 0603 ThinkSystem RAID 930-24i 4GB Flash PCIe 12Gb Adapter + 1d49 0604 ThinkSystem RAID 930-8e 4GB Flash PCIe 12Gb Adapter + 0017 MegaRAID Tri-Mode SAS3408 + 1d49 0500 ThinkSystem RAID 530-8i PCIe 12Gb Adapter + 1d49 0502 ThinkSystem RAID 530-8i Dense Adapter + 001b MegaRAID Tri-Mode SAS3504 + 1d49 0605 ThinkSystem RAID 930-4i 2GB Flash Flex Adapter + 001c MegaRAID Tri-Mode SAS3404 + 1d49 0501 ThinkSystem RAID 530-4i Flex Adapter 0020 53c1010 Ultra3 SCSI Adapter 1000 1000 LSI53C1010-33 PCI to Dual Channel Ultra160 SCSI Controller 107b 1040 Server Onboard 53C1010-33 @@ -272,6 +286,7 @@ 103c 12c5 Ultra320 SCSI [A7173A] 103c 1323 Core I/O LAN/SCSI Combo [AB314A] 103c 3108 Single Channel Ultra320 SCSI HBA G2 + 103c 322a SC11Xe Ultra320 Single Channel PCIe x4 SCSI Host Bus Adapter (412911-B21) 124b 1170 PMC-USCSI320 # VMware's emulation of this device. Was missing from the list. 
15ad 1976 LSI Logic Parallel SCSI Controller @@ -287,6 +302,10 @@ 0050 SAS1064 PCI-X Fusion-MPT SAS 1028 1f04 SAS 5/E 1028 1f09 SAS 5i/R + 0052 MegaRAID SAS-3 3216/3224 [Cutlass] + 0053 MegaRAID SAS-3 3216/3224 [Cutlass] + 1000 9350 MegaRAID SAS 9341-16i + 1000 9351 MegaRAID SAS 9341-24i 0054 SAS1068 PCI-X Fusion-MPT SAS 1028 1f04 SAS 5/E Adapter Controller 1028 1f05 SAS 5/i Adapter Controller @@ -345,6 +364,8 @@ 005c SAS1064A PCI-X Fusion-MPT SAS 005d MegaRAID SAS-3 3108 [Invader] 1000 9361 MegaRAID SAS 9361-8i + 1000 9364 MegaRAID SAS 9364-8i + 1000 936a MegaRAID SAS 9364-8i 1028 1f41 PERC H830 Adapter 1028 1f42 PERC H730P Adapter 1028 1f43 PERC H730 Adapter @@ -357,6 +378,7 @@ 1028 1f54 PERC FD33xD 17aa 1052 ThinkServer RAID 720i 17aa 1053 ThinkServer RAID 720ix + 1d49 0600 ThinkSystem RAID 730-8i 1GB Cache PCIe 12Gb Adapter 005e SAS1066 PCI-X Fusion-MPT SAS 005f MegaRAID SAS-3 3008 [Fury] 1028 1f44 PERC H330 Adapter @@ -364,6 +386,7 @@ 1028 1f4c PERC H330 Mini (for blades) 1028 1f4d PERC H330 Embedded (for monolithic) 1054 306a SAS 3004 iMR ROMB + 1d49 04db ServeRAID M1210 SAS/SATA Controller 0060 MegaRAID SAS 1078 1000 1006 MegaRAID SAS 8888ELP 1000 100a MegaRAID SAS 8708ELP @@ -474,6 +497,21 @@ 007c MegaRAID SAS 1078DE 1014 0395 ServeRAID-AR10is SAS/SATA Controller 007e SSS6200 PCI-Express Flash SSD + 1000 0504 Nytro NWD-BLP4-800 + 1000 0507 Nytro NWD-BLP4-1600 + 1000 0581 Nytro NWD-BLP4-400 + 1000 100d Nytro NWD-BFH6-1200 + 1000 100e Nytro NWD-BFH8-1600 + 1000 107e Nytro NWD-BFH8-3200 + 1000 1310 Nytro XP6302-8B1536 + 1000 1311 Nytro XP6302-8B2048 + 1000 1314 Nytro XP6302-8B4096 + 1000 150c Nytro XP6210-4A2048 + 1000 150f Nytro XP6210-4B2048 + 1000 160b Nytro XP6209-4A1024 + 1000 1613 Nytro XP6209-4B2048 + 108e 050a Nytro ELP4x200_4d_n + 108e 0581 Nytro ELP4x100_4d_n 0080 SAS2208 PCI-Express Fusion-MPT SAS-2 0081 SAS2208 PCI-Express Fusion-MPT SAS-2 0082 SAS2208 PCI-Express Fusion-MPT SAS-2 @@ -482,6 +520,9 @@ 0085 SAS2208 PCI-Express Fusion-MPT SAS-2 0086 
SAS2308 PCI-Express Fusion-MPT SAS-2 0087 SAS2308 PCI-Express Fusion-MPT SAS-2 + 1000 3020 9207-8i SAS2.1 HBA + 1000 3040 9207-8e SAS2.1 HBA + 1000 3050 SAS9217-8i 1590 0044 H220i 008f 53c875J 1092 8000 FirePort 40 SCSI Controller @@ -492,8 +533,38 @@ 0095 SAS3108 PCI-Express Fusion-MPT SAS-3 0096 SAS3004 PCI-Express Fusion-MPT SAS-3 0097 SAS3008 PCI-Express Fusion-MPT SAS-3 + 1000 3090 SAS9311-8i + 1000 30e0 SAS9300-8i 1028 1f45 12GB/s HBA internal 1028 1f46 12Gbps HBA + 00ab SAS3516 Fusion-MPT Tri-Mode RAID On Chip (ROC) + 00ac SAS3416 Fusion-MPT Tri-Mode I/O Controller Chip (IOC) + 1d49 0201 ThinkSystem 9400-16i PCIe 12Gb HBA + 1d49 0203 ThinkSystem 9400-16e PCIe 12Gb HBA + 00ae SAS3508 Fusion-MPT Tri-Mode RAID On Chip (ROC) + 00af SAS3408 Fusion-MPT Tri-Mode I/O Controller Chip (IOC) + 1d49 0200 ThinkSystem 9400-8i PCIe 12Gb HBA + 1d49 0202 ThinkSystem 9400-8e PCIe 12Gb HBA + 00be SAS3504 Fusion-MPT Tri-Mode RAID On Chip (ROC) + 00bf SAS3404 Fusion-MPT Tri-Mode I/O Controller Chip (IOC) + 00c0 SAS3324 PCI-Express Fusion-MPT SAS-3 + 00c1 SAS3324 PCI-Express Fusion-MPT SAS-3 + 00c2 SAS3324 PCI-Express Fusion-MPT SAS-3 + 00c3 SAS3324 PCI-Express Fusion-MPT SAS-3 + 00c4 SAS3224 PCI-Express Fusion-MPT SAS-3 + 00c5 SAS3316 PCI-Express Fusion-MPT SAS-3 + 00c6 SAS3316 PCI-Express Fusion-MPT SAS-3 + 00c7 SAS3316 PCI-Express Fusion-MPT SAS-3 + 00c8 SAS3316 PCI-Express Fusion-MPT SAS-3 + 00c9 SAS3216 PCI-Express Fusion-MPT SAS-3 + 00ce MegaRAID SAS-3 3316 [Intruder] + 1000 9371 MegaRAID SAS 9361-16i + 1000 9390 MegaRAID SAS 9380-8i8e + 00cf MegaRAID SAS-3 3324 [Intruder] + 1000 9370 MegaRAID SAS 9361-24i + 00d0 SAS3716 Fusion-MPT Tri-Mode RAID Controller Chip (ROC) + 00d1 SAS3616 Fusion-MPT Tri-Mode I/O Controller Chip (IOC) + 00d3 MegaRAID Tri-Mode SAS3716W 0407 MegaRAID 1000 0530 MegaRAID 530 SCSI 320-0X RAID Controller 1000 0531 MegaRAID 531 SCSI 320-4X RAID Controller @@ -1260,7 +1331,7 @@ 5652 RV410/M26 [Mobility Radeon X700] 5653 RV410/M26 [Mobility Radeon X700] 
1025 0080 Aspire 5024WLMi - 103c 0940 HP Compaq NW8240 Mobile Workstation + 103c 0940 Compaq NW8240 Mobile Workstation 5654 264VT [Mach64 VT] 1002 5654 Mach64VT Reference 5655 264VT3 [Mach64 VT3] @@ -1350,23 +1421,23 @@ 5a11 RD890 Northbridge only single slot PCI-e GFX Hydra part 5a12 RD890 Northbridge only dual slot (2x8) PCI-e GFX Hydra part 15d9 a811 H8DGU - 5a13 RD890 PCI to PCI bridge (external gfx0 port A) - 5a14 RD890 PCI to PCI bridge (external gfx0 port B) + 5a13 RD890S/SR5650 Host Bridge + 5a14 RD9x0/RX980 Host Bridge 5a15 RD890 PCI to PCI bridge (PCI express gpp port A) - 5a16 RD890 PCI to PCI bridge (PCI express gpp port B) - 5a17 RD890 PCI to PCI bridge (PCI express gpp port C) - 5a18 RD890 PCI to PCI bridge (PCI express gpp port D) + 5a16 RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GFX port 0) + 5a17 RD890/RD9x0 PCI to PCI bridge (PCI Express GFX port 1) + 5a18 RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 0) 15d9 a811 H8DGU - 5a19 RD890 PCI to PCI bridge (PCI express gpp port E) - 5a1a RD890 PCI to PCI bridge (PCI express gpp port F) - 5a1b RD890 PCI to PCI bridge (PCI express gpp port G) - 5a1c RD890 PCI to PCI bridge (PCI express gpp port H) - 5a1d RD890 PCI to PCI bridge (external gfx1 port A) - 5a1e RD890 PCI to PCI bridge (external gfx1 port B) - 5a1f RD890 PCI to PCI bridge (NB-SB link) + 5a19 RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 1) + 5a1a RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 2) + 5a1b RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 3) + 5a1c RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 4) + 5a1d RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP Port 5) + 5a1e RD890/RD9x0/RX980 PCI to PCI bridge (PCI Express GPP2 Port 0) + 5a1f RD890/RD990 PCI to PCI bridge (PCI Express GFX2 port 0) 15d9 a811 H8DGU - 5a20 RD890S PCI Express bridge for GPP2 port 1 - 5a23 RD990 I/O Memory Management Unit (IOMMU) + 5a20 RD890/RD990 PCI to PCI bridge (PCI Express GFX2 port 1) + 
5a23 RD890S/RD990 I/O Memory Management Unit (IOMMU) 5a31 RC410 Host Bridge 5a33 RS400 Host Bridge 5a34 RS4xx PCI Express Port [ext gfx] @@ -1486,6 +1557,9 @@ 6646 Bonaire XT [Radeon R9 M280X] 6647 Bonaire PRO [Radeon R9 M270X] 6649 Bonaire [FirePro W5100] + 1002 0b0c FirePro W4300 + 103c 0b0c Bonaire [FirePro W4300] + 103c 230c FirePro W5100 6650 Bonaire 6651 Bonaire 6658 Bonaire XTX [Radeon R7 260X/360] @@ -1515,10 +1589,17 @@ 1462 3271 Radeon R9 360 OEM 1682 7360 Radeon R7 360 6660 Sun XT [Radeon HD 8670A/8670M/8690M / R5 M330] + 1028 05ea Radeon HD 8670M + 1028 06bf Radeon R5 M335 + 103c 1970 Radeon HD 8670M + 103c 80be Radeon R5 M330 + 103c 8136 Radeon R5 M330 + 17aa 3804 Radeon R5 M330 17aa 3809 Radeon R5 M330 17aa 390c Radeon R5 M330 6663 Sun PRO [Radeon HD 8570A/8570M] 1025 0846 Radeon HD 8570A + 17aa 3805 Radeon HD 8570M 6664 Jet XT [Radeon R5 M240] 6665 Jet PRO [Radeon R5 M230] 17aa 368f Radeon R5 A230 @@ -2053,9 +2134,11 @@ 1462 8036 Radeon HD 8990 OEM 148c 8990 Radeon HD 8990 OEM 679e Tahiti LE [Radeon HD 7870 XT] + 1787 2328 Radeon HD 7870 Black Edition 2 GB GDDR5 [2GBD5-2DHV3E] 679f Tahiti 67a0 Hawaii XT GL [FirePro W9100] 1002 0335 FirePro S9150 + 1002 0735 FirePro S9170 1028 031f FirePro W9100 1028 0335 FirePro S9150 67a1 Hawaii PRO GL [FirePro W8100] @@ -2065,29 +2148,69 @@ 67a8 Hawaii 67a9 Hawaii 67aa Hawaii - 67b0 Hawaii XT [Radeon R9 290X] + 67b0 Hawaii XT / Grenada XT [Radeon R9 290X/390X] + 1028 0b00 Grenada XT [Radeon R9 390X] + 103c 6566 Radeon R9 390X 1043 046a R9 290X DirectCU II 1043 046c R9 290X DirectCU II OC 1043 0474 Matrix R9 290X Platinum 1043 0476 ARES III + 1043 04d7 Radeon R9 390X + 1043 04db Radeon R9 390X + 1043 04df Radeon R9 390X + 1043 04e9 Radeon R9 390X 1458 227c R9 290X WindForce 3X OC 1458 2281 R9 290X WindForce 3X OC 1458 228c R9 290X WindForce 3X 1458 228d R9 290X WindForce 3X OC 1458 2290 R9 290X WindForce 3X + 1458 22bc Radeon R9 390X + 1458 22c1 Grenada PRO [Radeon R9 390] + 1462 2015 Radeon R9 390X 1462 3070 R9 
290X Lightning 1462 3071 R9 290X Lightning 1462 3072 R9 290X Lightning LE 1462 3080 R9 290X Gaming 1462 3082 R9 290X Gaming OC 148c 2347 Devil 13 Dual Core R9 290X + 148c 2357 Grenada XT [Radeon R9 390X] 1682 9290 Double Dissipation R9 290X + 1682 9395 Grenada XT [Radeon R9 390X] + 174b 0e34 Radeon R9 390X 174b e282 Vapor-X R9 290X Tri-X OC 174b e285 R9 290X Tri-X OC + 174b e324 Grenada XT2 [Radeon R9 390X] 1787 2020 R9 290X IceQ X² Turbo - 67b1 Hawaii PRO [Radeon R9 290] + 1787 2357 Grenada XT [Radeon R9 390X] + 67b1 Hawaii PRO [Radeon R9 290/390] + 1043 04dd STRIX R9 390 + 148c 2358 Radeon R9 390 + 174b e324 Sapphire Nitro R9 390 67b9 Vesuvius [Radeon R9 295X2] 67be Hawaii LE + 67c0 Ellesmere [Polaris10] + 67df Ellesmere [Radeon RX 470/480] + 1002 0b37 Radeon RX 480 + 1043 04a8 Radeon RX 480 + 1043 04b0 Radeon RX 470 + 1043 04fb Radeon RX 480 + 1462 3411 Radeon RX 470 + 1462 3413 Radeon RX 480 + 148c 2372 Radeon RX 480 + 148c 2373 Radeon RX 470 + 1682 9470 Radeon RX 470 + 1682 9480 Radeon RX 480 + 174b e347 Radeon RX 470/480 + 174b e349 Radeon RX 470 + 1787 a470 Radeon RX 470 + 1787 a480 Radeon RX 480 + 67e0 Baffin [Polaris11] + 67e1 Baffin [Polaris11] + 67e8 Baffin [Polaris11] + 67e9 Baffin [Polaris11] + 67eb Baffin [Polaris11] + 67ef Baffin [Radeon RX 460] + 67ff Baffin [Polaris11] 6800 Wimbledon XT [Radeon HD 7970M] 1002 0124 Radeon HD 7970M 8086 2110 Radeon HD 7970M @@ -2103,7 +2226,7 @@ 1002 0310 FirePro S7000 1002 0420 Radeon Sky 500 6809 Pitcairn LE GL [FirePro W5000] - 6810 Curacao XT [Radeon R7 370 / R9 270X/370 OEM] + 6810 Curacao XT / Trinidad XT [Radeon R7 370 / R9 270X/370X] 148c 0908 Radeon R9 370 OEM 1682 7370 Radeon R7 370 6811 Curacao PRO [Radeon R7 370 / R9 270/370 OEM] @@ -2743,20 +2866,49 @@ 174b e180 Radeon HD 7350 17af 3015 Radeon HD 7350 68fe Cedar LE - 6900 Topaz XT [Radeon R7 M260/M265] - 1028 0640 Radeon R7 M265 + 6900 Topaz XT [Radeon R7 M260/M265 / M340/M360] + 1025 1056 Radeon R7 M360 / R8 M365DX + 1028 0640 Radeon R7 M260/M265 + 1028 
0643 Radeon R7 M260/M265 + 1028 067f Radeon R7 M260 + 1028 130a Radeon R7 M260 + 103c 2263 Radeon R7 M260 103c 2269 Radeon R7 M260 + 103c 22c6 Radeon R7 M260 103c 22c8 Radeon R7 M260 + 103c 808c Radeon R7 M260 + 103c 8099 Radeon R7 M360 + 103c 80b5 Radeon R7 M360 + 103c 80b9 Radeon R7 M360 + 103c 811c Radeon R7 M340 + 10cf 1906 Radeon R7 M260 + 1170 9979 Radeon R7 M360 1179 f903 Radeon R7 M260 + 1179 f922 Radeon R7 M260 + 1179 f923 Radeon R7 M260 1179 f934 Radeon R7 M260 + 17aa 3822 Radeon R7 M360 + 17aa 3824 Radeon R7 M360 + 17aa 5021 Radeon R7 M260 6901 Topaz PRO [Radeon R5 M255] + 103c 1318 Radeon R6 M255DX 6921 Amethyst XT [Radeon R9 M295X] - 6929 Tonga PRO GL [FirePro Series] + 6929 Tonga XT GL [FirePro S7150] 692b Tonga PRO GL [FirePro W7100] - 692f Tonga XT GL [FirePro W8100] - 6938 Amethyst XT [Radeon R9 M295X Mac Edition] + 692f Tonga XTV GL [FirePro S7150V] + 6938 Tonga XT / Amethyst XT [Radeon R9 380X / R9 M295X] + 1043 04f5 Radeon R9 380X + 1043 04f7 Radeon R9 380X + 106b 013a Radeon R9 M295X Mac Edition + 1458 22c8 Radeon R9 380X + 148c 2350 Radeon R9 380X + 1682 9385 Radeon R9 380X + 174b e308 Radeon R9 380X Nitro 4G D5 + 17af 2006 Radeon R9 380X 6939 Tonga PRO [Radeon R9 285/380] 148c 9380 Radeon R9 380 +# Make naming scheme consistent + 174b e308 Radeon R9 380 Nitro 4G D5 700f RS100 AGP Bridge 7010 RS200/RS250 AGP Bridge 7100 R520 [Radeon X1800 XT] @@ -2877,7 +3029,12 @@ 72a8 RV570 [Radeon X1950 GT] (Secondary) 72b1 RV560 [Radeon X1650 XT] (Secondary) 72b3 RV560 [Radeon X1650 GT] (Secondary) - 7300 Fiji XT [Radeon R9 FURY X] + 7300 Fiji [Radeon R9 FURY / NANO Series] + 1002 0b36 Radeon R9 FURY X / NANO + 1002 1b36 Radeon Pro Duo + 1043 049e Radeon R9 FURY + 1043 04a0 Radeon R9 FURY X + 174b e329 Radeon R9 FURY 7833 RS350 Host Bridge 7834 RS350 [Radeon 9100 PRO/XT IGP] 7835 RS350M [Mobility Radeon 9000 IGP] @@ -3069,7 +3226,7 @@ 95cc RV620 GL [FirePro V3700] 95cd RV620 [FirePro 2450] 95cf RV620 GL [FirePro 2260] - 960f RS780 HDMI Audio [Radeon (HD) 
3000 Series] + 960f RS780 HDMI Audio [Radeon 3000/3100 / HD 3200/3300] 9610 RS780 [Radeon HD 3200] 1458 d000 GA-MA78GM-S2H Motherboard 9611 RS780C [Radeon 3100] @@ -3144,10 +3301,12 @@ 985f Mullins 9874 Carrizo 9900 Trinity [Radeon HD 7660G] + 103c 1985 Pavilion 17-e163sg Notebook PC # AMD A10-5800K CPU 9901 Trinity [Radeon HD 7660D] 9902 Trinity HDMI Audio Controller 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 9903 Trinity [Radeon HD 7640G] 103c 194e ProBook 455 G1 Notebook 103c 1952 ProBook 455 G1 Notebook @@ -3168,6 +3327,8 @@ 9917 Trinity [Radeon HD 7620G] 9918 Trinity [Radeon HD 7600G] 9919 Trinity [Radeon HD 7500G] + 9920 Liverpool [Playstation 4 APU] + 9921 Liverpool HDMI/DP Audio Controller 9990 Trinity [Radeon HD 7520G] 9991 Trinity [Radeon HD 7540D] 9992 Trinity [Radeon HD 7420G] @@ -3186,34 +3347,41 @@ 99a0 Trinity [Radeon HD 7520G] 99a2 Trinity [Radeon HD 7420G] 99a4 Trinity [Radeon HD 7400G] - aa00 R600 HDMI Audio [Radeon HD 2900 Series] + aa00 R600 HDMI Audio [Radeon HD 2900 GT/PRO/XT] + aa01 RV635 HDMI Audio [Radeon HD 3650/3730/3750] aa08 RV630 HDMI Audio [Radeon HD 2600 Series] - aa10 RV610 HDMI Audio [Radeon HD 2350/2400 Series] + aa10 RV610 HDMI Audio [Radeon HD 2350 PRO / 2400 PRO/XT / HD 3410] 174b aa10 Radeon HD 2400 PRO 18bc aa10 Radeon HD 2400 PRO aa18 RV670/680 HDMI Audio [Radeon HD 3690/3800 Series] - aa20 RV635 HDMI Audio [Radeon HD 3600 Series] - aa28 RV620 HDMI Audio [Radeon HD 3400 Series] + aa20 RV635 HDMI Audio [Radeon HD 3650/3730/3750] + aa28 RV620 HDMI Audio [Radeon HD 3450/3470/3550/3570] aa30 RV770 HDMI Audio [Radeon HD 4850/4870] 174b aa30 Radeon HD 4850 512MB GDDR3 PCI-E Dual Slot Fansink aa38 RV710/730 HDMI Audio [Radeon HD 4000 series] 103c 3628 dv6-1190en - aa50 Cypress HDMI Audio [Radeon HD 5800 Series] + aa50 Cypress HDMI Audio [Radeon HD 5830/5850/5870 / 6850/6870 Rebrand] aa58 Juniper HDMI Audio [Radeon HD 5700 Series] # 5500, 5600 and mobile 5700 series aa60 Redwood HDMI Audio [Radeon 
HD 5000 Series] 1025 033d Mobility Radeon HD 5650 1025 0347 Aspire 7740G - aa68 Cedar HDMI Audio [Radeon HD 5400/6300 Series] + aa68 Cedar HDMI Audio [Radeon HD 5400/6300/7300 Series] 1028 aa68 XPS 8300 - aa80 Cayman/Antilles HDMI Audio [Radeon HD 6900 Series] + aa80 Cayman/Antilles HDMI Audio [Radeon HD 6930/6950/6970/6990] aa88 Barts HDMI Audio [Radeon HD 6800 Series] - aa90 Turks/Whistler HDMI Audio [Radeon HD 6000 Series] + aa90 Turks HDMI Audio [Radeon HD 6500/6600 / 6700M Series] 1028 04a3 Precision M4600 aa98 Caicos HDMI Audio [Radeon HD 6400 Series] 174b aa98 Radeon HD 6450 1GB DDR3 - aaa0 Tahiti XT HDMI Audio [Radeon HD 7970 Series] + aaa0 Tahiti HDMI Audio [Radeon HD 7870 XT / 7950/7970] aab0 Cape Verde/Pitcairn HDMI Audio [Radeon HD 7700/7800 Series] + aac0 Tobago HDMI Audio [Radeon R7 360 / R9 360 OEM] + aac8 Hawaii HDMI Audio [Radeon R9 290/290X / 390/390X] +# I have a Tonga card and this is the HDMI Audio part + aad8 Tonga HDMI Audio [Radeon R9 285/380] + 174b aad8 Radeon R9 285/380 HDMI Audio + aae8 Fiji HDMI/DP Audio [Radeon R9 Nano / FURY/FURY X] ac00 Theater 600 Pro ac02 TV Wonder HD 600 PCIe ac12 Theater HD T507 (DVB-T) TV tuner/capture device @@ -3630,9 +3798,13 @@ 1014 049a PCIe3 x16 SAS RAID Internal Adapter 6Gb (57EE) 1014 04c7 PCIe3 x 8 Cache SAS RAID Internal Adapter 6GB(2CCA) 1014 04c8 PCIe3 x 8 Cache SAS RAID Internal Adapter 6GB(2CD2) - 1014 0c49 PCIe3 x 8 Cache SAS RAID Internal Adapter 6GB(2CCD) + 1014 04c9 PCIe3 x 8 Cache SAS RAID Internal Adapter 6GB(2CCD) 044b GenWQE Accelerator Adapter 04aa Flash Adapter 90 (PCIe2 0.9TB) + 04da PCI-E IPR SAS+ Adapter (ASIC) + 1014 04fb PCIe3 x16 20GB Cache 12Gb Quad SAS RAID+ Adapter(580B) + 1014 04fc PCIe3 x8 12Gb Quad SAS RAID+ Adapter(580A) + 04ed Internal Shared Memory (ISM) virtual PCI device 3022 QLA3022 Network Adapter 4022 QLA3022 Network Adapter ffff MPIC-2 interrupt controller @@ -3727,9 +3899,12 @@ 1404 Family 15h (Models 10h-1fh) Processor Function 4 1405 Family 15h (Models 10h-1fh) 
Processor Function 5 1410 Family 15h (Models 10h-1fh) Processor Root Complex + 103c 1985 Pavilion 17-e163sg Notebook PC 1412 Family 15h (Models 10h-1fh) Processor Root Port + 1022 1234 Trinity A-series APU 1413 Family 15h (Models 10h-1fh) Processor Root Port 1414 Family 15h (Models 10h-1fh) Processor Root Port + 1022 1234 Trinity A-series APU 1415 Family 15h (Models 10h-1fh) Processor Root Port 1416 Family 15h (Models 10h-1fh) Processor Root Port 1417 Family 15h (Models 10h-1fh) Processor Root Port @@ -3744,7 +3919,11 @@ 1422 Family 15h (Models 30h-3fh) Processor Root Complex 1423 Family 15h (Models 30h-3fh) I/O Memory Management Unit 1426 Family 15h (Models 30h-3fh) Processor Root Port + 1436 Liverpool Processor Root Complex + 1437 Liverpool I/O Memory Management Unit + 1438 Liverpool Processor Root Port 1439 Family 16h Processor Functions 5:1 + 145b Zeppelin Non-Transparent Bridge 1510 Family 14h Processor Root Complex 174b 1001 PURE Fusion Mini 1512 Family 14h Processor Root Port @@ -3890,27 +4069,34 @@ 7802 FCH SATA Controller [RAID mode] 7803 FCH SATA Controller [RAID mode] 7804 FCH SATA Controller [AHCI mode] + 103c 1985 Pavilion 17-e163sg Notebook PC 7805 FCH SATA Controller [RAID mode] 7806 FCH SD Flash Controller 7807 FCH USB OHCI Controller 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 7808 FCH USB EHCI Controller 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 7809 FCH USB OHCI Controller 103c 194e ProBook 455 G1 Notebook 780b FCH SMBus Controller 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 780c FCH IDE Controller 780d FCH Azalia Controller 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 1043 8444 F2A85-M Series 780e FCH LPC Bridge 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 780f FCH PCI Bridge 7812 FCH USB XHCI Controller 7813 FCH SD Flash Controller 7814 FCH USB XHCI Controller 103c 194e ProBook 455 
G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC 7900 FCH SATA Controller [IDE mode] 7901 FCH SATA Controller [AHCI mode] 7902 FCH SATA Controller [RAID mode] @@ -3923,6 +4109,7 @@ 790f FCH PCI Bridge 7914 FCH USB XHCI Controller 9600 RS780 Host Bridge + 1043 82ee M378A-CM Motherboard 1043 82f1 M3A78-EH Motherboard 9601 RS880 Host Bridge 1019 2120 A785GM-M @@ -4288,6 +4475,11 @@ 102b 2241 M9138 LP PCIe x16 102b 2280 M9188 ATX PCIe x16 102b 22c0 M9128 LP PCIe x16 + 0550 SV2 + 102b 00c0 MURA-IPX-I4EF + 102b 00c1 MURA-IPX-I4DF + 102b 00c3 MURA-IPX-I4DHF + 102b 00c5 MURA-IPX-I4EHF 0d10 MGA Ultima/Impression 1000 MGA G100 [Productiva] 102b ff01 Productiva G100 @@ -4479,6 +4671,7 @@ 103c 1293 USB add-in card 103c 1294 USB 2.0 add-in card 1179 0001 USB + 1186 0035 DUB-C2 USB 2.0 2-port 32-bit cardbus controller 12ee 7000 Root Hub 14c2 0105 PTI-205N USB 2.0 Host Controller 1799 0001 Root Hub @@ -4510,6 +4703,7 @@ 00ce uPD72871 [Firewarden] IEEE1394a OHCI 1.0 Link/1-port PHY Controller 00df Vr4131 00e0 uPD72010x USB 2.0 Controller + 1186 f100 DUB-C2 USB 2.0 2-port 32-bit cardbus controller 12ee 7001 Root hub 14c2 0205 PTI-205N USB 2.0 Host Controller 1799 0002 Root Hub @@ -4796,30 +4990,30 @@ 103c 3212 Smart Array E200 3239 Smart Array Gen9 Controllers 103c 21bd P244br - 103c 21be Smart Array + 103c 21be P741m 103c 21bf H240ar 103c 21c0 P440ar - 103c 21c1 Smart Array + 103c 21c1 P840ar 103c 21c2 P440 103c 21c3 P441 103c 21c4 Smart Array - 103c 21c5 Smart Array + 103c 21c5 P841 103c 21c6 H244br 103c 21c7 H240 103c 21c8 H241 103c 21c9 Smart Array - 103c 21ca Smart Array + 103c 21ca P246br 103c 21cb P840 103c 21cc Smart Array - 103c 21cd Smart Array - 103c 21ce Smart Array + 103c 21cd P240nr + 103c 21ce H240nr 323a Smart Array G6 controllers 103c 3241 Smart Array P212 103c 3243 Smart Array P410 103c 3245 Smart Array P410i 103c 3247 Smart Array P411 103c 3249 Smart Array P812 - 103c 324a HP Smart Array 712m (Mezzanine RAID controller) + 103c 324a Smart Array 712m 
(Mezzanine RAID controller) 103c 324b Smart Array P711m (Mezzanine RAID controller) 323b Smart Array Gen8 Controllers 103c 3350 P222 @@ -5140,6 +5334,7 @@ 803b 5-in-1 Multimedia Card Reader (SD/MMC/MS/MS PRO/xD) 103c 309f nx9420 103c 30a3 Compaq nw8440 + 104d 8212 VAIO VGN-N21E 104d 902d VAIO VGN-NR120E 803c PCIxx12 SDA Standard Compliant SD Host Controller 103c 309f nx9420 @@ -5171,6 +5366,7 @@ 1395 2201 WL22-PC 16ab 8501 WL-8305 IEEE802.11b+ Wireless LAN PCI Adapter 8401 ACX 100 22Mbps Wireless Interface + 8888 Multicore DSP+ARM KeyStone II SOC 9000 Wireless Interface (of unknown type) 9065 TMS320DM642 9066 ACX 111 54Mbps Wireless Interface @@ -5294,6 +5490,14 @@ 8056 Rockwell HCF 56K modem 808a Memory Stick Controller 81ce SxS Pro memory card + 908f Aeolia ACPI + 909e Aeolia Ethernet Controller (Marvell Yukon 2 Family) + 909f Aeolia SATA AHCI Controller + 90a0 Aeolia SD/MMC Host Controller + 90a1 Aeolia PCI Express Glue and Miscellaneous Devices + 90a2 Aeolia DMA Controller + 90a3 Aeolia Memory (DDR3/SPM) + 90a4 Aeolia USB 3.0 xHCI Host Controller 104e Oak Technology, Inc 0017 OTI-64017 0107 OTI-107 [Spitfire] @@ -5435,7 +5639,6 @@ 6400 MPC190 Security Processor (S1 family, encryption) 6405 MPC184 Security Processor (S1 family) 1058 Electronics & Telecommunications RSH -# Formerly: Teknor Industrial Computers Inc 1059 Kontron 105a Promise Technology, Inc. 
0d30 PDC20265 (FastTrak100 Lite/Ultra100) @@ -5701,6 +5904,7 @@ 0074 U4 HT Bridge # should be 14e4:1645 1645 Broadcom NetXtreme BCM5701 Gigabit Ethernet + 2001 PCI Express SSD 106c Hynix Semiconductor 8139 8139c 100BaseTX Ethernet Controller 8801 Dual Pentium ISA/PCI Motherboard @@ -5757,13 +5961,59 @@ 101e 8493 QLA12160 on AMI MegaRAID 1240 ISP1240 SCSI Host Adapter 1280 ISP1280 SCSI Host Adapter + 1634 FastLinQ QL45000 Series 40GbE Controller + 1077 e4f1 FastLinQ QL45212H 40GbE Adapter + 1077 e4f2 FastLinQ QL45211H 40GbE Adapter + 1077 e4f3 FastLinQ QL45412H 40GbE Adapter + 1077 e4f4 FastLinQ QL45411H 40GbE Adapter + 1644 FastLinQ QL45000 Series 100GbE Controller + 1077 e4f8 FastLinQ QL45611H 100GbE Adapter + 1656 FastLinQ QL45000 Series 25GbE Controller + 1077 02a7 QL45212-DE 25GbE Adapter + 1077 e4f6 FastLinQ QL45211H 25GbE Adapter + 1077 e4f7 FastLinQ QL45212H 25GbE Adapter + 165c FastLinQ QL45000 Series 40GbE Controller (FCoE) + 1077 e4f1 FastLinQ QL45462H 40GbE FCoE Adapter + 1077 e4f2 FastLinQ QL45461H 40GbE FCoE Adapter + 165e FastLinQ QL45000 Series 40GbE Controller (iSCSI) + 1077 e4f1 FastLinQ QL45462H 40GbE iSCSI Adapter + 1077 e4f2 FastLinQ QL45461H 40GbE iSCSI Adapter + 1664 FastLinQ QL45000 Series Gigabit Ethernet Controller (SR-IOV VF) + 1077 e4f1 FastLinQ QL45462H 40GbE Adapter (SR-IOV VF) + 1077 e4f2 FastLinQ QL45461H 40GbE Adapter (SR-IOV VF) + 1077 e4f3 FastLinQ QL45412H 40GbE Adapter (SR-IOV VF) + 1077 e4f4 FastLinQ QL45411H 40GbE Adapter (SR-IOV VF) + 1077 e4f6 FastLinQ QL45211H 25GbE Adapter (SR-IOV VF) + 1077 e4f7 FastLinQ QL45212H 25GbE Adapter (SR-IOV VF) + 1077 e4f8 FastLinQ QL45611H 100GbE Adapter (SR-IOV VF) 2020 ISP2020A Fast!SCSI Basic Adapter 2031 ISP8324-based 16Gb Fibre Channel to PCI Express Adapter + 103c 17e7 HP SN1000Q 16Gb Single Port Fibre Channel Adapter + 103c 17e8 HP SN1000Q 16Gb Dual Port Fibre Channel Adapter + 103c 1939 HP QMH2672 16Gb Dual Port Fibre Channel Adapter 103c 8002 3830C 16G Fibre Channel Host Bus Adapter + 
2071 ISP2714-based 16/32Gb Fibre Channel to PCIe Adapter + 1077 0283 QLE2764 Quad Port 32Gb Fibre Channel to PCIe Adapter + 1077 029e QLE2694 Quad Port 16Gb Fibre Channel to PCIe Adapter + 1077 02a2 QLE2694L Quad Port 16Gb Fibre Channel to PCIe Adapter + 1077 02ad QLE2694U Quad Port 16/32Gb Fibre Channel to PCIe Adapter 2100 QLA2100 64-bit Fibre Channel Adapter 1077 0001 QLA2100 64-bit Fibre Channel Adapter 2200 QLA2200 64-bit Fibre Channel Adapter 1077 0002 QLA2200 + 2261 ISP2722-based 16/32Gb Fibre Channel to PCIe Adapter + 1077 0299 QLE2740 Single Port 32Gb Fibre Channel to PCIe Adapter + 1077 029a QLE2742 Dual Port 32Gb Fibre Channel to PCIe Adapter + 1077 029b QLE2690 Single Port 16Gb Fibre Channel to PCIe Adapter + 1077 029c QLE2692 Dual Port 16Gb Fibre Channel to PCIe Adapter + 1077 02a7 QLE2690 Single Port 16Gb FC to PCIe Gen3 x8 Adapter + 1077 02a8 QLE2692 Dual Port 16Gb FC to PCIe Gen3 x8 Adapter + 1077 02ab QLE2740 Single Port 32Gb FC to PCIe Gen3 x8 Adapter + 1077 02ac QLE2742 Dual Port 32Gb FC to PCIe Gen3 x8 Adapter + 1590 00f9 HPE StoreFabric SN1100Q 16Gb Single Port Fibre Channel Host Bus Adapter + 1590 00fa HPE StoreFabric SN1100Q 16Gb Dual Port Fibre Channel Host Bus Adapter + 1590 0203 HPE StoreFabric SN1600Q 32Gb Single Port Fibre Channel Host Bus Adapter + 1590 0204 HPE StoreFabric SN1600Q 32Gb Dual Port Fibre Channel Host Bus Adapter 2300 QLA2300 64-bit Fibre Channel Adapter 2312 ISP2312-based 2Gb Fibre Channel to PCI-X HBA 103c 0131 2Gb Fibre Channel - Single port [A7538A] @@ -5776,7 +6026,9 @@ 103c 7040 FC1142SR 4Gb 1-port PCIe Fibre Channel Host Bus Adapter [HPAE311A] 2532 ISP2532-based 8Gb Fibre Channel to PCI Express HBA 103c 3262 StorageWorks 81Q + 103c 3263 StorageWorks 82Q 1077 0167 QME2572 Dual Port FC8 HBA Mezzanine + 1590 00fc HPE StoreFabric 84Q 8Gb Quad Port Fibre Channel Host Bus Adapter 3022 ISP4022-based Ethernet NIC 3032 ISP4032-based Ethernet IPv6 NIC 4010 ISP4010-based iSCSI TOE HBA @@ -5994,6 +6246,7 @@ 1093 National 
Instruments 0160 PCI-DIO-96 0162 PCI-MIO-16XE-50 + 0fe1 PXI-8320 1150 PCI-6533 (PCI-DIO-32HS) 1170 PCI-MIO-16XE-10 1180 PCI-MIO-16E-1 @@ -6085,7 +6338,10 @@ 7004 PXI-6551 700b PXI-5421 700c PCI-5421 + 701a VXIpc-87xB + 701b VXIpc-770 7023 PXI-2593 + 7027 PCI-MXI-2 Universal 702c PXI-7831R 702d PCI-7831R 702e PXI-7811R @@ -6354,6 +6610,7 @@ 731d PXI-2536 7322 PXIe-6124 7327 PXI-6529 + 732c VXI-8360T 7331 PXIe-5602 7332 PXIe-5601 7333 PXI-5900 @@ -6493,6 +6750,7 @@ 75e5 PXI-6683 75e6 PXI-6683H 75ef PXIe-5632 + 761c VXI-8360LT 761f PXI-2540 7620 PXIe-2540 7621 PXI-2541 @@ -6523,6 +6781,7 @@ 76a3 PXIe-6535B 76a4 PXIe-6536B 76a5 PXIe-6537B + 783e PXI-8368 9020 PXI-2501 9030 PXI-2503 9040 PXI-2527 @@ -6532,6 +6791,7 @@ 9080 PXI-2580 9090 PCI-4021 90a0 PXI-4021 + a001 PCI-MXI-2 b001 PCI-1408 b011 PXI-1408 b021 PCI-1424 @@ -6621,7 +6881,9 @@ 1093 762b PXIe-4138 1093 762c PXIe-4144 1093 762d PXIe-4145 + 1093 762e PXIe-5606 1093 7644 PXIe-4841 + 1093 764a PCIe-8237R-S 1093 7658 PXIe-5162 (4CH) 1093 76ab PXIe-4322 1093 76ad PXIe-4112 @@ -6629,14 +6891,25 @@ 1093 76b5 PXIe-7971R 1093 76b6 PXIe-7972R 1093 76b7 PXIe-7975R + 1093 76b8 PXIe-5696 + 1093 76b9 PXIe-5654 1093 76c8 PXIe-6614 1093 76c9 PXIe-6612 1093 76cb PXIe-5646R 1093 76cc PXIe-5162 (2CH) + 1093 76ce CVS-1459 1093 76d0 PXIe-5160 (2CH) 1093 76d1 PXIe-5160 (4CH) 1093 76dc PXIe-4610 + 1093 76ec PXIe-2524 + 1093 76ed PXIe-2525 + 1093 76ee PXIe-2526 + 1093 76ef PXIe-2737 + 1093 76f0 PXIe-2738 + 1093 76f1 PXIe-2739 1093 76fb PCIe-1473R-LX110 + 1093 76fc PXIe-5105 + 1093 76fd PXIe-5114 1093 76fe PXIe-5644R 1093 76ff PXIe-5644R 1093 7700 PXIe-5644R @@ -6645,24 +6918,49 @@ 1093 7703 PXIe-5645R 1093 770c PXIe-4139 1093 7711 PXIe-4464 + 1093 7712 PXIe-4463 1093 7716 PCIe-6612 + 1093 771d Unconfigured CA4 Switch 1093 771e PXIe-4339 1093 7735 cRIO-9033 + 1093 773e PXIe-5624R 1093 774b cRIO-9031 1093 774d cRIO-9034 1093 7755 cRIO-9030 + 1093 7768 PXIe-2747 + 1093 7769 PXIe-2748 + 1093 776a PXIe-2746 1093 7777 PXIe-7976R 1093 7782 
PXIe-5646R 1093 7783 PXIe-5646R 1093 7784 PXIe-5646R + 1093 7790 PXIe-5170R (4CH) + 1093 7791 PXIe-5170R (8CH) + 1093 7793 PXIe-5171R (8CH) 1093 77a5 PXIe-6345 1093 77a6 PXIe-6355 1093 77a7 PXIe-6365 1093 77a8 PXIe-6375 + 1093 77aa CVS-1458 + 1093 77ad IC-3173 1093 77b4 PXIe-7820R 1093 77b5 PXIe-7821R 1093 77b6 PXIe-7822R 1093 77b9 cRIO-9038 + 1093 77ba PXIe-4136 + 1093 77bb PXIe-4137 + 1093 77c0 PXIe-5624R + 1093 77c1 PXIe-5624R + 1093 77c2 PXIe-5624R + 1093 77ca PXIe-6738 + 1093 77cb PXIe-6739 + 1093 77db cRIO-9035 + 1093 77dc cRIO-9036 + 1093 77dd cRIO-9039 + 1093 7802 PXIe-4302 + 1093 7803 PXIe-4303 + 1093 7805 PXIe-4305 + 1093 788e PXIe-4304 c801 PCI-GPIB c811 PCI-GPIB+ c821 PXI-GPIB @@ -6700,6 +6998,13 @@ e251 PXI-8460 (2 ports) e261 PCI-CAN/DS e271 PXI-8462 + f110 VMEpc-650 + f120 VXIpc-650 + fe00 VXIpc-87x + fe41 VXIpc-860 + fe51 VXIpc-74x + fe61 VXIpc-850 + fe70 VXIpc-880 1094 First International Computers [FIC] # nee CMD Technology Inc 1095 Silicon Image, Inc. @@ -7068,11 +7373,14 @@ 8696 PEX 8696 96-lane, 24-Port PCI Express Gen 2 (5.0 GT/s) Multi-Root Switch 8717 PEX 8717 16-lane, 8-Port PCI Express Gen 3 (8.0 GT/s) Switch with DMA 8718 PEX 8718 16-Lane, 5-Port PCI Express Gen 3 (8.0 GT/s) Switch + 8724 PEX 8724 24-Lane, 6-Port PCI Express Gen 3 (8 GT/s) Switch, 19 x 19mm FCBGA 8732 PEX 8732 32-lane, 8-Port PCI Express Gen 3 (8.0 GT/s) Switch 8734 PEX 8734 32-lane, 8-Port PCI Express Gen 3 (8.0GT/s) Switch 8747 PEX 8747 48-Lane, 5-Port PCI Express Gen 3 (8.0 GT/s) Switch + 8748 PEX 8748 48-Lane, 12-Port PCI Express Gen 3 (8 GT/s) Switch, 27 x 27mm FCBGA # This is the Non-Transparent-Bridge Virtualized Port as presented by the PLX PEX 8732 chip, the physical bridges show up at 10b5:8732 87b0 PEX 8732 32-lane, 8-Port PCI Express Gen 3 (8.0 GT/s) Switch + 1093 7761 PXIe-8830mc 9016 PLX 9016 8-port serial controller 9030 PCI9030 32-bit 33MHz PCI <-> IOBus Bridge 10b5 2695 Hilscher CIF50-PB/DPS Profibus @@ -7084,6 +7392,7 @@ 10b5 3025 Alpermann+Velte PCL PCI 
L (3V/5V): Timecode Reader Board 10b5 3068 Alpermann+Velte PCL PCI HD (3V/5V): Timecode Reader Board 10b5 3463 Alpermann+Velte PCL PCI D (v2) (3V/5V): Timecode Reader Board + 10b5 3591 PLURA PCL PCI L (v2) (3.3V/5V): Time Code Reader Board 12fe 0111 CPCI-ASIO4 (ESD 4-port Serial Interface Board) 1369 9c01 VX222v2 1369 9d01 VX222-Mic @@ -7096,7 +7405,7 @@ 1369 a801 LCM200 1397 3136 4xS0-ISDN PCI Adapter 1397 3137 S2M-E1-ISDN PCI Adapter - 1518 0200 Kontron ThinkIO-C + 1518 0200 ThinkIO-C 15ed 1002 MCCS 8-port Serial Hot Swap 15ed 1003 MCCS 16-port Serial Hot Swap # MIL-STD-1553B Board @@ -7193,7 +7502,7 @@ 10b5 1123 Sectra KK631 encryption board 10b5 9080 9080 [real subsystem ID not set] 12d9 0002 PCI Prosody Card - 12df 4422 4422PCI ["Do-All" Telemetry Data Aquisition System] + 12df 4422 4422PCI ["Do-All" Telemetry Data Acquisition System] 1369 9601 PCX822np 1369 a102 PCX822v2 1369 a201 PCX442 @@ -7208,6 +7517,7 @@ 1885 0700 Tsunami FPGA PMC with Altera Stratix S40 1885 0701 Tsunami FPGA PMC with Altera Stratix S30 9733 PEX 9733 33-lane, 9-port PCI Express Gen 3 (8.0 GT/s) Switch + 1d49 0001 ThinkSystem P310W-4P NVMe Switch Card 9749 PEX 9749 49-lane, 13-port PCI Express Gen 3 (8.0 GT/s) Switch a100 Blackmagic Design DeckLink bb04 B&B 3PCIOSD1A Isolated PCI Serial @@ -7889,7 +8199,7 @@ 1043 0c11 A7N8X Mainboard a0a0 03b4 UK79G-1394 motherboard 006a nForce2 AC97 Audio Controler (MCP) - 1043 8095 nForce2 AC97 Audio Controler (MCP) + 1043 8095 nForce2 AC97 Audio Controller (MCP) a0a0 0304 UK79G-1394 motherboard 006b nForce Audio Processing Unit 10de 006b nForce2 MCP Audio Processing Unit @@ -8751,7 +9061,7 @@ 0446 MCP65 SMBus 103c 30cf Pavilion dv9668eg Laptop 0447 MCP65 SMU - 103c 30cf Pavilion dv9668eg Laptop + 103c 30cf Pavilion dv9500/9600/9700 series 0448 MCP65 IDE 103c 30cf Pavilion dv9668eg Laptop 0449 MCP65 PCI bridge @@ -9575,6 +9885,8 @@ 0f00 GF108 [GeForce GT 630] 0f01 GF108 [GeForce GT 620] 0f02 GF108 [GeForce GT 730] + 0f06 GF108 [GeForce GT 730] + 0fb0 
GM200 High Definition Audio 0fbb GM204 High Definition Audio Controller 0fc0 GK107 [GeForce GT 640 OEM] 0fc1 GK107 [GeForce GT 640] @@ -9619,6 +9931,7 @@ 0fea GK107M [GeForce GT 755M Mac Edition] 0fec GK107M [GeForce 710A] 0fed GK107M [GeForce 820M] + 0fee GK107M [GeForce 810M] 0fef GK107GL [GRID K340] 0ff1 GK107 [NVS 1000] 0ff2 GK107GL [GRID K1] @@ -9715,6 +10028,7 @@ 103c 2afb GeForce 705A 17aa 309d GeForce 705A 17aa 30b1 GeForce 800A + 17aa 30f3 GeForce 705A 17aa 36a1 GeForce 800A 107c GF119 [NVS 315] 107d GF119 [NVS 310] @@ -9746,7 +10060,9 @@ 10c3 GT218 [GeForce 8400 GS Rev. 3] 10c5 GT218 [GeForce 405] 10d8 GT218 [NVS 300] + 10f0 GP104 High Definition Audio Controller 1140 GF117M [GeForce 610M/710M/810M/820M / GT 620M/625M/630M/720M] + 1019 0799 GeForce 820M 1019 999f GeForce GT 720M 1025 0600 GeForce GT 620M 1025 0606 GeForce GT 620M @@ -9846,6 +10162,7 @@ 1028 065e GeForce 820M 1028 0662 GeForce 820M 1028 068d GeForce 820M + 1028 06c1 GeForce 820M 103c 18ef GeForce GT 630M 103c 18f9 GeForce GT 630M 103c 18fb GeForce GT 630M @@ -9939,7 +10256,7 @@ 144d c0e3 NVS 5200M 144d c0e4 NVS 5200M 144d c10d GeForce 820M - 144d c652 GeForce GT 620M + 144d c652 GeForce GT 620M on NP300E5C series laptop 144d c709 GeForce 710M 144d c711 GeForce 710M 144d c736 GeForce 710M @@ -10052,12 +10369,18 @@ 1854 0180 GeForce 710M 1854 0190 GeForce GT 720M 1854 0192 GeForce GT 720M + 1854 0224 GeForce 820M + 1b0a 01c0 GeForce 820M 1b0a 20dd GeForce GT 620M 1b0a 20df GeForce GT 620M 1b0a 210e GeForce 820M 1b0a 2202 GeForce GT 720M 1b0a 90d7 GeForce 820M 1b0a 90dd GeForce 820M + 1b50 5530 GeForce 820M + 1b6c 5531 GeForce GT 720M + 1bab 0106 GeForce 820M + 1d05 1013 GeForce 810M 1180 GK104 [GeForce GTX 680] 1043 83f1 GTX680-DC2-2GD5 3842 3682 GeForce GTX 680 Mac Edition @@ -10185,6 +10508,7 @@ 1287 GK208 [GeForce GT 730] 1288 GK208 [GeForce GT 720] 1289 GK208 [GeForce GT 710] + 128b GK208 [GeForce GT 710B] 1290 GK208M [GeForce GT 730M] 103c 2afa GeForce GT 730A 103c 2b04 GeForce GT 730A 
@@ -10209,7 +10533,10 @@ 1296 GK208M [GeForce 825M] 1298 GK208M [GeForce GT 720M] 1299 GK208M [GeForce 920M] - 129a GK208GL [N16V-GL] + 17aa 30bb GeForce 920A + 17aa 36a7 GeForce 920A + 17aa 36af GeForce 920M + 129a GK208M [GeForce 910M] 12a0 GK208 12b9 GK208GLM [Quadro K610M] 12ba GK208GLM [Quadro K510M] @@ -10222,7 +10549,13 @@ 1344 GM108M [GeForce 845M] 1346 GM108M [GeForce 930M] 1347 GM108M [GeForce 940M] - 137a GM108GLM [Quadro K620M] + 1348 GM108M [GeForce 945M / 945A] + 1349 GM108M [GeForce 930M] + 134d GM108M [GeForce 940MX] + 134e GM108M [GeForce 930MX] + 134f GM108M [GeForce 920MX] + 137a GM108GLM [Quadro K620M / Quadro M500M] + 17aa 505a Quadro M500M 137d GM108M [GeForce 940A] 1380 GM107 [GeForce GTX 750 Ti] 1381 GM107 [GeForce GTX 750] @@ -10236,16 +10569,27 @@ 1393 GM107M [GeForce 840M] 1398 GM107M [GeForce 845M] 139a GM107M [GeForce GTX 950M] + 17aa 362c GeForce GTX 950A + 17aa 362f GeForce GTX 950A + 17aa 363f GeForce GTX 950A + 17aa 3640 GeForce GTX 950A + 17aa 3647 GeForce GTX 950A + 17aa 36b9 GeForce GTX 950A 139b GM107M [GeForce GTX 960M] + 103c 2b4c GeForce GTX 960A 139c GM107M [GeForce 940M] + 139d GM107M [GeForce GTX 750 Ti] 13b0 GM107GLM [Quadro M2000M] 13b1 GM107GLM [Quadro M1000M] 13b2 GM107GLM [Quadro M600M] 13b3 GM107GLM [Quadro K2200M] + 13b9 GM107GL [NVS 810] 13ba GM107GL [Quadro K2200] 13bb GM107GL [Quadro K620] 13bc GM107GL [Quadro K1200] - 13bd GM107GL [GRID M40] + 13bd GM107GL [Tesla M10] + 10de 110a GRID M40 + 10de 1160 Tesla M10 13c0 GM204 [GeForce GTX 980] 1043 8504 GTX980-4GD5 13c1 GM204 @@ -10254,23 +10598,78 @@ 13d7 GM204M [GeForce GTX 980M] 13d8 GM204M [GeForce GTX 970M] 13d9 GM204M [GeForce GTX 965M] + 13da GM204M [GeForce GTX 980] 13f0 GM204GL [Quadro M5000] 13f1 GM204GL [Quadro M4000] 13f2 GM204GL [Tesla M60] + 13f3 GM204GL [Tesla M6] + 13f8 GM204GLM [Quadro M5000M] + 13f9 GM204GLM [Quadro M4000M] + 13fa GM204GLM [Quadro M3000M] + 10de 11c9 Quadro M3000 SE + 13fb GM204GLM [Quadro M5500] 1401 GM206 [GeForce GTX 960] + 1402 
GM206 [GeForce GTX 950] + 1406 GM206 [GeForce GTX 960] + 1407 GM206 [GeForce GTX 750 v2] + 1427 GM206M [GeForce GTX 965M] + 1430 GM206GL [Quadro M2000] + 1431 GM206GL [Tesla M4] + 15f0 GP100GL + 15f1 GP100GL + 15f8 GP100GL + 15f9 GP100GL 1617 GM204M [GeForce GTX 980M] 1618 GM204M [GeForce GTX 970M] 1619 GM204M [GeForce GTX 965M] + 161a GM204M [GeForce GTX 980] + 1667 GM204M [GeForce GTX 965M] + 1725 GP100 + 172e GP100 + 172f GP100 17c2 GM200 [GeForce GTX TITAN X] 17c8 GM200 [GeForce GTX 980 Ti] 17f0 GM200GL [Quadro M6000] + 17f1 GM200GL [Quadro M6000 24GB] + 17fd GM200GL [Tesla M40] + 1b00 GP102 [TITAN X] + 1b01 GP102 + 1b30 GP102GL [Quadro P6000] + 1b70 GP102GL + 1b78 GP102GL + 1b80 GP104 [GeForce GTX 1080] + 1b81 GP104 [GeForce GTX 1070] + 1b82 GP104 + 1b83 GP104 + 1ba0 GP104M [GeForce GTX 1080] + 1ba1 GP104M [GeForce GTX 1070] + 1bb0 GP104GL [Quadro P5000] + 1bb1 GP104GL + 1bb4 GP104GL + 1be0 GP104M [GeForce GTX 1080] + 1be1 GP104M [GeForce GTX 1070] + 1c00 GP106 + 1c01 GP106 + 1c02 GP106 [GeForce GTX 1060 3GB] + 1c03 GP106 [GeForce GTX 1060 6GB] + 1c20 GP106M [GeForce GTX 1060] + 1c30 GP106GL + 1c60 GP106M [GeForce GTX 1060] + 1c70 GP106GL + 1c80 GP107 + 1c81 GP107 + 1c82 GP107 [GeForce GTX 1050 Ti] + 1ca7 GP107GL + 1ca8 GP107GL + 1caa GP107GL + 1d01 GP108 10df Emulex Corporation 0720 OneConnect NIC (Skyhawk) - 103c 1934 HP FlexFabric 20Gb 2-port 650M Adapter - 103c 1935 HP FlexFabric 20Gb 2-port 650FLB Adapter - 103c 21d4 HP StoreFabric CN1200E 10Gb Converged Network Adapter - 103c 220a HP FlexFabric 10Gb 2-port 556FLR-SFP+ Adapter - 103c 803f HP Ethernet 10Gb 2-port 557SFP+ Adapter + 103c 1934 FlexFabric 20Gb 2-port 650M Adapter + 103c 1935 FlexFabric 20Gb 2-port 650FLB Adapter + 103c 21d4 StoreFabric CN1200E 10Gb Converged Network Adapter + 103c 220a FlexFabric 10Gb 2-port 556FLR-SFP+ Adapter + 103c 803f Ethernet 10Gb 2-port 557SFP+ Adapter 17aa 1056 ThinkServer OCm14102-UX-L AnyFabric 17aa 1057 ThinkServer OCm14104-UX-L AnyFabric 17aa 1059 ThinkServer 
OCm14104-UT-L AnyFabric @@ -10294,6 +10693,7 @@ e240 OneConnect iSCSI Initiator (Lancer) e260 OneConnect FCoE Initiator (Lancer) e268 OneConnect 10Gb FCoE Converged Network Adapter (Lancer-VF) + e300 Lancer Gen6: LPe32000 Fibre Channel Host Adapter f011 Saturn: LightPulse Fibre Channel Host Adapter f015 Saturn: LightPulse Fibre Channel Host Adapter f085 LP850 Fibre Channel Host Adapter @@ -10308,6 +10708,7 @@ f0e5 Zephyr LightPulse Fibre Channel Host Adapter f0f5 Neptune LightPulse Fibre Channel Host Adapter f100 Saturn-X: LightPulse Fibre Channel Host Adapter + 103c 3282 8Gb Dual-port PCI-e FC HBA f111 Saturn-X LightPulse Fibre Channel Host Adapter f112 Saturn-X LightPulse Fibre Channel Host Adapter f180 LPSe12002 EmulexSecure Fibre Channel Adapter @@ -10423,8 +10824,15 @@ 5229 RTS5229 PCI Express Card Reader 1025 0813 Aspire R7-571 103c 194e ProBook 455 G1 Notebook + 103c 1985 Pavilion 17-e163sg Notebook PC + 522a RTS522A PCI Express Card Reader 5249 RTS5249 PCI Express Card Reader 103c 1909 ZBook 15 + 524a RTS524A PCI Express Card Reader + 5250 RTS5250 PCI Express Card Reader + 525a RTS525A PCI Express Card Reader + 5286 RTS5286 PCI Express Card Reader + 5287 RTL8411B PCI Express Card Reader 5288 RTS5288 PCI Express Card Reader 5289 RTL8411 PCI Express Card Reader 1043 1457 K55A Laptop @@ -10438,11 +10846,13 @@ 8129 RTL-8129 10ec 8129 RT8129 Fast Ethernet Adapter 11ec 8129 RTL8111/8168 PCIe Gigabit Ethernet (misconfigured) - 8136 RTL8101E/RTL8102E PCI Express Fast Ethernet controller + 8136 RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller + 103c 1985 Pavilion 17-e163sg Notebook PC + 103c 2a8c Compaq 500B Microtower 103c 2ab1 Pavilion p6774 103c 30cc Pavilion dv6700 1179 ff64 RTL8102E PCI-E Fast Ethernet NIC - 17c0 1053 AzureWave AW-NE766 802.11B/G/N Mini PCIe Card Model RT2700E + 17c0 1053 RTL8101e Medion WIM 2210 Notebook PC [MD96850] 8138 RT8139 (B/C) Cardbus Fast Ethernet Adapter 10ec 8138 RT8139 (B/C) Fast Ethernet Adapter 8139 RTL-8100/8101L/8139 
PCI Fast Ethernet Adapter @@ -10505,19 +10915,20 @@ 1028 0283 Vostro 220 1028 04b2 Vostro 3350 1028 04da Vostro 3750 + 1028 06f3 Latitude 3570 103c 1611 Pavilion DM1Z-3000 103c 1950 ProBook 450/455 103c 2a6f Asus IPIBL-LB Motherboard - 1043 11f5 A6J-Q008 1043 16d5 U6V/U31J laptop 1043 81aa P5B - 1043 82c6 M3A78-EH Motherboard + 1043 82c6 M3A78 Series Motherboard 1043 83a3 M4A785TD Motherboard 1043 8432 P8P67 and other motherboards 1043 8505 P8 series motherboard 105b 0d7c D270S/D250S Motherboard 10ec 8168 RTL8111/8168 PCI Express Gigabit Ethernet controller - 1458 e000 Motherboard + 144d c652 RTL8168 on a NP300E5C series laptop + 1458 e000 Onboard Ethernet 1462 238c Onboard RTL8111b on MSI P965 Platinum Mainboard 1462 368c K9AG Neo2 1462 4180 Wind PC MS-7418 @@ -10547,10 +10958,12 @@ 8173 RTL8192SE Wireless LAN Controller 8174 RTL8192SE Wireless LAN Controller 8176 RTL8188CE 802.11b/g/n WiFi Adapter + 1043 84b5 PCE-N10 1a3b 1139 AW-NE139H Half-size Mini PCIe Card 8177 RTL8191CE PCIe Wireless Network Adapter 8178 RTL8192CE PCIe Wireless Network Adapter 8179 RTL8188EE Wireless Network Adapter + 103c 197d RTL8188EE mini-PCIe card 8180 RTL8180L 802.11b MAC 1385 4700 MA521 802.11b Wireless PC Card 1737 0019 WPC11v4 802.11b Wireless-B Notebook Adapter @@ -10560,6 +10973,7 @@ 8191 RTL8192CE PCIe Wireless Network Adapter 8192 RTL8192E/RTL8192SE Wireless LAN Controller 8193 RTL8192DE Wireless LAN Controller + 8196 RTL8196 Integrated PCI-e Bridge 8197 SmartLAN56 56K Modem 8199 RTL8187SE Wireless LAN Controller 1462 6894 MN54G2 / MS-6894 Wireless Mini PCIe Card @@ -10581,7 +10995,6 @@ 0405 Wildcard TE405P (2nd Gen) 0410 Wildcard TE410P (2nd Gen) 0600 Xilinx 6 Designs (Xilinx IP) - 2b00 Zomojo Zcard 3fc0 RME Digi96 3fc1 RME Digi96/8 3fc2 RME Digi96/8 Pro @@ -10589,6 +11002,8 @@ 3fc4 RME Digi9652 (Hammerfall) 3fc5 RME Hammerfall DSP 3fc6 RME Hammerfall DSP MADI + 7038 FPGA Card XC7VX690T + 17aa 402f FPGA XC7VX690T-3FFG1157E 8380 Ellips ProfiXpress Profibus Master 8381 Ellips 
Santos Frame Grabber d154 Copley Controls CAN card (PCI-CAN-02) @@ -10633,46 +11048,88 @@ 9500 INI-950 SCSI Adapter 9502 INI-950P Ultra Wide SCSI Adapter 1102 Creative Labs - 0002 SB Live! EMU10k1 + 0002 EMU10k1 [Sound Blaster Live! Series] 100a 1102 SB Live! 5.1 Digital OEM SB0220 EMU10K1-JFF - 1102 0020 CT4850 SBLive! Value + 1102 0020 CT4670/4850 SBLive! Value 1102 0021 CT4620 SBLive! - 1102 002f SBLive! mainboard implementation - 1102 100a SB Live! 5.1 Digital OEM [SB0220] + 1102 002f M002/M003 Integrated SBLive! + 1102 100a SB0220/0229 SBLive! 5.1 Digital OEM 1102 4001 E-mu APS 1102 8022 CT4780 SBLive! Value 1102 8023 CT4790 SoundBlaster PCI512 1102 8024 CT4760 SBLive! - 1102 8025 SBLive! Mainboard Implementation + 1102 8025 CT1140/SB0040 Integrated SBLive! 1102 8026 CT4830 SBLive! Value 1102 8027 CT4832 SBLive! Value - 1102 8028 CT4760 SBLive! OEM version + 1102 8028 CT4870 SBLive! Value + 1102 8029 CT4872 SBLive! Value + 1102 802a CT4890 SoundBlaster PCI256 + 1102 802b CT4891 SoundBlaster PCI256 1102 8031 CT4831 SBLive! Value + 1102 8032 CT4871 SBLive! Value + 1102 8033 CT4893 SoundBlaster PCI256 + 1102 8035 CT0060 SBLive! 1102 8040 CT4760 SBLive! + 1102 8050 CT4750 SoundBlaster PCI512 1102 8051 CT4850 SBLive! Value - 1102 8061 SBLive! Player 5.1 - 1102 8064 SBLive! 5.1 Model SB0100 - 1102 8065 SBLive! 5.1 Digital Model SB0220 - 1102 8066 Live! 5.1 Digital [SB0228] - 1102 8067 SBLive! 5.1 eMicro 28028 - 0004 SB Audigy - 1102 0051 SB0090 Audigy Player - 1102 0053 SB0090 Audigy Player/OEM - 1102 0058 SB0090 Audigy Player/OEM - 1102 1002 SB Audigy2 Platinum - 1102 1003 SB0350 Audigy 2 - 1102 1007 SB0240 Audigy 2 Platinum 6.1 - 1102 1009 SB Audigy2 OEM HP - 1102 2001 SB Audigy 2 ZS Platinum Pro - 1102 2002 SB Audigy 2 ZS (SB0350) - 1102 4001 E-MU 1010 + 1102 8061 SB060 SBLive! Player 5.1 + 1102 8062 SB0100 SBLive! 5.1 + 1102 8063 DXW Integrated SBLive! 5.1 + 1102 8064 SB0100/SB0102 SBLive! 5.1 + 1102 8065 SB0220/0222 SBLive! 
5.1 Digital + 1102 8066 SB0228 SBLive! 5.1 Digital + 1102 8067 SB0220 SBLive! 5.1 + 1102 8068 CT0061 SBLive! + 1102 8069 SB0101 SBLive! 5.1 Value + 1102 806a SB0103 SBLive! 5.1 + 1102 806b SB0105 SBLive! 5.1 + 1102 806c SB0221 SBLive! 5.1 + 1102 8071 SB0150 SoundBlaster PCI512 +# EMU8008 PCI version of emu8000 chip + 0003 SB AWE64(D) + 0004 EMU10k2/CA0100/CA0102/CA10200 [Sound Blaster Audigy Series] + 1102 0040 SB0090 Audigy Player +# Probably an early engineering sample + 1102 0041 CT4820 SBLive!2 + 1102 0042 CT0070 Audigy + 1102 0043 CT0072 Audigy + 1102 0051 SB0090 Audigy Player/Platinum (EX) + 1102 0052 SB0162 Audigy ES + 1102 0053 CT0090/SB0092 Audigy Player/OEM + 1102 0054 SB0161 Audigy ES + 1102 0055 SB0192 Audigy + 1102 0056 SB0191 Audigy + 1102 0057 SB0091 Audigy + 1102 0058 SB0095 Audigy Player/OEM + 1102 0059 SB0230 Audigy + 1102 005a SB0231 Audigy + 1102 005b SB0232 Audigy + 1102 005c SB0238 Audigy + 1102 1002 SB0240 Audigy 2 Platinum 6.1 + 1102 1003 SB0350 Audigy 2 / SB0243 Audigy 2 OEM + 1102 1004 SB0242 Audigy 2 + 1102 1005 SB0280 Audigy 2 Platinum Ex + 1102 1006 SB0245 Audigy 2 OEM + 1102 1007 SB0240/SB0244 Audigy 2 Platinum + 1102 1008 SB0320 Audigy 2 + 1102 1009 SB0249 Audigy 2 OEM + 1102 100a SB0246 Audigy 2 + 1102 2001 SB0360 Audigy 2 ZS Platinum Pro + 1102 2002 SB0350 Audigy 2 ZS + 1102 2003 SB0352 Audigy 2 ZS + 1102 2004 SB0355 Audigy 2 ZS + 1102 2005 SB0359 Audigy 2 ZS + 1102 2006 SB035x Audigy 2 OEM + 1102 2007 SB0380 Audigy 4 Pro + 1102 4001 E-MU 1010 [MAEM8810] 1102 4002 E-MU 0404 - 0005 SB X-Fi + 1102 4003 E-MU 1010 + 0005 EMU20k1 [Sound Blaster X-Fi Series] 1102 0021 X-Fi Platinum 1102 002c X-Fi XtremeGamer FATAL1TY PRO 1102 1003 X-Fi XtremeMusic - 0006 [SB Live! Value] EMU10k1X - 0007 CA0106 Soundblaster + 0006 EMU10k1X [SB Live! Value/OEM Series] + 0007 CA0106/CA0111 [SB Live!/Audigy/X-Fi Series] 1102 0007 SBLive! 
24bit 1102 1001 SB0310 Audigy LS 1102 1002 SB0312 Audigy LS @@ -10681,15 +11138,30 @@ 1102 1012 SB0790 X-Fi XA 1102 1013 Soundblaster X-Fi Xtreme Audio 1462 1009 K8N Diamond - 0008 SB0400 Audigy2 Value + 0008 CA0108/CA10300 [Sound Blaster Audigy Series] 1102 0008 EMU0404 Digital Audio System + 1102 1001 SB0400 Audigy 2 Value + 1102 1021 SB0610 Audigy 4 Value + 1102 1022 SBxxx Audigy 2/4 Value + 1102 1023 SB0612 Audigy 2 LS + 1102 1024 SB1550 Audigy 5/Rx + 1102 1101 SBxxxx Audigy 2 SA + 1102 2001 SB0530 Audigy 2 ZS Notebook + 1102 2021 SBxxxx Audigy 4 Notebook + 1102 4002 E-MU 0404 + 1102 4003 E-MU 1010 1102 4004 EMU1010 Digital Audio System [MAEM8960] - 0009 [SB X-Fi Xtreme Audio] CA0110-IBG - 1102 0010 [SB X-Fi Xtreme Audio] CA0110-IBG - 1102 0018 SB1040 - 000b EMU20k2 [X-Fi Titanium Series] + 1102 4005 E-MU 0404 [MAEM8984] + 1102 4007 E-MU 1010 [MAEM8982] + 1102 4201 E-MU 0202 [MAEM8950] + 0009 CA0110 [Sound Blaster X-Fi Xtreme Audio] + 1102 0010 MB0820 Integrated + 1102 0018 SB1040 PCI Express + 000b EMU20k2 [Sound Blaster X-Fi Titanium Series] 1102 0041 SB0880 [SoundBlaster X-Fi Titanium PCI-e] - 0012 SB Recon3D + 1102 0062 SB1270 [SoundBlaster X-Fi Titanium HD] + 0012 Sound Core3D [Sound Blaster Recon3D / Z-Series] + 1102 0010 SB1570 SB Audigy Fx 4001 SB Audigy FireWire Port 1102 0010 SB Audigy FireWire Port 7002 SB Live! Game Port @@ -10775,6 +11247,8 @@ 127d 0000 CineView II 8485 EM8485 REALmagic DVD/MPEG-4 A/V Decoder 8486 EM8486 REALmagic DVD/MPEG-4 A/V Decoder +# Found in Cisco DMP-4305G + c621 EM8621L Digital Media Processor c622 EM8622L MPEG-4.10 (H.264) and SMPTE 421M (VC-1) A/V Decoder 1106 VIA Technologies, Inc. 
0102 Embedded VIA Ethernet Controller @@ -11010,6 +11484,7 @@ 1043 8095 A7V8X Motherboard (Realtek ALC650 codec) 1043 80a1 A7V8X-X Motherboard 1043 80b0 A7V600/K8V-X/K8V Deluxe motherboard (ADI AD1980 codec [SoundMAX]) + 1043 80f3 ASUSTek SK8V motherboard 1043 810d Asus P5VD1-X (AD1888 codec [SoundMax]) 1043 812a A8V Deluxe motherboard (Realtek ALC850 codec) 10ec 8168 High Definition Audio @@ -11600,6 +12075,7 @@ 1093 75ff PXIe-8383mc DMA 1093 7600 PXIe-8383mc DMA 1093 7602 PXIe-8384 + 808e PES24NT24G2 PCI Express Switch # 32 port / 8 lane PCIe Gen 2 packet switch 808f PES32NT8AG2 80cf F32P08xG3 [PCIe boot mode] @@ -11989,6 +12465,7 @@ 1137 012c VIC 1340 Dual 40Gb MLOM 1137 012e VIC 1227 Dual 10Gb SFP+ PCIe 1137 0137 VIC 1380 Dual 40Gb Mezzanine + 1137 014d VIC 1385 Dual 40Gb PCIe 0041 VIC PCIe Downstream Port 0042 VIC Management Controller 1137 0047 VIC P81E PCIe Management Controller @@ -11996,6 +12473,7 @@ 1137 00cd VIC 1285 PCIe Management Controller 1137 00ce VIC 1225T PCIe Management Controller 1137 012e VIC 1227 PCIe Management Controller + 1137 014d VIC 1385 PCIe Management Controller 0043 VIC Ethernet NIC 1137 0047 VIC P81E PCIe Ethernet NIC 1137 0048 VIC M81KR Mezzanine Ethernet NIC @@ -12008,6 +12486,7 @@ 1137 012c VIC 1340 MLOM Ethernet NIC 1137 012e VIC 1227 PCIe Ethernet NIC 1137 0137 VIC 1380 Mezzanine Ethernet NIC + 1137 014d VIC 1385 PCIe Ethernet NIC 0044 VIC Ethernet NIC Dynamic 1137 0047 VIC P81E PCIe Ethernet NIC Dynamic 1137 0048 VIC M81KR Mezzanine Ethernet NIC Dynamic @@ -12020,6 +12499,7 @@ 1137 012c VIC 1340 MLOM Ethernet NIC Dynamic 1137 012e VIC 1227 PCIe Ethernet NIC Dynamic 1137 0137 VIC 1380 Mezzanine Ethernet NIC Dynamic + 1137 014d VIC 1385 PCIe Ethernet NIC Dynamic 0045 VIC FCoE HBA 1137 0047 VIC P81E PCIe FCoE HBA 1137 0048 VIC M81KR Mezzanine FCoE HBA @@ -12032,6 +12512,7 @@ 1137 012c VIC 1340 MLOM FCoE HBA 1137 012e VIC 1227 PCIe FCoE HBA 1137 0137 VIC 1380 Mezzanine FCoE HBA + 1137 014d VIC 1385 PCIe FCoE HBA 0046 VIC SCSI 
Controller 1137 012a VIC M4308 SCSI Controller 004e VIC 82 PCIe Upstream Port @@ -12040,6 +12521,7 @@ 1137 012a VIC M4308 Dual 40Gb 1137 012c VIC 1340 Dual 40Gb MLOM 1137 0137 VIC 1380 Dual 40Gb Mezzanine + 1137 014d VIC 1385 Dual 40Gb PCIe 00cf VIC Userspace NIC 1137 004f VIC 1280 Mezzanine Userspace NIC 1137 0084 VIC 1240 MLOM Userspace NIC @@ -12053,7 +12535,7 @@ 1138 Ziatech Corporation 8905 8905 [STD 32 Bridge] 1139 Dynamic Pictures, Inc - 0001 VGA Compatable 3D Graphics + 0001 VGA Compatible 3D Graphics 113a FWB Inc 113b Network Computing Devices 113c Cyclone Microsystems, Inc. @@ -12467,6 +12949,7 @@ 144d c006 vpr Matrix 170B4 CardBus bridge 0476 RL5c476 II 1014 0185 ThinkPad A/T/X Series + 1014 0555 ThinkPad X41 1014 056c ThinkPad Z60t 1028 014f Latitude X300 laptop 1028 0188 Inspiron 6000 laptop @@ -12513,14 +12996,14 @@ 103c 30b5 Presario V3242AU 103c 30b7 Presario V6133CL 103c 30cc Pavilion dv6700 - 103c 30cf Pavilion dv9668eg Laptop + 103c 30cf Pavilion dv95xx/96xx/97xx/98xx series 1043 1237 A6J-Q008 1043 1967 V6800V 144d c018 X20 IV 17aa 20ca ThinkPad T61 0811 R5C811 0822 R5C822 SD/SDIO/MMC/MS/MSPro Host Adapter - 1014 0556 ThinkPad X60s / Z60t + 1014 0556 ThinkPad X40 / X41 / X60s / Z60t 1014 0598 ThinkPad Z60m 1025 0121 Aspire 5920G 1028 0188 Inspiron 6000 laptop @@ -12559,6 +13042,7 @@ 1028 024f Dell Latitude e6500 103c 03b5 Presario V3242AU 103c 30b7 Presario V6133CL + 103c 30cf Pavilion dv9500/9600/9700 series 1183 0843 Alienware Aurora m9700 0852 xD-Picture Card Controller 1025 0121 Aspire 5920G @@ -12686,7 +13170,6 @@ 1198 Lambda Systems Inc 1199 Attachmate Corporation 0101 Advanced ISCA/PCI Adapter - 6832 Sierra Wireless MC8780 Device 119a Mind Share, Inc. 119b Omega Micro Inc. 
1221 82C092G @@ -12742,6 +13225,7 @@ 2a55 88W8864 [Avastar] 802.11ac Wireless 2b36 88W8764 [Avastar] 802.11n Wireless 2b38 88W8897 [AVASTAR] 802.11ac Wireless + 2b40 88W8964 [Avastar] 802.11ac Wireless 4101 OLPC Cafe Controller Secure Digital Controller 4320 88E8001 Gigabit Ethernet Controller 1019 0f38 Marvell 88E8001 Gigabit Ethernet Controller (ECS) @@ -13234,6 +13718,16 @@ 8032 ATTO Celerity FC8xEN 117c 003b Celerity FC-82EN Fibre Channel Adapter 117c 003c Celerity FC-84EN Fibre Channel Adapter + 8053 PM8053 SXP 12G 24-port SAS/SATA expander + 8054 PM8054 SXP 12G 36-port SAS/SATA expander + 8055 PM8055 SXP 12G 48-port SAS/SATA expander + 8056 PM8056 SXP 12G 68-port SAS/SATA expander + 8060 PM8060 SRCv 12G eight-port SAS/SATA RoC + 8063 PM8063 SRCv 12G 16-port SAS/SATA RoC + 8070 PM8070 Tachyon SPCv 12G eight-port SAS/SATA controller + 8071 PM8071 Tachyon SPCve 12G eight-port SAS/SATA controller + 8072 PM8072 Tachyon SPCv 12G 16-port SAS/SATA controller + 8073 PM8073 Tachyon SPCve 12G 16-port SAS/SATA controller 11f9 I-Cube Inc 11fa Kasan Electronics Company, Ltd. 11fb Datel Inc @@ -13287,6 +13781,7 @@ 1202 9844 SK-9843 SX dual link 1203 Bayer Corporation, Agfa Division 1204 Lattice Semiconductor Corporation + 1965 SB6501 802.11ad Wireless Network Adapter 1205 Array Corporation 1206 Amdahl Corporation 1208 Parsytec GmbH @@ -13377,6 +13872,7 @@ 1028 04a3 Precision M4600 8331 O2 Flash Memory Card 8520 SD/MMC Card Reader Controller + 8621 SD/MMC Card Reader Controller 1218 Hybricon Corp. 1219 First Virtual Corporation 121a 3Dfx Interactive, Inc. 
@@ -13598,6 +14094,7 @@ 1969 ES1938/ES1946/ES1969 Solo-1 Audiodrive 1014 0166 ES1969 SOLO-1 AudioDrive on IBM Aptiva Mainboard 125d 8888 Solo-1 Audio Adapter + 125d 8898 ES1938S TTSOLO1-SL [TerraTec 128i PCI] 153b 111b Terratec 128i PCI 1978 ES1978 Maestro 2E 0e11 b112 Armada M700/E500 @@ -13721,14 +14218,14 @@ 1273 Hughes Network Systems 0002 DirecPC 1274 Ensoniq - 1171 ES1373 [AudioPCI] (also Creative Labs CT5803) - 1371 ES1371 / Creative Labs CT2518 [AudioPCI-97] + 1171 ES1373 / Creative Labs CT5803 [AudioPCI] + 1371 ES1371/ES1373 / Creative Labs CT2518 0e11 0024 AudioPCI on Motherboard Compaq Deskpro 0e11 b1a7 ES1371, ES1373 AudioPCI 1033 80ac ES1371, ES1373 AudioPCI 1042 1854 Tazer 107b 8054 Tabor2 - 1274 1371 AudioPCI 64V/128 / Creative Sound Blaster CT4810 + 1274 1371 Audio PCI 64V/128/5200 / Creative CT4810/CT5803/CT5806 [Sound Blaster PCI] 1274 8001 CT4751 board 1462 6470 ES1371, ES1373 AudioPCI On Motherboard MS-6147 1.1A 1462 6560 ES1371, ES1373 AudioPCI On Motherboard MS-6156 1.10 @@ -13779,10 +14276,10 @@ 8086 5643 ES1371, ES1373 AudioPCI On Motherboard Vancouver 8086 5753 ES1371, ES1373 AudioPCI On Motherboard WS440BX 5000 ES1370 [AudioPCI] - 5880 5880B [AudioPCI] - 1274 2000 Creative Sound Blaster AudioPCI128 + 5880 5880B / Creative Labs CT5880 + 1274 2000 Creative CT4810 [Sound Blaster AudioPCI 128] 1274 2003 Creative SoundBlaster AudioPCI 128 - 1274 5880 Creative Sound Blaster AudioPCI128 + 1274 5880 Creative CT4750 [Sound Blaster PCI 128] 1274 8001 Sound Blaster 16PCI 4.1ch 1458 a000 5880 AudioPCI On Motherboard 6OXET 1462 6880 5880 AudioPCI On Motherboard MS-6188 1.00 @@ -13948,9 +14445,10 @@ 8213 IT8213 IDE Controller 1458 b000 GA-EG45M-DS2H Mainboard 8330 IT8330G - 8872 IT8874F PCI Dual Serial Port Controller + 8872 IT887xF PCI to ISA I/O chip with SMB, GPIO, Serial or Parallel Port 8888 IT8888F/G PCI to ISA Bridge with SMB [Golden Gate] 8889 IT8889F PCI to ISA Bridge + 8893 IT8893E PCIe to PCI Bridge e886 IT8330G 1284 Sahara Networks, Inc. 
1285 Platform Technologies, Inc. @@ -14010,7 +14508,7 @@ 12a8 News Datacom 12a9 Xiotech Corporation 12aa SDL Communications, Inc. -12ab Yuan Yuan Enterprise Co., Ltd. +12ab YUAN High-Tech Development Co., Ltd. 0000 MPG160/Kuroutoshikou ITVC15-STVLP 0002 AU8830 [Vortex2] Based Sound Card With A3D Support 0003 T507 (DVB-T) TV tuner/capture device @@ -14559,6 +15057,7 @@ 0206 GPS180PEX GPS Receiver (PCI Express) 0207 GLN180PEX GPS/GLONASS receiver (PCI Express) 0208 GPS180AMC GPS Receiver (PCI Express / MicroTCA / AdvancedMC) + 0209 GRC181PEX GPS/GLONASS/BEIDOU receiver (PCI Express) 0301 TCR510PCI IRIG Timecode Reader 0302 TCR167PCI IRIG Timecode Reader 0303 TCR511PCI IRIG Timecode Reader @@ -15032,6 +15531,8 @@ 1752 PCI-1752 1754 PCI-1754 1756 PCI-1756 +# FPGA bridge to two SJA1000 + c302 MIOe-3680 2-Port CAN-Bus MIOe Module with Isolation Protection 13ff Silicon Spice Inc 1400 Artx Inc 1401 9432 TX @@ -15171,7 +15672,6 @@ 1412 VIA Technologies Inc. 1712 ICE1712 [Envy24] PCI Multi-Channel I/O Controller 1412 1712 Hoontech ST Audio DSP 24 - 1412 3632 M-Audio Delta Audiophile 192 1412 d630 M-Audio Delta 1010 1412 d631 M-Audio Delta DiO 1412 d632 M-Audio Delta 66 @@ -15196,6 +15696,7 @@ 1412 1724 Albatron PX865PE 7.1 1412 3630 M-Audio Revolution 7.1 1412 3631 M-Audio Revolution 5.1 + 1412 3632 M-Audio Audiophile 192 153b 1145 Aureon 7.1 Space 153b 1147 Aureon 5.1 Sky 153b 1150 PHASE 22 @@ -15443,10 +15944,11 @@ 5011 T520-LL-CR Unified Wire Ethernet Controller 5012 T560-CR Unified Wire Ethernet Controller 5013 T580-CHR Unified Wire Ethernet Controller - 5014 T580-LP-SO-CR Unified Wire Ethernet Controller + 5014 T580-SO-CR Unified Wire Ethernet Controller 5015 T502-BT Unified Wire Ethernet Controller 5016 T580-OCP-SO Unified Wire Ethernet Controller 5017 T520-OCP-SO Unified Wire Ethernet Controller + 5018 T540-BT Unified Wire Ethernet Controller 5080 T540-5080 Unified Wire Ethernet Controller 5081 T540-5081 Unified Wire Ethernet Controller 5082 T504-5082 Unified Wire 
Ethernet Controller @@ -15460,6 +15962,16 @@ 5090 T540-5090 Unified Wire Ethernet Controller 5091 T522-5091 Unified Wire Ethernet Controller 5092 T520-5092 Unified Wire Ethernet Controller + 5093 T580-5093 Unified Wire Ethernet Controller + 5094 T540-5094 Unified Wire Ethernet Controller + 5095 T540-5095 Unified Wire Ethernet Controller + 5096 T580-5096 Unified Wire Ethernet Controller + 5097 T520-5097 Unified Wire Ethernet Controller + 5098 T580-5098 Unified Wire Ethernet Controller + 5099 T580-5099 Unified Wire Ethernet Controller + 509a T520-509A Unified Wire Ethernet Controller + 509b T540-509B Unified Wire Ethernet Controller + 509c T520-509C Unified Wire Ethernet Controller 5401 T520-CR Unified Wire Ethernet Controller 5402 T522-CR Unified Wire Ethernet Controller 5403 T540-CR Unified Wire Ethernet Controller @@ -15479,10 +15991,11 @@ 5411 T520-LL-CR Unified Wire Ethernet Controller 5412 T560-CR Unified Wire Ethernet Controller 5413 T580-CHR Unified Wire Ethernet Controller - 5414 T580-LP-SO-CR Unified Wire Ethernet Controller + 5414 T580-SO-CR Unified Wire Ethernet Controller 5415 T502-BT Unified Wire Ethernet Controller 5416 T580-OCP-SO Unified Wire Ethernet Controller 5417 T520-OCP-SO Unified Wire Ethernet Controller + 5418 T540-BT Unified Wire Ethernet Controller 5480 T540-5080 Unified Wire Ethernet Controller 5481 T540-5081 Unified Wire Ethernet Controller 5482 T504-5082 Unified Wire Ethernet Controller @@ -15496,6 +16009,16 @@ 5490 T540-5090 Unified Wire Ethernet Controller 5491 T522-5091 Unified Wire Ethernet Controller 5492 T520-5092 Unified Wire Ethernet Controller + 5493 T580-5093 Unified Wire Ethernet Controller + 5494 T540-5094 Unified Wire Ethernet Controller + 5495 T540-5095 Unified Wire Ethernet Controller + 5496 T580-5096 Unified Wire Ethernet Controller + 5497 T520-5097 Unified Wire Ethernet Controller + 5498 T580-5098 Unified Wire Ethernet Controller + 5499 T580-5099 Unified Wire Ethernet Controller + 549a T520-509A Unified Wire Ethernet 
Controller + 549b T540-509B Unified Wire Ethernet Controller + 549c T520-509C Unified Wire Ethernet Controller 5501 T520-CR Unified Wire Storage Controller 5502 T522-CR Unified Wire Storage Controller 5503 T540-CR Unified Wire Storage Controller @@ -15515,10 +16038,11 @@ 5511 T520-LL-CR Unified Wire Storage Controller 5512 T560-CR Unified Wire Storage Controller 5513 T580-CHR Unified Wire Storage Controller - 5514 T580-LP-SO-CR Unified Wire Storage Controller + 5514 T580-SO-CR Unified Wire Storage Controller 5515 T502-BT Unified Wire Storage Controller 5516 T580-OCP-SO Unified Wire Storage Controller 5517 T520-OCP-SO Unified Wire Storage Controller + 5518 T540-BT Unified Wire Storage Controller 5580 T540-5080 Unified Wire Storage Controller 5581 T540-5081 Unified Wire Storage Controller 5582 T504-5082 Unified Wire Storage Controller @@ -15532,6 +16056,16 @@ 5590 T540-5090 Unified Wire Storage Controller 5591 T522-5091 Unified Wire Storage Controller 5592 T520-5092 Unified Wire Storage Controller + 5593 T580-5093 Unified Wire Storage Controller + 5594 T540-5094 Unified Wire Storage Controller + 5595 T540-5095 Unified Wire Storage Controller + 5596 T580-5096 Unified Wire Storage Controller + 5597 T520-5097 Unified Wire Storage Controller + 5598 T580-5098 Unified Wire Storage Controller + 5599 T580-5099 Unified Wire Storage Controller + 559a T520-509A Unified Wire Storage Controller + 559b T540-509B Unified Wire Storage Controller + 559c T520-509C Unified Wire Storage Controller 5601 T520-CR Unified Wire Storage Controller 5602 T522-CR Unified Wire Storage Controller 5603 T540-CR Unified Wire Storage Controller @@ -15551,10 +16085,11 @@ 5611 T520-LL-CR Unified Wire Storage Controller 5612 T560-CR Unified Wire Storage Controller 5613 T580-CHR Unified Wire Storage Controller - 5614 T580-LP-SO-CR Unified Wire Storage Controller + 5614 T580-SO-CR Unified Wire Storage Controller 5615 T502-BT Unified Wire Storage Controller 5616 T580-OCP-SO Unified Wire Storage Controller 
5617 T520-OCP-SO Unified Wire Storage Controller + 5618 T540-BT Unified Wire Storage Controller 5680 T540-5080 Unified Wire Storage Controller 5681 T540-5081 Unified Wire Storage Controller 5682 T504-5082 Unified Wire Storage Controller @@ -15568,6 +16103,16 @@ 5690 T540-5090 Unified Wire Storage Controller 5691 T522-5091 Unified Wire Storage Controller 5692 T520-5092 Unified Wire Storage Controller + 5693 T580-5093 Unified Wire Storage Controller + 5694 T540-5094 Unified Wire Storage Controller + 5695 T540-5095 Unified Wire Storage Controller + 5696 T580-5096 Unified Wire Storage Controller + 5697 T520-5097 Unified Wire Storage Controller + 5698 T580-5098 Unified Wire Storage Controller + 5699 T580-5099 Unified Wire Storage Controller + 569a T520-509A Unified Wire Storage Controller + 569b T540-509B Unified Wire Storage Controller + 569c T520-509C Unified Wire Storage Controller 5701 T520-CR Unified Wire Ethernet Controller 5702 T522-CR Unified Wire Ethernet Controller 5703 T540-CR Unified Wire Ethernet Controller @@ -15587,7 +16132,7 @@ 5711 T520-LL-CR Unified Wire Ethernet Controller 5712 T560-CR Unified Wire Ethernet Controller 5713 T580-CR Unified Wire Ethernet Controller - 5714 T580-LP-SO-CR Unified Wire Ethernet Controller + 5714 T580-SO-CR Unified Wire Ethernet Controller 5715 T502-BT Unified Wire Ethernet Controller 5780 T540-5080 Unified Wire Ethernet Controller 5781 T540-5081 Unified Wire Ethernet Controller @@ -15602,6 +16147,11 @@ 5790 T540-5090 Unified Wire Ethernet Controller 5791 T522-5091 Unified Wire Ethernet Controller 5792 T520-5092 Unified Wire Ethernet Controller + 5793 T580-5093 Unified Wire Ethernet Controller + 5794 T540-5094 Unified Wire Ethernet Controller + 5795 T540-5095 Unified Wire Ethernet Controller + 5796 T580-5096 Unified Wire Ethernet Controller + 5797 T520-5097 Unified Wire Ethernet Controller 5801 T520-CR Unified Wire Ethernet Controller [VF] 5802 T522-CR Unified Wire Ethernet Controller [VF] 5803 T540-CR Unified Wire Ethernet 
Controller [VF] @@ -15621,10 +16171,11 @@ 5811 T520-LL-CR Unified Wire Ethernet Controller [VF] 5812 T560-CR Unified Wire Ethernet Controller [VF] 5813 T580-CHR Unified Wire Ethernet Controller [VF] - 5814 T580-LP-SO-CR Unified Wire Ethernet Controller [VF] + 5814 T580-SO-CR Unified Wire Ethernet Controller [VF] 5815 T502-BT Unified Wire Ethernet Controller [VF] 5816 T580-OCP-SO Unified Wire Ethernet Controller [VF] 5817 T520-OCP-SO Unified Wire Ethernet Controller [VF] + 5818 T540-BT Unified Wire Ethernet Controller [VF] 5880 T540-5080 Unified Wire Ethernet Controller [VF] 5881 T540-5081 Unified Wire Ethernet Controller [VF] 5882 T504-5082 Unified Wire Ethernet Controller [VF] @@ -15638,6 +16189,16 @@ 5890 T540-5090 Unified Wire Ethernet Controller [VF] 5891 T522-5091 Unified Wire Ethernet Controller [VF] 5892 T520-5092 Unified Wire Ethernet Controller [VF] + 5893 T580-5093 Unified Wire Ethernet Controller [VF] + 5894 T540-5094 Unified Wire Ethernet Controller [VF] + 5895 T540-5095 Unified Wire Ethernet Controller [VF] + 5896 T580-5096 Unified Wire Ethernet Controller [VF] + 5897 T520-5097 Unified Wire Ethernet Controller [VF] + 5898 T580-5098 Unified Wire Ethernet Controller [VF] + 5899 T580-5099 Unified Wire Ethernet Controller [VF] + 589a T520-509A Unified Wire Ethernet Controller [VF] + 589b T540-509B Unified Wire Ethernet Controller [VF] + 589c T520-509C Unified Wire Ethernet Controller [VF] a000 PE10K Unified Wire Ethernet Controller 1426 Storage Technology Corp. 1427 Better On-Line Solutions @@ -15701,12 +16262,15 @@ 9111 PCI-9111 9113 PCI-9113 9114 PCI-9114 +# 2-16 MB SRAM, 4x UART, I2C, misc I/O + a001 ADi-BSEC # nee Loronix Information Systems Inc. 144b Verint Systems Inc. 
144c Catalina Research Inc 144d Samsung Electronics Co Ltd 1600 Apple PCIe SSD a800 XP941 PCIe SSD + a802 NVMe SSD Controller a820 NVMe SSD Controller 171X 1028 1f95 Express Flash NVMe XS1715 SSD 400GB 1028 1f96 Express Flash NVMe XS1715 SSD 800GB @@ -15717,6 +16281,12 @@ 1028 1fbb Express Flash NVMe SM1715 1.6TB SFF 1028 1fbc Express Flash NVMe SM1715 1.6TB AIC a821 NVMe SSD Controller 172X + 1028 1fb7 Express Flash NVMe PM1725 3.2TB SFF + 1028 1fb8 Express Flash NVMe PM1725 3.2TB AIC + 1028 1fb9 Express Flash NVMe PM1725 6.4TB AIC + 1028 1fc1 Express Flash NVMe PM1725 800GB SFF + 1028 1fc2 Express Flash NVMe PM1725 1.6TB SFF + 1028 1fc4 Express Flash NVMe PM1725 1.6TB AIC 144e OLITEC 144f Askey Computer Corp. 1450 Octave Communications Ind. @@ -15787,7 +16357,6 @@ 1485 ERMA - Electronic GmBH 1486 L3 Communications Telemetry & Instrumentation 1487 MARQUETTE Medical Systems -1488 KONTRON Electronik GmBH 1489 KYE Systems Corporation 148a OPTO 148b INNOMEDIALOGIC Inc. @@ -15814,6 +16383,7 @@ 3064 TPCI100 (2 Slot IndustryPack PCI Carrier) 30c8 TPCI200 4 Slot IndustryPack PCI Carrier 70c8 TPCE200 4 Slot IndustryPack PCIe Carrier + 9177 TXMC375 8 channel RS232/RS422/RS485 programmable serial interface 1499 EMTEC CO., Ltd 149a ANDOR Technology Ltd 149b SEIKO Instruments Inc @@ -15826,9 +16396,9 @@ 14a1 Systembase Co Ltd 14a2 Millennium Engineering Inc 14a3 Maverick Networks -# registered as GVC/BCM Advanced Research -14a4 Broadcom Corporation (Wrong ID) - 4318 BCM4318 [AirForce One 54g] 802.11g Wireless LAN Controller +14a4 Lite-On Technology Corporation +# Wrong vendor ID used + 4318 Broadcom BCM4318 [AirForce One 54g] 802.11g WLAN Controller 14a5 XIONICS Document Technologies Inc 14a6 INOVA Computers GmBH & Co KG 14a7 MYTHOS Systems Inc @@ -15966,7 +16536,8 @@ 14e1 INVERTEX 14e2 INFOLIBRIA 14e3 AMTELCO -14e4 Broadcom Corporation +# Formerly Broadcom Corporation +14e4 Broadcom Limited 0576 BCM43224 802.11a/b/g/n 0800 Sentry5 Chipcommon I/O Controller 0804 Sentry5 PCI 
Bridge @@ -15991,7 +16562,7 @@ 1028 0236 PowerEdge R610 BCM5709 Gigabit Ethernet 1028 0237 PowerEdge T610 BCM5709 Gigabit Ethernet 103c 7055 NC382i Integrated Multi-port PCI Express Gigabit Server Adapter - 103c 7059 NC382T PCI Express Dual Port Multifunction Gigabit Server Adapter + 103c 7059 NC382T PCIe Dual Port Multifunction Gigabit Server Adapter 10a9 8027 Quad port Gigabit Ethernet Controller 163a NetXtreme II BCM5709S Gigabit Ethernet 1028 027b PowerEdge M805 Broadcom NetXtreme II BCM5709S @@ -16062,7 +16633,7 @@ 1647 NetXtreme BCM5703 Gigabit Ethernet 0e11 0099 NC7780 1000BaseTX 0e11 009a NC7770 1000BaseTX - 10a9 8010 SGI IO9 Gigabit Ethernet (Copper) + 10a9 8010 IO9 Gigabit Ethernet (Copper) 14e4 0009 BCM5703 1000BaseTX 14e4 000a BCM5703 1000BaseSX 14e4 000b BCM5703 1000BaseTX @@ -16118,6 +16689,8 @@ 1656 NetXtreme BCM5718 Gigabit Ethernet PCIe 1657 NetXtreme BCM5719 Gigabit Ethernet PCIe 103c 169d Ethernet 1Gb 4-port 331FLR Adapter + 103c 22be Ethernet 1Gb 4-port 331i Adapter + 103c 3383 Ethernet 1Gb 4-port 331T Adapter 1659 NetXtreme BCM5721 Gigabit Ethernet PCI Express 1014 02c6 eServer xSeries server mainboard 1028 01e6 PowerEdge 860 @@ -16186,9 +16759,9 @@ 167b NetXtreme BCM5755 Gigabit Ethernet PCI Express 103c 280a DC5750 Microtower 167d NetXtreme BCM5751M Gigabit Ethernet PCI Express - 1014 0577 ThinkPad Z60t - 103c 0934 HP nx8220 - 103c 0940 HP Compaq nw8240 Mobile Workstation + 1014 0577 ThinkPad X41 / Z60t + 103c 0934 nx8220 + 103c 0940 Compaq nw8240 Mobile Workstation 17aa 2081 ThinkPad R60e 167e NetXtreme BCM5751F Fast Ethernet PCI Express 167f NetLink BCM5787F Fast Ethernet PCI Express @@ -16211,15 +16784,15 @@ 168d NetXtreme II BCM57840 10/20 Gigabit Ethernet 168e NetXtreme II BCM57810 10 Gigabit Ethernet 103c 1798 Flex-10 10Gb 2-port 530FLB Adapter [Meru] - 103c 17a5 HP Flex-10 10Gb 2-port 530M Adapter - 103c 18d3 HP Ethernet 10Gb 2-port 530T Adapter - 103c 1930 HP FlexFabric 10Gb 2-port 534FLR-SFP+ Adapter - 103c 1931 HP StoreFabric 
CN1100R Dual Port Converged Network Adapter - 103c 1932 HP FlexFabric 10Gb 2-port 534FLB Adapter - 103c 1933 HP FlexFabric 10Gb 2-port 534M Adapter - 103c 193a HP FlexFabric 10Gb 2-port 533FLR-T Adapter - 103c 3382 HP Ethernet 10Gb 2-port 530FLR-SFP+ Adapter - 103c 339d HP Ethernet 10Gb 2-port 530SFP+ Adapter + 103c 17a5 Flex-10 10Gb 2-port 530M Adapter + 103c 18d3 Ethernet 10Gb 2-port 530T Adapter + 103c 1930 FlexFabric 10Gb 2-port 534FLR-SFP+ Adapter + 103c 1931 StoreFabric CN1100R Dual Port Converged Network Adapter + 103c 1932 FlexFabric 10Gb 2-port 534FLB Adapter + 103c 1933 FlexFabric 10Gb 2-port 534M Adapter + 103c 193a FlexFabric 10Gb 2-port 533FLR-T Adapter + 103c 3382 Ethernet 10Gb 2-port 530FLR-SFP+ Adapter + 103c 339d Ethernet 10Gb 2-port 530SFP+ Adapter 1690 NetXtreme BCM57760 Gigabit Ethernet PCIe 1691 NetLink BCM57788 Gigabit Ethernet PCIe 1028 04aa XPS 8300 @@ -16245,14 +16818,14 @@ 16a0 NetLink BCM5785 Fast Ethernet 16a1 BCM57840 NetXtreme II 10 Gigabit Ethernet 16a2 BCM57840 NetXtreme II 10/20-Gigabit Ethernet - 103c 1916 HP FlexFabric 20Gb 2-port 630FLB Adapter - 103c 1917 HP FlexFabric 20Gb 2-port 630M Adapter + 103c 1916 FlexFabric 20Gb 2-port 630FLB Adapter + 103c 1917 FlexFabric 20Gb 2-port 630M Adapter 103c 2231 3820C 10/20Gb Converged Network Adapter 103c 22fa FlexFabric 10Gb 2-port 536FLB Adapter 16a3 NetXtreme BCM57786 Gigabit Ethernet PCIe 16a4 BCM57840 NetXtreme II Ethernet Multi Function - 103c 1916 HP NPAR 20Gb 2-port 630FLB Adapter - 103c 1917 HP NPAR 20Gb 2-port 630M Adapter + 103c 1916 NPAR 20Gb 2-port 630FLB Adapter + 103c 1917 NPAR 20Gb 2-port 630M Adapter 103c 2231 3820C 10/20Gb Converged Network Adapter (NPAR 1.5) 103c 22fa FlexFabric 10Gb 2-port 536FLB Adapter (NPAR 1.5) # The Broadcom 57800 device has two 1Gig ports and two 10Gig ports. The subsystem information can be used to differentiate. 
@@ -16298,32 +16871,32 @@ 103c 703b NC373i Integrated Multifunction Gigabit Server Adapter 103c 703d NC373F PCI Express Multifunction Gigabit Server Adapter 16ad NetXtreme II BCM57840 10/20 Gigabit Ethernet Virtual Function - 103c 1916 HP FlexFabric 20Gb 2-port 630FLB Adapter - 103c 1917 HP FlexFabric 20Gb 2-port 630M Adapter + 103c 1916 FlexFabric 20Gb 2-port 630FLB Adapter + 103c 1917 FlexFabric 20Gb 2-port 630M Adapter 103c 2231 3820C 10/20Gb Converged Network Adapter (SR-IOV VF) 103c 22fa FlexFabric 10Gb 2-port 536FLB Adapter (SR-IOV VF) 16ae NetXtreme II BCM57810 10 Gigabit Ethernet Multi Function - 103c 1798 HP NPAR 10Gb 2-port 530FLB Adapter - 103c 17a5 HP NPAR 10Gb 2-port 530M Adapter - 103c 18d3 HP NPAR 10Gb 2-port 530T Adapter - 103c 1930 HP NPAR 10Gb 2-port 534FLR-SFP+ Adapter - 103c 1931 HP NPAR CN1100R Dual Port Converged Network Adapter - 103c 1932 HP NPAR 10Gb 2-port 534FLB Adapter - 103c 1933 HP NPAR 10Gb 2-port 534M Adapter - 103c 193a HP NPAR 10Gb 2-port 533FLR-T Adapter - 103c 3382 HP NPAR 10Gb 2-port 530FLR-SFP+ Adapter - 103c 339d HP NPAR 10Gb 2-port 530SFP+ Adapter + 103c 1798 NPAR 10Gb 2-port 530FLB Adapter + 103c 17a5 NPAR 10Gb 2-port 530M Adapter + 103c 18d3 NPAR 10Gb 2-port 530T Adapter + 103c 1930 NPAR 10Gb 2-port 534FLR-SFP+ Adapter + 103c 1931 NPAR CN1100R Dual Port Converged Network Adapter + 103c 1932 NPAR 10Gb 2-port 534FLB Adapter + 103c 1933 NPAR 10Gb 2-port 534M Adapter + 103c 193a NPAR 10Gb 2-port 533FLR-T Adapter + 103c 3382 NPAR 10Gb 2-port 530FLR-SFP+ Adapter + 103c 339d NPAR 10Gb 2-port 530SFP+ Adapter 16af NetXtreme II BCM57810 10 Gigabit Ethernet Virtual Function - 103c 1798 HP Flex-10 10Gb 2-port 530FLB Adapter - 103c 17a5 HP Flex-10 10Gb 2-port 530M Adapter - 103c 18d3 HP Ethernet 10Gb 2-port 530T Adapter - 103c 1930 HP FlexFabric 10Gb 2-port 534FLR-SFP+ Adapter - 103c 1931 HP StoreFabric CN1100R Dual Port Converged Network Adapter - 103c 1932 HP FlexFabric 10Gb 2-port 534FLB Adapter - 103c 1933 HP FlexFabric 10Gb 2-port 
534M Adapter - 103c 193a HP FlexFabric 10Gb 2-port 533FLR-T Adapter - 103c 3382 HP Ethernet 10Gb 2-port 530FLR-SFP+ Adapter - 103c 339d HP Ethernet 10Gb 2-port 530SFP+ Adapter + 103c 1798 Flex-10 10Gb 2-port 530FLB Adapter + 103c 17a5 Flex-10 10Gb 2-port 530M Adapter + 103c 18d3 Ethernet 10Gb 2-port 530T Adapter + 103c 1930 FlexFabric 10Gb 2-port 534FLR-SFP+ Adapter + 103c 1931 StoreFabric CN1100R Dual Port Converged Network Adapter + 103c 1932 FlexFabric 10Gb 2-port 534FLB Adapter + 103c 1933 FlexFabric 10Gb 2-port 534M Adapter + 103c 193a FlexFabric 10Gb 2-port 533FLR-T Adapter + 103c 3382 Ethernet 10Gb 2-port 530FLR-SFP+ Adapter + 103c 339d Ethernet 10Gb 2-port 530SFP+ Adapter 16b0 NetXtreme BCM57761 Gigabit Ethernet PCIe 16b1 NetLink BCM57781 Gigabit Ethernet PCIe 1849 96b1 Z77 Extreme4 motherboard @@ -16348,7 +16921,41 @@ 103c 1321 Core I/O LAN/SCSI Combo [AB314A] 14e4 0009 NetXtreme BCM5703 1000Base-T 14e4 000a NetXtreme BCM5703 1000Base-SX + 16c8 BCM57301 NetXtreme-C 10Gb Ethernet Controller + 16c9 BCM57302 NetXtreme-C 10Gb/25Gb Ethernet Controller + 16ca BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet Controller + 16cb BCM57304 NetXtreme-C Ethernet Virtual Function + 16cc BCM57417 NetXtreme-E Ethernet Partition + 16ce BCM57311 NetXtreme-C 10Gb RDMA Ethernet Controller + 16cf BCM57312 NetXtreme-C 10Gb/25Gb RDMA Ethernet Controller + 16d0 BCM57402 NetXtreme-E 10Gb Ethernet Controller + 16d1 BCM57404 NetXtreme-E 10Gb/25Gb Ethernet Controller + 16d2 BCM57406 NetXtreme-E 10GBASE-T Ethernet Controller + 16d3 BCM57404 NetXtreme-E Ethernet Virtual Function + 16d4 BCM57402 NetXtreme-E Ethernet Partition + 16d5 BCM57407 NetXtreme-E 10GBase-T Ethernet Controller + 16d6 BCM57412 NetXtreme-E 10Gb RDMA Ethernet Controller + 16d7 BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller + 1590 020e Ethernet 25Gb 2-port 631SFP28 Adapter + 1590 0211 Ethernet 25Gb 2-port 631FLR-SFP28 Adapter + 16d8 BCM57416 NetXtreme-E 10GBase-T RDMA Ethernet Controller + 1590 020c Ethernet 
10Gb 2-port 535T Adapter + 1590 0212 Ethernet 10Gb 2-port 535FLR-T Adapter + 16d9 BCM57417 NetXtreme-E 10GBASE-T RDMA Ethernet Controller + 108e 4866 Dual Port 10GBase-T Ethernet Controller + 16dc BCM57414 NetXtreme-E Ethernet Virtual Function 16dd NetLink BCM5781 Gigabit Ethernet PCI Express + 16de BCM57412 NetXtreme-E Ethernet Partition + 16df BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb RDMA Ethernet Controller + 16e1 BCM57314 NetXtreme-C Ethernet Virtual Function + 16e2 BCM57417 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller + 108e 4866 Dual Port 10Gb/25Gb SFP28 Ethernet Controller + 16e3 BCM57416 NetXtreme-E 10Gb RDMA Ethernet Controller + 16e7 BCM57404 NetXtreme-E Ethernet Partition + 16e8 BCM57406 NetXtreme-E Ethernet Partition + 16e9 BCM57407 NetXtreme-E 25Gb Ethernet Controller + 16ec BCM57414 NetXtreme-E Ethernet Partition + 16ee BCM57416 NetXtreme-E Ethernet Partition 16f3 NetXtreme BCM5727 Gigabit Ethernet PCIe 16f7 NetXtreme BCM5753 Gigabit Ethernet PCI Express 16fd NetXtreme BCM5753M Gigabit Ethernet PCI Express @@ -16548,8 +17155,10 @@ 43a0 BCM4360 802.11ac Wireless Network Adapter 43a1 BCM4360 802.11ac Wireless Network Adapter 43a2 BCM4360 802.11ac Wireless Network Adapter + 43a3 BCM4350 802.11ac Wireless Network Adapter 43a9 BCM43217 802.11b/g/n 43aa BCM43131 802.11b/g/n + 43ae BCM43162 802.11ac Wireless Network Adapter 43b1 BCM4352 802.11ac Wireless Network Adapter 43ba BCM43602 802.11ac Wireless LAN SoC 43bb BCM43602 802.11ac Wireless LAN SoC @@ -16562,6 +17171,7 @@ 4401 BCM4401 100Base-T 1025 0035 TravelMate 660 1025 0064 Extensa 3000 series laptop + 1028 8127 Dimension 2400 103c 08b0 tc1100 tablet 1043 80a8 A7V8X motherboard 4402 BCM4402 Integrated 10/100BaseT @@ -16778,6 +17388,8 @@ 2464 HSF 56k Data/Fax/Voice Modem (Mob SmartDAA) 2465 HSF 56k Data/Fax/Voice/Spkp (w/HS) Modem (Mob SmartDAA) 2466 HSF 56k Data/Fax/Voice/Spkp Modem (Mob SmartDAA) + 2702 HSFi modem RD01-D270 + 1028 8d88 SmartHSFi V92 56K PCI Modem 2f00 HSF 56k HSFi Modem 13e0 8d84 IBM 
HSFi V.90 13e0 8d85 Compaq Stinger @@ -16892,6 +17504,7 @@ 0070 8010 WinTV HVR-1400 ExpressCard 0070 f038 WinTV HVR-5525 107d 6f22 WinFast PxTV1200 + 12ab d585 PE988J Hybrid ATSC/QAM PCI-E AVS Video Capture (SoftEncoder) 13c2 3013 TT-budget CT2-4500 CI 1461 c039 AVerTV Hybrid Express (A577) 153b 117e Cinergy T PCIe Dual @@ -16903,6 +17516,7 @@ 4254 980c T980C 8880 CX23887/8 PCIe Broadcast Audio and Video Decoder with 3D Comb 0070 2259 WinTV HVR-1250 + 0070 6a18 WinTV-quadHD 0070 c108 WinTV-HVR-4400-HD model 1278 5654 2389 GoTView X5 DVD Hybrid PCI-E 5654 2390 GoTView X5 3D HYBRID PCI-E @@ -16972,8 +17586,8 @@ 1320 10bd SURECOM EP-320X-S 100/10M Ethernet PCI Adapter 0891 MTD-8xx 100/10M Ethernet PCI Adapter 1517 ECHOTEK Corp -# nee PEP MODULAR Computers GmbH -1518 Kontron Modular Computers GmbH +# old ID, now 1059 +1518 Kontron 1519 TELEFON AKTIEBOLAGET LM Ericsson 151a Globetek 1002 PCI-1002 @@ -17041,6 +17655,7 @@ 1410 CB1410 Cardbus Controller 1025 003c CL50 motherboard 1025 005a TravelMate 290 + 103c 30d5 530 Laptop 1411 CB-710/2/4 Cardbus Controller 103c 006a NX9500 1412 CB-712/4 Cardbus Controller @@ -17090,6 +17705,7 @@ 9277 5 Volt Delta Sigma Converter Card 9278 10 Volt Delta Sigma Converter Card 9287 Analog Output Card + 9290 FPGA Card 1543 SILICON Laboratories 3052 Intel 537 [Winmodem] 4c22 Si3036 MC'97 DAA @@ -17194,8 +17810,8 @@ 158d Point Multimedia Systems 158e Lara Technology Inc 158f Ditect Coop -# nee 3PAR Inc. -1590 Hewlett-Packard Company +# formerly 3PAR Inc. 
+1590 Hewlett Packard Enterprise 0001 Eagle Cluster Manager 0002 Osprey Cluster Manager 0003 Harrier Cluster Manager @@ -17236,6 +17852,7 @@ 15aa Moreton Bay 15ab Bluesteel Networks Inc 15ac North Atlantic Instruments + 6893 3U OpenVPX Multi-function I/O Board [Model 68C3] 15ad VMware 0405 SVGA II Adapter 0710 SVGA Adapter @@ -17252,6 +17869,7 @@ 07e0 SATA AHCI controller 0801 Virtual Machine Interface 15ad 0800 Hypervisor ROM Interface + 0820 Paravirtual RDMA controller 1977 HD Audio Controller 15ae Amersham Pharmacia Biotech 15b0 Zoltrix International Ltd @@ -17263,6 +17881,13 @@ 01ff MT27600 Family [Connect-IB Flash Recovery] 0209 MT27700 Family [ConnectX-4 Flash Recovery] 020b MT27710 Family [ConnectX-4 Lx Flash Recovery] + 020d MT28800 Family [ConnectX-5 Flash Recovery] +# reserved for RM#105916 + 024e MT53100 [Spectrum-2, Flash recovery mode] +# Actual value to be used + 024f MT53100 [Spectrum-2, Flash recovery mode] + 0262 MT27710 [ConnectX-4 Lx Programmable] EN + 0263 MT27710 [ConnectX-4 Lx Programmable Virtual Function] EN 1002 MT25400 Family [ConnectX-2 Virtual Function] 1003 MT27500 Family [ConnectX-3] 103c 1777 InfiniBand FDR/EN 10/40Gb Dual Port 544FLR-QSFP Adapter (Rev Cx) @@ -17276,6 +17901,7 @@ 1007 MT27520 Family [ConnectX-3 Pro] 103c 22f3 InfiniBand FDR/Ethernet 10Gb/40Gb 2-port 544+QSFP Adapter 103c 22f4 InfiniBand FDR/Ethernet 10Gb/40Gb 2-port 544+FLR-QSFP Adapter + 103c 801f Ethernet 10G 2-port 546SFP+ Adapter 117c 0090 FastFrame NQ41 117c 0091 FastFrame NQ42 117c 0092 FastFrame NQ11 @@ -17294,8 +17920,18 @@ 1014 MT27700 Family [ConnectX-4 Virtual Function] 1015 MT27710 Family [ConnectX-4 Lx] 1016 MT27710 Family [ConnectX-4 Lx Virtual Function] - 1017 MT27640 Family - 1018 MT27641 Family + 1017 MT27800 Family [ConnectX-5] + 1018 MT28800 Family [ConnectX-5 Virtual Function] + 1019 MT28800 Family [ConnectX-5 Ex] + 101a MT28800 Family [ConnectX-5 Ex Virtual Function] + 101b MT28831 + 101c MT28840 + 101d MT28841 + 101e MT28850 + 101f MT28851 + 1020 
MT28860 + 1021 MT28861 + 1974 MT28800 Family [ConnectX-5 PCIe Bridge] 5274 MT21108 InfiniBridge 5a44 MT23108 InfiniHost 5a45 MT23108 [Infinihost HCA Flash Recovery] @@ -17321,9 +17957,26 @@ 15b3 0018 HP 10 GbE PCI-e G2 Dual-Port NIC (rev C1) 675a MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe Gen2 5GT/s] 6764 MT26468 [ConnectX EN 10GigE, PCIe 2.0 5GT/s Virtualization+] - 103c 3313 HP NC542m Dual Port Flex-10 10GbE BLc Adapter + 103c 3313 NC542m Dual Port Flex-10 10GbE BLc Adapter 676e MT26478 [ConnectX EN 40GigE, PCIe 2.0 5GT/s] 6778 MT26488 [ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virtualization+] + 7101 NPS-400 configuration and management interface + 7102 NPS-400 network interface PF + 7103 NPS-400 network interface VF + 7121 NPS-600 configuration and management interface + 7122 NPS-600 network interface PF + 7123 NPS-600 network interface VF +# SwitchX-2, 40GbE switch + c738 MT51136 + c739 MT51136 GW + c838 MT52236 + c839 MT52236 router + caf1 ConnectX-4 CAPI Function +# Spectrum, 100GbE Switch + cb84 MT52100 + cf08 MT53236 + cf6c MT53100 [Spectrum-2, 64 x 100GbE switch] + d2f0 Switch-IB 3 HDR (200Gbps) switch 15b4 CCI/TRIAD 15b5 Cimetrics Inc 15b6 Texas Memory Systems Inc @@ -17502,15 +18155,15 @@ 1642 Bitland(ShenZhen) Information Technology Co., Ltd. 1657 Brocade Communications Systems, Inc. 
0013 425/825/42B/82B 4Gbps/8Gbps PCIe dual port FC HBA - 103c 1742 HP 82B 8Gbps dual port FC HBA - 103c 1744 HP 42B 4Gbps dual port FC HBA + 103c 1742 82B 8Gbps dual port FC HBA + 103c 1744 42B 4Gbps dual port FC HBA 1657 0014 425/825 4Gbps/8Gbps PCIe dual port FC HBA 0014 1010/1020/1007/1741 10Gbps CNA 1657 0014 1010/1020/1007/1741 10Gbps CNA - FCOE 1657 0015 1010/1020/1007/1741 10Gbps CNA - LL 0017 415/815/41B/81B 4Gbps/8Gbps PCIe single port FC HBA - 103c 1741 HP 41B 4Gbps single port FC HBA - 103c 1743 HP 81B 8Gbps single port FC HBA + 103c 1741 41B 4Gbps single port FC HBA + 103c 1743 81B 8Gbps single port FC HBA 1657 0014 415/815 4Gbps/8Gbps single port PCIe FC HBA 0021 804 8Gbps FC HBA for HP Bladesystem c-class # AnyIO Adapter @@ -17595,6 +18248,7 @@ 7181 Proc10a_27S 7191 Proc10a_48S 71a1 Proc10a_66S + 71b1 Proc10A 165d Hsing Tech. Enterprise Co., Ltd. 165f Linux Media Labs, LLC 1020 LMLM4 MPEG-4 encoder @@ -17797,6 +18451,7 @@ 0777 4005 SR71-15 802.11an Mini PCI Adapter 1186 3a7a DWA-552 802.11n Xtreme N Desktop Adapter (rev A2) 1186 3a7d DWA-552 802.11n Xtreme N Desktop Adapter (rev A3) + 168c 2096 Compex WLM200NX / Wistron DNMA-92 002a AR928X Wireless Network Adapter (PCI-Express) 0777 4f05 SR71-X 802.11abgn Wireless ExpressCard Adapter [AR9280] 103c 3041 AR5BHB92-H 802.11abgn Wireless Half-size Mini PCIe Card [AR9280] @@ -17837,10 +18492,15 @@ 1a56 2000 Killer Wireless-N 1102 Half-size Mini PCIe Card [AR9382] 1a56 2001 Killer Wireless-N 1103 Half-size Mini PCIe Card [AR9380] 0032 AR9485 Wireless Network Adapter + 1028 0208 Wireless 1506 WLAN Half Mini-Card 103c 1838 AR9485/HB125 802.11bgn 1×1 Wi-Fi Adapter 105b e044 Unex DHXA-225 - 0033 AR9580 Wireless Network Adapter + 144d 410e AR9485WB-EG 802.11b/g/n mini-PCIe card on a series 3 laptop + 1a3b 1186 AW-NE186H + 0033 AR958x 802.11abgn Wireless Network Adapter + 168c a120 AR9582 802.11a/n WLAN Mini-PCIe Adapter 0034 AR9462 Wireless Network Adapter + 1028 0300 Wireless 1802 802.11abgn Adapter 1a56 2003 
Killer Wireless-N 1202 Half-size Mini PCIe Card 0036 QCA9565 / AR9565 Wireless Network Adapter 0037 AR9485 Wireless Network Adapter @@ -17849,6 +18509,10 @@ 003c QCA986x/988x 802.11ac Wireless Network Adapter 003e QCA6174 802.11ac Wireless Network Adapter 1a56 1525 Killer N1525 Wireless-AC + 0040 QCA9980/9990 802.11ac Wireless Network Adapter + 0041 QCA6164 802.11ac Wireless Network Adapter + 0042 QCA9377 802.11ac Wireless Network Adapter + 0050 QCA9887 802.11ac Wireless Network Adapter 0207 AR5210 Wireless Network Adapter [AR5000 802.11a] 1014 AR5212 802.11abg NIC 1014 058a ThinkPad 11a/b/g Wireless LAN Mini Express Adapter (AR5BXB6) @@ -17874,6 +18538,8 @@ 0001 SafeXcel 1140 000a SafeXcel 1841 1141 SafeXcel 1141 +# misused vendor ID 0001 + 0001 0001 SafeXcel 1141 v. 1.1 1841 SafeXcel 1842 16af SparkLAN Communications, Inc. 16b4 Aspex Semiconductor Ltd @@ -17964,6 +18630,16 @@ 7005 XMC-7K410CC: User-Configurable Kintex-7 FPGA, 410k logic cells, conduction-cooled 7006 XMC-7A200: User-Configurable Artix-7 FPGA, 200k logic cells with Plug-In I/O 7007 XMC-7A200CC: User-Configurable Conduction-Cooled Artix-7 FPGA, with 200k logic cells + 7011 AP440-1: 32-Channel Isolated Digital Input Module + 7012 AP440-2: 32-Channel Isolated Digital Input Module + 7013 AP440-3: 32-Channel Isolated Digital Input Module + 7014 AP445: 32-Channel Isolated Digital Output Module + 7016 AP470 48-Channel TTL Level Digital Input/Output Module + 7018 AP408: 32-Channel Digital I/O Module + 701a AP220-16 12-Bit, 16-Channel Analog Output Module + 701b AP231-16 16-Bit, 16-Channel Analog Output Module + 7042 AP482 Counter Timer Module with TTL Level Input/Output + 7044 AP484 Counter Timer Module with RS422 Input/Output 16da Advantech Co., Ltd. 0011 INES GPIB-PCI 16df PIKA Technologies Inc. 
@@ -18057,8 +18733,15 @@ 0095 Octeon III CN78XX Network Processor 0096 Octeon III CN70XX Network Processor 9700 Octeon III CN73XX Network Processor + 9702 CN23XX [LiquidIO II] Intelligent Adapter + 177d 0003 CN2350 [LiquidIO II] 2-port 10GbE Intelligent adapter + 177d 0004 CN2350 [LiquidIO II] 2-port 25GbE Intelligent adapter + 9703 CN23XX [LiquidIO II] NVMe Controller + 9712 CN23XX [LiquidIO II] SRIOV Virtual Function + 177d 0003 CN2350 [LiquidIO II] 2-port 10GbE SRIOV Virtual Function + 9713 CN23XX [LiquidIO II] NVMe SRIOV Virtual Function 9800 Octeon Fusion CNF75XX Processor - a001 THUNDERX MRML Bridge + a001 ThunderX MRML(Master RML Bridge to RSL devices) a002 THUNDERX PCC Bridge 177d a102 CN88XX PCC Bridge a008 THUNDERX SMMU @@ -18096,15 +18779,24 @@ a026 THUNDERX BGX (Common Ethernet Interface) a027 THUNDERX IOBN a029 THUNDERX NCSI (Network Controller Sideband Interface) - a02a THUNDERX SGP + a02a ThunderX SGPIO (Serial GPIO controller for SATA disk lights) a02b THUNDERX SMI / MDIO Controller a02c THUNDERX DAP (Debug Access Port) a02d THUNDERX PCIERC (PCIe Root Complex) - a02e THUNDERX L2C-TAD + a02e ThunderX L2C-TAD (Level 2 cache tag and data) a02f THUNDERX L2C-CBC a030 THUNDERX L2C-MCI a031 THUNDERX MIO-FUS (Fuse Access Controller) a032 THUNDERX FUSF (Fuse Controller) + a033 THUNDERX Random Number Generator virtual function + a034 THUNDERX Network Interface Controller virtual function + a035 THUNDERX Parallel Bus + a036 ThunderX RAD (RAID acceleration engine) virtual function + a037 THUNDERX ZIP virtual function + a040 THUNDERX CPT Cryptographic Accelerator + a100 THUNDERX CN88XX 48 core SoC + a200 OCTEON TX CN81XX/CN80XX + a300 OCTEON TX CN83XX 1787 Hightech Information System Ltd. 1789 Ennyah Technologies Corp. 
# also used by Struck Innovative Systeme for joint developments @@ -18139,6 +18831,8 @@ 6816 TW6816 multimedia video controller # channel 8 of 8 6817 TW6816 multimedia video controller +# Example MuniPCI-E card: http://www.commell.com.tw/product/surveillance/MPX-6864.htm + 6864 TW6864 multimedia video controller 1799 Belkin 6001 F5D6001 Wireless PCI Card [Realtek RTL8180] 6020 F5D6020 v3000 Wireless PCMCIA Card [Realtek RTL8180] @@ -18162,6 +18856,7 @@ 8083 GL880 USB 1.1 UHCI controller 8084 GL880 USB 2.0 EHCI controller 17aa Lenovo + 402b Intel 82599ES 10Gb 2-port Server Adapter X520-2 17ab Phillips Components 17af Hightech Information System Ltd. 17b3 Hawking Technologies @@ -18174,13 +18869,16 @@ 0017 StorSecure 300 GZIP Compression and AES Encryption Card 17c0 Wistron Corp. 17c2 Newisys, Inc. -17cb Airgo Networks, Inc. +# nee Airgo Networks, Inc. +17cb Qualcomm 0001 AGN100 802.11 a/b/g True MIMO Wireless Card 1385 5c00 WGM511 Pre-N 802.11g Wireless CardBus Adapter 1737 0045 WMP54GX v1 802.11g Wireless-G PCI Adapter with SRX 0002 AGN300 802.11 a/b/g True MIMO Wireless Card 1385 6d00 WPNT511 RangeMax 240 Mbps Wireless CardBus Adapter 1737 0054 WPC54GX4 v1 802.11g Wireless-G Notebook Adapter with SRX400 + 0400 Datacenter Technologies QDF2432 PCI Express Root Port + 0401 Datacenter Technologies QDF2400 PCI Express Root Port 17cc NetChip Technology, Inc 2280 USB 2.0 17cf Z-Com, Inc. 
@@ -18192,6 +18890,11 @@ 1170 ARC-1170 24-Port PCI-X to SATA RAID Controller 1201 ARC-1200 2-Port PCI-Express to SATA II RAID Controller 1210 ARC-1210 4-Port PCI-Express to SATA RAID Controller + 1214 ARC-12x4 PCIe 2.0 to SAS/SATA 6Gb RAID Controller + 17d3 1214 ARC-1214 4-Port PCIe 2.0 to SAS/SATA 6Gb RAID Controller + 17d3 1224 ARC-1224 8-Port PCIe 2.0 to SAS/SATA 6Gb RAID Controller + 17d3 1264 ARC-1264 12/16 Port PCIe 2.0 to SATA 6Gb RAID Controller + 17d3 1284 ARC-1284 24 Port PCIe 2.0 to SATA 6Gb RAID Controller 1220 ARC-1220 8-Port PCI-Express to SATA RAID Controller 1222 ARC-1222 8-Port PCI-Express to SAS/SATA II RAID Controller 1230 ARC-1230 12-Port PCI-Express to SATA RAID Controller @@ -18199,9 +18902,22 @@ 1280 ARC-1280/1280ML 24-Port PCI-Express to SATA II RAID Controller 17d3 1221 ARC-1221 8-Port PCI-Express to SATA RAID Controller 1300 ARC-1300ix-16 16-Port PCI-Express to SAS Non-RAID Host Adapter - 1680 ARC-1680 8 port PCIe/PCI-X to SAS/SATA II RAID Controller + 1320 ARC-1320 8/16 Port PCIe 2.0 to SAS/SATA 6Gb Non-RAID Host Adapter + 1330 ARC-1330 16 Port PCIe 3.0 to SAS/SATA 12Gb Non-RAID Host Adapter + 1680 ARC-1680 series PCIe to SAS/SATA 3Gb RAID Controller 17d3 1212 ARC-1212 4-Port PCIe to SAS/SATA II RAID Controller - 1880 ARC-1880 8/12 port PCIe/PCI-X to SAS/SATA II RAID Controller + 17d3 1222 ARC-1222 8-Port PCIe to SAS/SATA 3Gb RAID Controller + 17d3 1680 ARC-1680 8/12/16/24 Port PCIe to SAS/SATA 3Gb RAID Controller + 1880 ARC-188x series PCIe 2.0/3.0 to SAS/SATA 6/12Gb RAID Controller + 17d3 1213 ARC-1213 4-Port PCIe 2.0 to SAS/SATA 6Gb RAID Controller + 17d3 1215 ARC-1215 4-Port PCIe 3.0 to SAS/SATA 6Gb RAID Controller + 17d3 1216 ARC-1216 4-Port PCIe 3.0 to SAS/SATA 12Gb RAID Controller + 17d3 1223 ARC-1223 8-Port PCIe 2.0 to SAS/SATA 6Gb RAID Controller + 17d3 1225 ARC-1225 8-Port PCIe 3.0 to SAS/SATA 6Gb RAID Controller + 17d3 1226 ARC-1226 8-Port PCIe 3.0 to SAS/SATA 12Gb RAID Controller + 17d3 1880 ARC-1880 8/12/16/24 Port PCIe 2.0 
to SAS/SATA 6Gb RAID Controller + 17d3 1882 ARC-1882 8/12/16/24 Port PCIe 3.0 to SAS/SATA 6Gb RAID Controller + 17d3 1883 ARC-1883 8/12/16/24 Port PCIe 3.0 to SAS/SATA 12Gb RAID Controller # nee Neterion Inc., previously S2io Inc. 17d5 Exar Corp. 5731 Xframe 10-Gigabit Ethernet PCI-X @@ -18236,6 +18952,44 @@ 17db Cray Inc 0101 XT Series [Seastar] 3D Toroidal Router 17de KWorld Computer Co. Ltd. +17df Dini Group + 1864 Virtex4 PCI Board w/ QL5064 Bridge [DN7000K10PCI/DN8000K10PCI/DN8000K10PSX/NOTUS] + 1865 Virtex4 ASIC Emulator [DN8000K10PCIe] + 1866 Virtex4 ASIC Emulator Cable Connection [DN8000K10PCI] + 1867 Virtex4 ASIC Emulator Cable Connection [DN8000K10PCIe] + 1868 Virtex4 ASIC Emulator [DN8000K10PCIe-8] + 1900 Virtex5 PCIe ASIC Emulator [DN9000K10PCIe8T/DN9002K10PCIe8T/DN9200K10PCIe8T/DN7006K10PCIe8T/DN7406K10PCIe8T] + 1901 Virtex5 PCIe ASIC Emulator Large BARs [DN9000K10PCIe8T/DN9002K10PCIe8T/DN9200K10PCIe8T/DN7006K10PCIe8T/DN7406K10PCIe8T] + 1902 Virtex5 PCIe ASIC Emulator Low Power [Interceptor] + 1903 Spartan6 PCIe FPGA Accelerator Board [DNBFCS12PCIe] + 1904 Virtex6 PCIe ASIC Emulation Board [DNDUALV6_PCIe4] + 1905 Virtex6 PCIe ASIC Emulation Board [DNV6F6PCIe] + 1906 Virtex6 PCIe ASIC Emulation Board [DN2076K10] + 1907 Virtex6 PCIe ASIC Emulation Board [DNV6F2PCIe] + 1908 Virtex6 PCIe ASIC Emulation Board Large BARs[DNV6F2PCIe] + 1909 Kintex7 PCIe FPGA Accelerator Board [DNK7F5PCIe] + 190a Virtex7 PCIe ASIC Emulation Board [DNV7F1A] + 190b Stratix5 PCIe ASIC Emulation Board [DNS5GXF2] + 190c Virtex7 PCIe ASIC Emulation Board [DNV7F2A] + 190d Virtex7 PCIe ASIC Emulation Board [DNV7F4A] + 190e Virtex7 PCIe ASIC Emulation Board [DNV7F2B] + 190f KintexUS PCIe MainRef Design [DNPCIE_40G_KU_LL] + 1910 VirtexUS ASIC Emulation Board [DNVUF4A] + 1911 VirtexUS PCIe ASIC Emulation Board [DNVU_F2PCIe] + 1912 KintexUS PCIe MainRef Design [DNPCIe_40G_KU_LL_QSFP] + 1913 VirtexUS ASIC Emulation Board [DNVUF1A] + 1914 VirtexUS ASIC Emulation Board [DNVUF2A] + 1915 
Arria10 PCIe MainRef Design [DNPCIe_80G_A10_LL] + 1916 VirtexUS PCIe Accelerator Board [DNVUF2_HPC_PCIe] + 1a00 Virtex6 PCIe DMA Netlist Design + 1a01 Virtex6 PCIe Darklite Design [DNPCIe_HXT_10G_LL] + 1a02 Virtex7 PCIe DMA Netlist Design + 1a03 Kintex7 PCIe Darklite Design [DNPCIe_K7_10G_LL] + 1a05 Stratix5 PCIe Darklite Design [DNS5GX_F2] + 1a06 VirtexUS PCIe DMA Netlist Design + 1a07 KintexUS PCIe Darklite Design [DNPCIe_40G_KU_LL] + 1a08 KintexUS PCIe Darklite Design [DNPCIe_40G_KU_LL_QSFP] + 1a09 Arria10 PCIe Darklite Design [DNPCIe_80G_A10_LL] 17e4 Sectra AB 0001 KK671 Cardbus encryption board 0002 KK672 Cardbus encryption board @@ -18249,6 +19003,7 @@ 17f2 Albatron Corp. 17f3 RDC Semiconductor, Inc. 1010 R1010 IDE Controller + 2012 M2012/R3308 VGA-compatible graphics adapter 6020 R6020 North Bridge 6021 R6021 Host Bridge 6030 R6030 ISA Bridge @@ -18308,6 +19063,7 @@ 13d1 abe3 miniPCI Pluscom 802.11 a/b/g 1458 e933 GN-WI01GS 1458 e934 GN-WP01GS + 1462 b833 MP54G5 (MS-6833B) 1737 0055 WMP54G v4.1 1799 700e F5D7000 v6000 Wireless G Desktop Card 1799 701e F5D7010 v6000 Wireless G Notebook Card @@ -18353,6 +19109,7 @@ 5390 RT5390 Wireless 802.11n 1T/1R PCIe 103c 1636 U98Z077.00 Half-size Mini PCIe Card 5392 RT5392 PCIe Wireless Network Adapter + 539b RT5390R 802.11bgn PCIe Wireless Network Adapter 539f RT5390 [802.11 b/g/n 1T1R G-band PCI Express Single Chip] 103c 1637 Pavilion DM1Z-3000 PCIe wireless card 5592 RT5592 PCIe Wireless Network Adapter @@ -18381,6 +19138,8 @@ 1849 ASRock Incorporation 184a Thales Computers 1100 MAX II cPLD +1850 Advantest Corporation + 0048 EK220-66401 Computer Interface Card 1851 Microtune, Inc. 1852 Anritsu Corp. 1853 SMSC Automotive Infotainment System Group @@ -18462,6 +19221,7 @@ 18d2 Sitecom Europe BV (Wrong ID) # Sitecom HFC-S based ISDN controller card DC-105v2 3069 DC-105v2 ISDN controller +18d4 Celestica 18d8 Dialogue Technology Corp. 
18dd Artimi Inc 4c6f Artimi RTMI-100 UWB adapter @@ -18544,6 +19304,7 @@ 0135 NT20E2-PTP Network Adapter 2x10Gb 0145 NT40E3-4-PTP Network Adapter 4x10Gb 0155 NT100E3-1-PTP Network Adapter 1x100Gb + 0165 NT80E3-2-PTP Network Adapter 2x40Gb 0175 NT20E3-2-PTP Network Adapter 2x10Gb 18f6 NextIO 1000 [Nexsis] Switch Virtual P2P PCIe Bridge @@ -18605,6 +19366,9 @@ 0013 SH7757 PCIe Switch [PS] 0014 uPD720201 USB 3.0 Host Controller 0015 uPD720202 USB 3.0 Host Controller + 001a SH7758 PCIe-PCI Bridge [PPB] + 001b SH7758 PCIe End-Point [PBI] + 001d SH7758 PCIe Switch [PS] 1919 Soltek Computer Inc. 1923 Sangoma Technologies Corp. 0040 A200/Remora FXO/FXS Analog AFT card @@ -18688,13 +19452,23 @@ 1924 8007 SFN7322F-R2 Precision Time SFP+ Server Adapter 1924 8009 SFN7x22F-R2 Flareon Ultra 7000 Series 10G Adapter 1924 800a SFN7x02F-R2 Flareon 7000 Series 10G Adapter - 1924 800b SFN7x22F-R3 Flareon Ultra 7000 Series 10G Adapter 1924 800c SFN7x22F-R3 Flareon Ultra 7000 Series 10G Adapter 1924 800d SFN7x02F-R3 Flareon 7000 Series 10G Adapter + 1924 8010 SFA7942Q-R1 QSFP+ AOE Adapter + 1924 8015 SFA7942Q-A5-0-R1 QSFP+ AOE Adapter 0923 SFC9140 1924 800b SFN7x42Q-R1 Flareon Ultra 7000 Series 10/40G Adapter 1924 800e SFN7x42Q-R2 Flareon Ultra 7000 Series 10/40G Adapter 1924 800f SFN7xx4F-R1 Flareon Ultra 7000 Series 10G Adapter + 0a03 SFC9220 + 1924 8011 SFN 8022-R1 Solarflare Flareon 8000 Series 10G Adapter + 1924 8012 SFN8522-R1 Flareon Ultra 8000 Series 10G Adapter + 1924 8013 SFN8042-R1 Solarflare Flareon 8000 Series 10/40G Adapter + 1924 8014 SFN8542-R1 Flareon Ultra 8000 Series 10/40G Adapter + 1924 8016 SFN8022-R2 Flareon 8000 Series 10G Adapter + 1924 8017 SFN8522-R2 Flareon Ultra 8000 Series 10G Adapter + 1924 8018 SFN8042-R2 Flareon 8000 Series 10/40G Adapter + 1924 8019 SFN8542-R2 Flareon Ultra 8000 Series 10/40G Adapter 1803 SFC9020 Virtual Function [Solarstorm] 1813 SFL9021 Virtual Function [Solarstorm] 1903 SFC9120 Virtual Function @@ -18719,15 +19493,19 @@ 000c Qualcomm 
MSM6275 UMTS chip 1932 DiBcom 193c MAXIM Integrated Products -193f Comtech AHA Corp. +193f AHA Products Group 0001 AHA36x-PCIX 0360 AHA360-PCIe 0363 AHA363-PCIe 0364 AHA364-PCIe 0367 AHA367-PCIe 0370 AHA370-PCIe + 0604 AHA604 + 0605 AHA605 3641 AHA3641 3642 AHA3642 + 6101 AHA6101 + 6102 AHA6102 1942 ClearSpeed Technology plc e511 Advance X620 accelerator card e521 Advance e620 accelerator card @@ -18811,6 +19589,7 @@ 0401 P4080 0408 P4040E 0409 P4040 + 041f P3041 0440 T4240 with security 0441 T4240 without security 0446 T4160 with security @@ -18827,6 +19606,8 @@ 1a56 1201 Killer E2100 Gigabit Ethernet Controller # PCIe interface for emulator fc02 RedStone +# CFI device over PCIe + fc03 CFI 1958 Faster Technology, LLC. 1959 PA Semi, Inc a000 PA6T Core @@ -18861,6 +19642,7 @@ 1073 AR8151 v1.0 Gigabit Ethernet 1083 AR8151 v2.0 Gigabit Ethernet 1090 AR8162 Fast Ethernet + 1043 108d VivoBook X202E, X202EV 1091 AR8161 Gigabit Ethernet 1043 1477 N56VZ 10a0 QCA8172 Fast Ethernet @@ -18870,6 +19652,7 @@ 2062 AR8152 v2.0 Fast Ethernet # E2200, E2201, E2205 e091 Killer E220x Gigabit Ethernet Controller + e0a1 Killer E2400 Gigabit Ethernet Controller 196a Sensory Networks Inc. 0101 NodalCore C-1000 Content Classification Accelerator 0102 NodalCore C-2000 Content Classification Accelerator @@ -18931,6 +19714,7 @@ 8521 AU8521 TV card # nee ServerEngines Corp. 19a2 Emulex Corporation + 0120 x1 PCIe Gen2 Bridge[Pilot4] 0200 BladeEngine 10Gb PCI-E iSCSI adapter 0201 BladeEngine 10Gb PCIe Network Adapter 0211 BladeEngine2 10Gb Gen2 PCIe Network Adapter @@ -19075,6 +19859,7 @@ 0001 Vulcan SP HT6210 10-Gigabit Ethernet (rev 02) 1a88 MEN Mikro Elektronik 4d45 Multifunction IP core +1a8a StarBridge, Inc. 1a8c Verigy Pte. Ltd. 1100 E8001-66443 PCI Express CIC 1a8e DRS Technologies @@ -19098,6 +19883,9 @@ 9100 TPRO-PCI-66U Timecode Reader/Generator 1ade Spin Master Ltd. 
1501 Swipetech barcode scanner + 3038 PCIe Video Bridge + 13c2 3016 TT-budget S2-4200 Twin + 4254 0552 S952 v3 1ae0 Google, Inc. 1ae7 First Wise Media GmbH 0520 HFC-S PCI A [X-TENSIONS XC-520] @@ -19144,8 +19932,24 @@ 1004 Virtio SCSI 1005 Virtio RNG 1009 Virtio filesystem - 1010 Virtio GPU - 1012 Virtio input device +# virtio 1.0 + 1041 Virtio network device +# virtio 1.0 + 1042 Virtio block device +# virtio 1.0 + 1043 Virtio console +# virtio 1.0 + 1044 Virtio RNG +# virtio 1.0 + 1045 Virtio memory balloon +# virtio 1.0 + 1048 Virtio SCSI +# virtio 1.0 + 1049 Virtio filesystem +# virtio 1.0 + 1050 Virtio GPU +# virtio 1.0 + 1052 Virtio input 1110 Inter-VM shared memory 1af4 1100 QEMU Virtual Machine 1af5 Netezza Corp. @@ -19180,6 +19984,9 @@ 1af4 1100 QEMU Virtual Machine 0005 QEMU PCI Test Device 1af4 1100 QEMU Virtual Machine + 0006 PCI Rocker Ethernet switch device + 0007 PCI SD Card Host Controller Interface + 000a PCI-PCI bridge (multiseat) 0100 QXL paravirtual graphic card 1af4 1100 QEMU Virtual Machine 1b37 Signal Processing Devices Sweden AB @@ -19229,12 +20036,14 @@ 91a4 88SE912x IDE Controller 9220 88SE9220 PCIe 2.0 x2 2-port SATA 6 Gb/s RAID Controller 9230 88SE9230 PCIe SATA 6Gb/s Controller + 1d49 0300 ThinkSystem M.2 with Mirroring Enablement Kit 9235 88SE9235 PCIe 2.0 x2 4-port SATA 6 Gb/s Controller 9445 88SE9445 PCIe 2.0 x4 4-Port SAS/SATA 6 Gbps RAID Controller 9480 88SE9480 SAS/SATA 6Gb/s RAID controller 9485 88SE9485 SAS/SATA 6Gb/s controller 1b55 NetUP Inc. 18f6 Dual DVB Universal CI card + 18f7 Dual DVB Universal CI card rev 1.4 2a2c Dual DVB-S2-CI card e2e4 Dual DVB-T/C-CI RF card # 2xHDMI and 2xHD-SDI inputs @@ -19257,9 +20066,12 @@ d230 D230 Dual-port E1/T1 card (2nd generation) d410 D410/430 Quad-port E1/T1 card d430 D410/430 Quad-port E1/T1 card +1b79 Absolute Analysis 1b85 OCZ Technology Group, Inc. 
1041 RevoDrive 3 X2 PCI-Express SSD 240 GB (Marvell Controller) 8788 RevoDrive Hybrid +1b94 Signatec / Dynamic Signals Corp + e400 PX14400 Dual Xilinx Virtex5 based Digitizer 1b96 Western Digital 1b9a XAVi Technologies Corp. 1bad ReFLEX CES @@ -19277,6 +20089,16 @@ 1bb1 6511 Nytro XH6550-2GB DRAM # 8GB variant of Nytro PCIe controller 1bb1 6512 Nytro XH6550-8GB DRAM +# 1.5 TB Nytro PCIe controller + 1bb1 6521 Nytro XP6500-8A1536 1.5TB +# 2TB Nytro PCIe controller + 1bb1 6522 Nytro XP6500-8A2048 +# 4TB Nytro PCIe controller + 1bb1 6523 Nytro XP6500-8A4096 + 0100 Nytro Flash Storage + 1bb1 0101 Nytro XF1440 + 1bb1 0121 Nytro XM1440 + 1bb1 01a1 Nytro XP7102 1bb3 Bluecherry 4304 BC-04120A MPEG4 4 port video encoder / decoder 4309 BC-08240A MPEG4 4 port video encoder / decoder @@ -19299,8 +20121,11 @@ 1101 OmniBus II PCIe Multi-Protocol Interface Card 1102 OmniBusBox II Multi-Protocol Interface Core 1103 OmniBus II cPCIe/PXIe Multi-Protocol Interface Card +1bd4 Inspur Electronic Information Industry Co., Ltd. 1bee IXXAT Automation GmbH 0003 CAN-IB200/PCIe +1bef Lantiq + 0011 MIPS SoC PCI Express Port 1bf4 VTI Instruments Corporation 0001 SentinelEX 1bfd EeeTOP @@ -19308,6 +20133,13 @@ 4254 10G-PCIE3-8D-2S 4255 10G-PCIE3-8D-Q 4256 10G-PCIE3-8D-2S + 4258 10G-PCIE3-8E-2S Network Adapter + 4260 10G-PCIE3-8E-4S Network Adapter + 4261 10G-PCIE3-8E-4S Network Adapter + 4262 10G-PCIE3-8E-4S Network Adapter + 4263 10G-PCIE3-8E-4S Network Adapter + 4264 10G-PCIE3-8E-2S Network Adapter + 4265 10G-PCIE3-8E-2S Network Adapter 1c1c Symphony 0001 82C101 1c28 Lite-On IT Corp. / Plextor @@ -19322,6 +20154,8 @@ 00a4 FBC4XGG3 Capture 4x10Gb 00a5 FBC2XLG Capture 2x40Gb 00a6 FBC1CG Capture 1x100Gb + 00a9 FBC2XGHH Capture 2x10Gb + 00af Capture slave device # Used on V120 VME Crate Controller 1c32 Highland Technology, Inc. 1c33 Daktronics, Inc @@ -19334,6 +20168,12 @@ # A Western Digital Subsidiary 1c58 HGST, Inc. 
0003 Ultrastar SN100 Series NVMe SSD + 1014 04f5 PCIe3 1.6TB NVMe Flash Adapter + 1014 04f6 PCIe3 3.2TB NVMe Flash Adapter +# http://www.nicevt.ru/ (in Russian) +1c63 Science and Research Centre of Computer Technology (JSC "NICEVT") +# http://www.radiotec.ru/catalog.php?cat=jr8&art=14109 + 0008 K1927BB1Ya [EC8430] Angara Interconnection Network Adapter 1c7e TTTech Computertechnik AG 0200 zFAS Debug Port 1c7f Elektrobit Austria GmbH @@ -19341,25 +20181,44 @@ 1c8a TSF5 Corporation 0001 Hunter PCI Express 1cb1 Collion UG & Co.KG +1cb8 Dawning Information Industry Co., Ltd. 1cc5 Embedded Intelligence, Inc. 0100 CAN-PCIe-02 +1cc7 Radian Memory Systems Inc. + 0200 RMS-200 + 0250 RMS-250 1cd2 SesKion GmbH 0301 Simulyzer-RT CompactPCI Serial DIO-1 card + 0302 Simulyzer-RT CompactPCI Serial PSI5-ECU-1 card + 0303 Simulyzer-RT CompactPCI Serial PSI5-SIM-1 card + 0304 Simulyzer-RT CompactPCI Serial PWR-ANA-1 card + 0305 Simulyzer-RT CompactPCI Serial CAN-1 card +1cdd secunet Security Networks AG 1ce4 Exablaze 0001 ExaNIC X4 0002 ExaNIC X2 + 0003 ExaNIC X10 + 0004 ExaNIC X10-GM + 0005 ExaNIC X40 1cf7 Subspace Dynamics 1d00 Pure Storage +1d1d CNEX Labs + 1f1f QEMU NVM Express LightNVM Controller + 2807 8800 series NVMe SSD # CEM Solutions Pvt. Ltd. 1d21 Allo 1d26 Kalray Inc. 0040 Turbocard2 Accelerator + 0080 Open Network Interface Card 80G + 00c0 Turbocard3 Accelerator e004 AB01/EMB01 Development Board 1d40 Techman Electronics (Changshu) Co., Ltd. 1d44 DPT a400 PM2x24/PM3224 +1d49 Lenovo 1d5c Fantasia Trading LLC 1d61 Technobox, Inc. +1d62 Nebbiolo Technologies 1d65 Imagine Communications Corp. 
04de Taurus/McKinley 1d6c Atomic Rules LLC @@ -19370,7 +20229,16 @@ 1005 ZC706-Z045 1006 KCU105-KU040 1007 XUSP3S-VU095 [Jasper] + 1008 XUSPL4-VU065 [Mustang UltraScale] + 1009 XUSPL4-VU3P [Mustang UltraScale+] + 100a A10PL4-A10GX115 + 100b K35-2SFP + 100c K35-4SFP + 100d AR-ARKA-FX0 [Arkville 32B DPDK Data Mover] + 100e AR-ARKA-FX1 [Arkville 64B DPDK Data Mover] 4200 A5PL-E1-10GETI [10 GbE Ethernet Traffic Instrument] +1d78 DERA +1d8f Enyx 1de1 Tekram Technology Co.,Ltd. 0391 TRM-S1040 [DC-315 / DC-395 series] 2020 DC-390 @@ -19411,13 +20279,18 @@ 4010 TN4010 Clean SROM 4020 TN9030 10GbE CX4 Ethernet Adapter 4022 TN9310 10GbE SFP+ Ethernet Adapter + 1043 8709 XG-C100F 10GbE SFP+ Ethernet Adapter 1186 4d00 DXE-810S 10GbE SFP+ Ethernet Adapter + 1432 8103 EN-8102PF 10GbE SPF+ Ethernet Adapter 1fc9 3015 Ethernet Adapter 4024 TN9210 10GBase-T Ethernet Adapter 4025 TN9510 10GBase-T/NBASE-T Ethernet Adapter + 105a 7203 SANLink3 NBase-T1 1186 2900 DXE-810T 10GBase-T Ethernet Adapter + 1432 8102 EN-8102P 10GbE Ethernet Adapter 1fc9 3015 Ethernet Adapter 4026 TN9610 10GbE SFP+ Ethernet Adapter + 4027 TN9710 10GBase-T/NBASE-T Ethernet Adapter 1fcc StreamLabs f416 MS416 fb01 MH4LM @@ -19445,7 +20318,9 @@ 2955 Connectix Virtual PC 6e61 OHCI USB 1.1 controller 2a15 3D Vision(???) +2bd8 ROPEX Industrie-Elektronik GmbH 3000 Hansol Electronics Inc. +3112 Satelco Ingenieria S.A. 3142 Post Impression Systems. 31ab Zonet 1faa ZEW1602 802.11b/g Wireless Adapter @@ -19465,6 +20340,7 @@ 4c53 4000 PMCCARR1 carrier board 0022 HiNT HB4 PCI-PCI Bridge (PCI6150) 0026 HB2 PCI-PCI Bridge + 1014 AudioTrak Maya 1018 Audiotrak INCA88 1019 Miditrak 2120 101a E.Band [AudioTrak Inca88] @@ -19488,6 +20364,7 @@ 1140 VR-12-PCI # multiport serial board 1141 PCI-485(422) + 1142 PCI-CAN2 3842 eVga.com. Corp. 
38ef 4Links 3d3d 3DLabs @@ -19669,64 +20546,136 @@ adc1 ADC200ME High speed ADC de01 DL200ME High resolution delay line PCI based card de02 DL200ME Middle resolution delay line PCI based card +# Can't find any information on this company +4651 TXIC 4680 Umax Computer Corp 4843 Hercules Computer Technology Inc 4916 RedCreek Communications Inc 1960 RedCreek PCI adapter 4943 Growth Networks 494f ACCES I/O Products, Inc. - 0520 PCI-IDO-48 - 0920 PCI-IDI-48 - 0c50 PCI-DIO-24H - 0c51 PCI-DIO-24D - 0c60 PCI-DIO-48(H) - 0c68 PCI-DIO-72 - 0c70 PCI-DIO-96 - 0c78 PCI-DIO-120 - 0dc8 PCI-IDIO-16 - 0e50 PCI-DIO-24S - 0e51 PCI-DIO-24H(C) - 0e52 PCI-DIO-24D(C) - 0e60 PCI-DIO-48S(H) - 0e61 P104-DIO-24S - 0f00 PCI-IIRO-8 - 0f01 LPCI-IIRO-8 - 0f08 PCI-IIRO-16 - 1050 PCI-422/485-2 - 1058 PCI-COM422/4 - 1059 PCI-COM485/4 - 1068 PCI-COM422/8 - 1069 PCI-COM485/8 - 1088 PCI-COM232/1 - 1090 PCI-COM232/2 - 10a8 P104-COM232-8 - 10c9 PCI-COM-1S - 10d0 PCI-COM2S - 10e8 PCI-COM-8SM - 1148 PCI-ICM-1S - 1150 PCI-ICM-2S - 1158 PCI-ICM422/4 - 1159 PCI-ICM485/4 - 1250 PCI-WDG-2S + 0508 PCI-IDO-16A FET Output Card + 0518 PCI-IDO-32A FET Output Card + 0520 PCI-IDO-48 FET Output Card + 0521 PCI-IDO-48A FET Output Card + 0703 PCIe-RO-4 Electromechanical Relay Output Card + 07d0 PCIe-IDO-24 FET Output Card + 0920 PCI-IDI-48 Isolated Digital Input Card + 0bd0 PCIe-IDI-24 Isolated Digital Input Card + 0c50 PCI-DIO-24H 1x 8255 Digital Input / Output Card + 0c51 PCI-DIO-24D 1x 8255 Digital Input / Output Card + 0c52 PCIe-DIO-24 1x 8255 Digital Input / Output Card + 0c53 PCIe-DIO-24H 8255 Digital Input / Output Card + 0c57 mPCIe-DIO-24 8255 Digital Input / Output Card + 0c60 PCI-DIO-48H 8255 Digital Input / Output Card + 0c61 PCIe-DIO-48 8255 Digital Input / Output Card + 0c62 P104-DIO-48 8255 Digital Input / Output Card + 0c68 PCI-DIO-72 8255 Digital Input / Output Card + 0c69 P104-DIO-96 8255 Digital Input / Output Card + 0c70 PCI-DIO-96 8255 Digital Input / Output Card + 0c78 PCI-DIO-120 8255 Digital Input 
/ Output Card + 0dc8 PCI-IDIO-16 Isolated Digital Input / FET Output Card + 0e50 PCI-DIO-24S 8255 Digital Input / Output Card + 0e51 PCI-DIO-24H(C) 8255 Digital Input / Output Card + 0e52 PCI-DIO-24D(C) 8255 Digital Input / Output Card + 0e53 PCIe-DIO-24S 8255 Digital Input / Output Card + 0e54 PCIe-DIO-24HS 8255 Digital Input / Output Card + 0e55 PCIe-DIO-24DC 8255 Digital Input / Output Card + 0e56 PCIe-DIO-24DCS 8255 Digital Input / Output Card + 0e57 mPCIe-DIO-24S 8255 Digital Input / Output Card + 0e60 PCI-DIO-48S 2x 8255 Digital Input / Output Card + 0e61 PCIe-DIO-48S 2x 8255 Digital Input / Output Card + 0e62 P104-DIO-48S 2x 8255 Digital Input / Output Card + 0f00 PCI-IIRO-8 Isolated Digital / Relay Output Card + 0f01 LPCI-IIRO-8 Isolated Digital / Relay Output Card + 0f02 PCIe-IIRO-8 Isolated Digital / Relay Output Card + 0f08 PCI-IIRO-16 Isolated Digital / Relay Output Card + 0f09 PCIe-IIRO-16 Isolated Digital / Relay Output Card + 0fc0 PCIe-IDIO-12 Isolated Digital Input / FET Output Card + 0fc1 PCIe-IDI-12 Isolated Digital Input Card + 0fc2 PCIe-IDO-12 FET Output Card + 0fd0 PCIe-IDIO-24 Isolated Digital Input / FET Output Card + 1050 PCI-422/485-2 2x RS422/RS484 Card + 1051 PCIe-COM-2SRJ 2x RS422/RS484 Card w/RJ45 Connectors + 1052 104I-COM-2S 2x RS422/RS484 PCI/104 Board + 1053 mPCIe-COM-2S 2x RS422/RS484 PCI Express Mini Card + 1058 PCI-COM422/4 4x RS422 Card + 1059 PCI-COM485/4 4x RS485 Card + 105a PCIe-COM422-4 4x RS422 Card + 105b PCIe-COM485-4 4x RS485 Card + 105c PCIe-COM-4SRJ 4x RS422/RS485 Card w/RJ45 Connectors + 105d 104I-COM-4S 4x RS422/RS484 PCI/104 Board + 105e mPCIe-COM-4S 4x RS422/RS484 PCI Express Mini Card + 1068 PCI-COM422/8 8x RS422 Card + 1069 PCI-COM485/8 8x RS485 Card + 106a PCIe-COM422-8 8x RS422 Card + 106b PCIe-COM485-8 8x RS485 Card + 106c 104I-COM-8S 8x RS422/RS485 PCI/104 Board + 1088 PCI-COM232/1 1x RS232 Card + 1090 PCI-COM232/2 2x RS232 Card + 1091 PCIe-COM232-2RJ 2x RS232 Card w/RJ45 Connectors + 1093 mPCIe-COM232-2 2x 
RS232 PCI Express Mini Card + 1098 PCIe-COM232-4 4x RS232 Card + 1099 PCIe-COM232-4RJ 4x RS232 Card w/RJ45 Connectors + 109b mPCIe-COM232-4 4x RS232 PCI Express Mini Card + 10a8 P104-COM232-8 8x RS232 PC-104+ Board + 10a9 PCIe-COM232-8 8x RS232 Card + 10c9 PCI-COM-1S 1x RS422/RS485 Card + 10d0 PCI-COM2S 2x RS422/RS485 Card + 10d1 PCIe-COM-2SMRJ 2x RS232/RS422/RS485 Card w/RJ45 Connectors + 10d2 104I-COM-2SM 2x RS232/RS422/RS485 PCI/104 Board + 10d3 mPCIe-COM-2SM 2x RS232/RS422/RS485 PCI Express Mini Card + 10d8 PCI-COM-4SM 4x RS232/RS422/RS485 Card + 10d9 PCIe-COM-4SM 4x RS232/RS422/RS485 Card + 10da PCIe-COM-4SMRJ 4x RS232/RS422/RS485 Card w/RJ45 Connectors + 10db 104I-COM-4SM 4x RS232/RS422/RS485 PCI/104 Board + 10dc mPCIe-COM-4SM 4x RS232/RS422/RS485 PCI Express Mini Card + 10e8 PCI-COM-8SM 8x RS232/RS422/RS485 Card + 10e9 PCIe-COM-8SM 8x RS232/RS422/RS485 Card + 10ea 104I-COM-8SM 8x RS232/RS422/RS485 PCI-104 Board + 1108 mPCIe-ICM485-1 1x Isolated RS485 PCI Express Mini Card + 1110 mPCIe-ICM422-2 2x Isolated RS422 PCI Express Mini Card + 1111 mPCIe-ICM485-2 2x Isolated RS485 PCI Express Mini Card + 1118 mPCIe-ICM422-4 4x Isolated RS422 PCI Express Mini Card + 1119 mPCIe-ICM485-4 4x Isolated RS485 PCI Express Mini Card + 1148 PCI-ICM-1S 1x Isolated RS422/RS485 Card + 1150 PCI-ICM-2S 2x Isolated RS422/RS485 Card + 1152 PCIe-ICM-2S 2x Isolated RS422/RS485 Card + 1158 PCI-ICM422/4 4x Isolated RS422 Card + 1159 PCI-ICM485/4 4x Isolated RS485 Card + 115a PCIe-ICM-4S 4x Isolated RS422/RS485 Card + 1190 PCIe-ICM232-2 2x Isolated RS232 Card + 1191 mPCIe-ICM232-2 2x Isolated RS232 PCI Express Mini Card + 1198 PCIe-ICM232-4 4x Isolated RS232 Card + 1199 mPCIe-ICM232-4 4x Isolated RS422 PCI Express Mini Card + 11d0 PCIe-ICM-2SM 2x Isolated RS232/RS422/RS485 Card + 11d8 PCIe-ICM-4SM 4x Isolated RS232/RS422/RS485 Card + 1250 PCI-WDG-2S Watchdog and 2x Serial Card 12d0 PCI-WDG-IMPAC - 22c0 PCI-WDG-CSM - 2c50 PCI-DIO-96CT - 2c58 PCI-DIO-96C3 + 2230 PCI-QUAD-8 8x Quadrature 
Input Card + 2231 PCI-QUAD-4 4x Quadrature Input Card + 22c0 PCI-WDG-CSM Watchdog Card + 25c0 P104-WDG-E Watchdog PC/104+ Board + 2c50 PCI-DIO-96CT 96x Digital Input / Output Card + 2c58 PCI-DIO-96C3 96x Digital Input / Output Card w/3x 8254 Counter Card + 2ee0 PCIe-DIO24S-CTR12 24x Digital Input / Output Card w/4x 8254 Counter Card + 2fc0 P104-WDG-CSM Watchdog PC/104+ Board + 2fc1 P104-WDG-CSMA Advanced Watchdog PC/104+ Board 5ed0 PCI-DAC - 6c90 PCI-DA12-2 - 6c98 PCI-DA12-4 - 6ca0 PCI-DA12-6 - 6ca8 PCI-DA12-8 + 6c90 PCI-DA12-2 2x 12-bit Analog Output Card + 6c98 PCI-DA12-4 4x 12-bit Analog Output Card + 6ca0 PCI-DA12-6 6x 12-bit Analog Output Card + 6ca8 PCI-DA12-8 8x 12-bit Analog Output Card 6ca9 PCI-DA12-8V - 6cb0 PCI-DA12-16 + 6cb0 PCI-DA12-16 16x 12-bit Analog Output Card 6cb1 PCI-DA12-16V 8ef0 P104-FAS16-16 - aca8 PCI-AI12-16 - aca9 PCI-AI12-16A - eca8 PCI-AIO12-16 - eca9 PCI-A12-16 - ecaa PCI-A12-16A - ece8 PCI-A16-16 + aca8 PCI-AI12-16 12-bit 100kHz Analog Input Card + aca9 PCI-AI12-16A 12-bit 100kHz Analog Input w/FIFO Card + eca8 PCI-AIO12-16 12-bit 100kHz Analog Input w/2x Analog Output and FIFO Card + ecaa PCI-A12-16A 12-bit 100kHz Analog Input w/2x Analog Output and FIFO Card + ece8 LPCI-A16-16A 16-bit 500kHz Analog Input low-profile Card + ece9 LPCI-AIO16A 16-bit 500kHz Analog Input low-profile Card 4978 Axil Computer Inc 4a14 NetVin 5000 NV5000SC @@ -19931,6 +20880,10 @@ 5431 AuzenTech, Inc. 544c Teralogic Inc 0350 TL880-based HDTV/ATSC tuner +544d TBS Technologies + 6178 DVB-S2 4 Tuner PCIe Card + 544d 6904 TBS6904 DVB-S2 Quad Tuner PCIe Card + 544d 6905 TBS6905 DVB-S2 Quad Tuner PCIe Card 5452 SCANLAB AG 3443 RTC4 5455 Technische University Berlin @@ -19943,6 +20896,7 @@ 0003 TURBOstor HFP-832 [HiPPI NIC] 5646 Vector Fabrics BV 5654 VoiceTronix Pty Ltd +5678 Dawicontrol Computersysteme GmbH 5700 Netpower 584d AuzenTech Co., Ltd. 
5851 Exacq Technologies @@ -20014,12 +20968,14 @@ 0044 Core Processor DRAM Controller 1025 0347 Aspire 7740G 1025 0487 TravelMate 5742 + 1028 040a Latitude E6410 144d c06a R730 Laptop 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] e4bf 50c1 PC1-GROOVE 0045 Core Processor PCI Express x16 Root Port 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] 0046 Core Processor Integrated Graphics Controller + 1028 040a Latitude E6410 144d c06a R730 Laptop 17c0 10d9 Medion Akoya E7214 Notebook PC [MD98410] e4bf 50c1 PC1-GROOVE @@ -20087,6 +21043,7 @@ 0101 Xeon E3-1200/2nd Generation Core Processor Family PCI Express Root Port 1028 04b2 Vostro 3350 106b 00dc MacBookPro8,2 [Core i7, 15", 2011] + 144d c652 NP300E5C series laptop 0102 2nd Generation Core Processor Family Integrated Graphics Controller 1028 04aa XPS 8300 1043 0102 P8H67 Series Motherboard @@ -20095,6 +21052,7 @@ 1028 04b2 Vostro 3350 1028 04da Vostro 3750 106b 00dc MacBookPro8,2 [Core i7, 15", 2011] + 144d c652 NP300E5C series laptop 0105 Xeon E3-1200/2nd Generation Core Processor Family PCI Express Root Port 106b 00dc MacBookPro8,2 [Core i7, 15", 2011] 0106 2nd Generation Core Processor Family Integrated Graphics Controller @@ -20108,6 +21066,7 @@ 0112 2nd Generation Core Processor Family Integrated Graphics Controller 0116 2nd Generation Core Processor Family Integrated Graphics Controller 1028 04da Vostro 3750 + 144d c652 integrated HD 3000 graphics controller on NP300E5C series laptop 0122 2nd Generation Core Processor Family Integrated Graphics Controller 0126 2nd Generation Core Processor Family Integrated Graphics Controller 1028 04cc Vostro 3350 @@ -20125,13 +21084,16 @@ 0153 3rd Gen Core Processor Thermal Subsystem 1043 1517 Zenbook Prime UX31A 0154 3rd Gen Core processor DRAM Controller + 1025 0806 Aspire E1-470G 1025 0813 Aspire R7-571 103c 17f6 ProBook 4540s + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 0155 Xeon E3-1200 v2/3rd Gen Core processor PCI Express Root Port 
8086 2010 Server Board S1200BTS 0156 3rd Gen Core processor Graphics Controller + 1043 108d VivoBook X202EV 0158 Xeon E3-1200 v2/Ivy Bridge DRAM Controller 1043 844d P8 series motherboard 8086 2010 Server Board S1200BTS @@ -20505,6 +21467,8 @@ 8086 3705 DC P3500 SSD [2.5" SFF] 8086 3709 DC P3600 SSD [Add-in Card] 8086 370a DC P3600 SSD [2.5" SFF] + 8086 370d SSD 750 Series [Add-in Card] + 8086 370e SSD 750 Series [2.5" SFF] 095a Wireless 7265 # Stone Peak 2 AC 8086 1010 Dual Band Wireless-AC 7265 @@ -20842,7 +21806,8 @@ 0f28 Atom Processor Z36xxx/Z37xxx Series LPE Audio Controller 0f31 Atom Processor Z36xxx/Z37xxx Series Graphics & Display 0f34 Atom Processor Z36xxx/Z37xxx Series USB EHCI - 0f35 Atom Processor Z36xxx/Z37xxx Series USB xHCI + 0f35 Atom Processor Z36xxx/Z37xxx, Celeron N2000 Series USB xHCI + 1025 0936 Aspire ES1 0f37 Atom Processor Z36xxx/Z37xxx Series OTG USB Device 0f38 Atom Processor Z36xxx/Z37xxx Series Camera ISP 0f40 Atom Processor Z36xxx/Z37xxx Series LPIO2 DMA Controller @@ -21079,6 +22044,7 @@ 1066 82562 EM/EX/GX - PRO/100 VM (LOM) Ethernet Controller 1067 82562 EM/EX/GX - PRO/100 VM Ethernet Controller 1068 82562ET/EZ/GT/GZ - PRO/100 VE (LOM) Ethernet Controller Mobile + 103c 30d5 530 Laptop 1069 82562EM/EX/GX - PRO/100 VM (LOM) Ethernet Controller Mobile 106a 82562G - PRO/100 VE (LOM) Ethernet Controller 106b 82562G - PRO/100 VE Ethernet Controller Mobile @@ -21196,7 +22162,7 @@ 8086 1199 PRO/1000 GT Quad Port Server Adapter 10b6 82598 10GbE PCI-Express Ethernet Controller 10b9 82572EI Gigabit Ethernet Controller (Copper) - 103c 704a HP 110T PCIe Gigabit Server Adapter + 103c 704a 110T PCIe Gigabit Server Adapter 8086 1083 PRO/1000 PT Desktop Adapter 8086 1093 PRO/1000 PT Desktop Adapter 10ba 80003ES2LAN Gigabit Ethernet Controller (Copper) @@ -21311,6 +22277,7 @@ 1028 1f63 10GbE 2P X520k bNDC 103c 17d2 Ethernet 10Gb 2-port 560M Adapter 103c 18d0 Ethernet 10Gb 2-port 560FLB Adapter + 1059 0111 T4007 10GbE interface 8086 000c Ethernet X520 
10GbE Dual Port KX4-KR Mezz 10f9 82599 10 Gigabit Dual Port Network Connection 10fb 82599ES 10-Gigabit SFI/SFP+ Network Connection @@ -21322,14 +22289,22 @@ 103c 2147 Ethernet 10Gb 1-port 561i Adapter 103c 2159 Ethernet 10Gb 2-port 562i Adapter 108e 7b11 Ethernet Server Adapter X520-2 + 1170 004c 82599 DP 10G Mezzanine Adapter 1734 11a9 10 Gigabit Dual Port Network Connection 17aa 1071 ThinkServer X520-2 AnyFabric + 17aa 4007 82599ES 10-Gigabit SFI/SFP+ Network Connection + 17aa 402b 82599ES 10Gb 2-port Server Adapter X520-DA2 + 17aa 402f FPGA Card XC7VX690T-3FFG1157E + 1bd4 001b 10G SFP+ DP ER102Fi4 Rack Adapter + 1bd4 002f 10G SFP+ DP EP102Fi4A Adapter + 1bd4 0032 10G SFP+ DP EP102Fi4 Adapter 8086 0002 Ethernet Server Adapter X520-DA2 8086 0003 Ethernet Server Adapter X520-2 8086 0006 Ethernet Server Adapter X520-1 8086 0008 Ethernet OCP Server Adapter X520-2 8086 000a Ethernet Server Adapter X520-1 8086 000c Ethernet Server Adapter X520-2 + 8086 10a6 82599ES 10Gb 2 port Server Adapter X520-DA2 8086 7a11 Ethernet Server Adapter X520-2 8086 7a12 Ethernet Server Adapter X520-2 10fc 82599 10 Gigabit Dual Port Network Connection @@ -21339,14 +22314,14 @@ 1025 1016 Travelmate 612 TX 1043 8027 TUSL2-C Mainboard 104d 80df Vaio PCG-FX403 - 8086 4532 D815EEA2 mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 4557 D815EGEW Mainboard 1131 82815 815 Chipset AGP Bridge 1132 82815 Chipset Graphics Controller (CGC) 1025 1016 Travelmate 612 TX 103c 2001 e-pc 40 104d 80df Vaio PCG-FX403 - 8086 4532 D815EEA2 Mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 4541 D815EEA Motherboard 8086 4557 D815EGEW Mainboard 1161 82806AA PCI64 Hub Advanced Programmable Interrupt Controller @@ -21578,6 +22553,7 @@ 150f 82580 Gigabit Fiber Network Connection 1510 82580 Gigabit Backplane Connection 1511 82580 Gigabit SFP Connection + 1513 CV82524 Thunderbolt Controller [Light Ridge 4C 2010] 1514 Ethernet X520 10GbE Dual Port KX4 Mezz 8086 000b Ethernet X520 10GbE Dual Port KX4 Mezz 
1515 X540 Ethernet Controller Virtual Function @@ -21587,6 +22563,8 @@ 1517 82599ES 10 Gigabit Network Connection 1137 006a UCS CNA M61KR-I Intel Converged Network Adapter 1518 82576NS SerDes Gigabit Network Connection + 151a DSL2310 Thunderbolt Controller [Eagle Ridge 2C 2011] + 151b CVL2510 Thunderbolt Controller [Light Peak 2C 2010] 151c 82599 10 Gigabit TN Network Connection 108e 7b13 Dual 10GBASE-T LP 1520 I350 Ethernet Controller Virtual Function @@ -21609,7 +22587,11 @@ 1093 76b1 PCIe-8237R-S Ethernet Adapter 1093 775b PCIe-8237 Ethernet Adapter 10a9 802a UV2-BaseIO dual-port GbE + 15d9 0652 Dual Port i350 GbE MicroLP [AOC-CGP-i2] 17aa 1074 ThinkServer I350-T4 AnyFabric + 17aa 4005 I350 Gigabit Network Connection + 1bd4 001d 1G base-T QP EP014Ti1 Adapter + 1bd4 0035 1G base-T QP EP014Ti1 Adapter 8086 0001 Ethernet Server Adapter I350-T4 8086 0002 Ethernet Server Adapter I350-T2 8086 00a1 Ethernet Server Adapter I350-T4 @@ -21652,6 +22634,10 @@ 108e 7b15 Sun Dual Port 10 GbE PCIe 2.0 Low Profile Adapter, Base-T 1137 00bf Ethernet Converged Network Adapter X540-T2 17aa 1073 ThinkServer X540-T2 AnyFabric + 17aa 4006 Ethernet Controller 10-Gigabit X540-AT2 + 1bd4 001a 10G base-T DP ER102Ti3 Rack Adapter + 1bd4 0033 10G base-T DP EP102Ti3 Adapter + 1bd4 0034 10G base-T DP EP102Ti3A Adapter 8086 0001 Ethernet Converged Network Adapter X540-T2 8086 0002 Ethernet Converged Network Adapter X540-T1 8086 001a Ethernet Converged Network Adapter X540-T2 @@ -21671,14 +22657,18 @@ 8086 0002 Ethernet Server Adapter I210-T1 1536 I210 Gigabit Fiber Network Connection 1537 I210 Gigabit Backplane Connection + 1059 0110 T4005 1GbE interface + 1059 0111 T4007 1GbE interface + 1059 0120 T4008 1GbE interface 1538 I210 Gigabit Network Connection 1539 I211 Gigabit Network Connection 153a Ethernet Connection I217-LM 103c 1909 ZBook 15 17aa 220e ThinkPad T440p 153b Ethernet Connection I217-V - 1547 DSL3510 Thunderbolt Port [Cactus Ridge] - 1549 DSL3510 Thunderbolt Controller [Cactus 
Ridge] + 1547 DSL3510 Thunderbolt Controller [Cactus Ridge 4C 2012] + 1548 DSL3310 Thunderbolt Controller [Cactus Ridge 2C 2012] + 1549 DSL2210 Thunderbolt Controller [Port Ridge 1C 2011] 154a Ethernet Server Adapter X520-4 8086 011a Ethernet Converged Network Adapter X520-4 8086 011b Ethernet Converged Network Adapter X520-4 @@ -21687,6 +22677,9 @@ 154d Ethernet 10G 2P X520 Adapter 8086 7b11 10GbE 2P X520 Adapter 1557 82599 10 Gigabit Network Connection + 17aa 4008 82599EN 10 Gigabit Network Connection + 1bd4 001c 10G SFP+ SP ER101Fi4 Rack Adapter + 1bd4 0030 10G SFP+ SP EP101Fi4A Adapter 8086 0001 Ethernet OCP Server Adapter X520-1 1558 Ethernet Converged Network Adapter X520-Q1 8086 011a Ethernet Converged Network Adapter X520-Q1 @@ -21700,15 +22693,37 @@ 8086 0001 Ethernet Server Bypass Adapter X520-SR2 8086 0002 Ethernet Server Bypass Adapter X520-LR2 1560 Ethernet Controller X540 + 1563 Ethernet Controller 10G X550T + 1028 1fa8 Ethernet 10G 4P X550/I350 rNDC + 1028 1fa9 Ethernet 10G 4P X550 rNDC + 1590 00d1 Ethernet 10Gb 2-port 562T Adapter + 1590 00d2 Ethernet 10Gb 2-port 562FLR-T Adapter + 8086 0001 Ethernet Converged Network Adapter X550-T2 + 8086 001a Ethernet Converged Network Adapter X550-T2 + 8086 0022 Ethernet Converged Network Adapter X550-T2 + 1565 X550 Virtual Function + 1566 DSL4410 Thunderbolt NHI [Redwood Ridge 2C 2013] + 1567 DSL4410 Thunderbolt Bridge [Redwood Ridge 2C 2013] + 1568 DSL4510 Thunderbolt NHI [Redwood Ridge 4C 2013] + 1569 DSL4510 Thunderbolt Bridge [Redwood Ridge 4C 2013] + 156a DSL5320 Thunderbolt 2 NHI [Falcon Ridge 2C 2013] + 156b DSL5320 Thunderbolt 2 Bridge [Falcon Ridge 2C 2013] + 156c DSL5520 Thunderbolt 2 NHI [Falcon Ridge 4C 2013] + 156d DSL5520 Thunderbolt 2 Bridge [Falcon Ridge 4C 2013] 156f Ethernet Connection I219-LM 1570 Ethernet Connection I219-V 1571 XL710/X710 Virtual Function 1572 Ethernet Controller X710 for 10GbE SFP+ + 1028 0000 Ethernet 10G X710 rNDC 1028 1f99 Ethernet 10G 4P X710/I350 rNDC + 1028 1f9c 
Ethernet 10G 4P X710 SFP+ rNDC + 103c 0000 Ethernet 10Gb 562SFP+ Adapter 103c 22fc HP Ethernet 10Gb 2-port 562FLR-SFP+ Adapter 103c 22fd HP Ethernet 10Gb 2-port 562SFP+ Adapter 1137 0000 Ethernet Converged NIC X710-4 1137 013b Ethernet Converged NIC X710-4 + 1590 0000 Ethernet 10GbE 4P 563SFP+ Adapter + 1590 0225 Ethernet 10GbE 4P 563SFP+ Adapter 17aa 0000 ThinkServer X710 AnyFabric for 10GbE SFP+ 17aa 4001 ThinkServer X710-4 AnyFabric for 10GbE SFP+ 17aa 4002 ThinkServer X710-2 AnyFabric for 10GbE SFP+ @@ -21722,31 +22737,46 @@ 8086 0008 Ethernet Converged Network Adapter X710-2 8086 0009 Ethernet Controller X710 for 10GbE SFP+ 8086 000a Ethernet Controller X710 for 10GbE SFP+ + 8086 000b Ethernet Server Adapter X710-DA2 for OCP 8086 000d Ethernet Controller X710 for 10GbE SFP+ + 8086 0010 Ethernet Converged Network Adapter X710 8086 4005 Ethernet Controller XL710 for 10 Gigabit SFP+ 8086 4006 Ethernet Controller X710 for 10GbE SFP+ + 1575 DSL6340 Thunderbolt 3 NHI [Alpine Ridge 2C 2015] + 1576 DSL6340 Thunderbolt 3 Bridge [Alpine Ridge 2C 2015] + 1577 DSL6540 Thunderbolt 3 NHI [Alpine Ridge 4C 2015] + 1578 DSL6540 Thunderbolt 3 Bridge [Alpine Ridge 4C 2015] 157b I210 Gigabit Network Connection 157c I210 Gigabit Backplane Connection + 157d DSL5110 Thunderbolt 2 NHI (Low Power) [Win Ridge 2C 2014] + 157e DSL5110 Thunderbolt 2 Bridge (Low Power) [Win Ridge 2C 2014] 1580 Ethernet Controller XL710 for 40GbE backplane 1581 Ethernet Controller X710 for 10GbE backplane + 1028 0000 Ethernet 10G X710-k bNDC 1028 1f98 Ethernet 10G 4P X710-k bNDC + 1028 1f9e Ethernet 10G 2P X710-k bNDC + 1590 0000 Ethernet 2-port 563i Adapter + 1590 00f8 Ethernet 2-port 563i Adapter + 8086 0000 Ethernet Converged Network Adapter XL710-Q2 1583 Ethernet Controller XL710 for 40GbE QSFP+ 1028 0000 Ethernet 40G 2P XL710 QSFP+ rNDC 1028 1f9f Ethernet 40G 2P XL710 QSFP+ rNDC - 108e 0000 Oracle 10 Gb and 40 Gb Ethernet Adapter - 108e 7b1b Oracle 10 Gb and 40 Gb Ethernet Adapter - 1137 0000 Ethernet 
Converged NIC XL710-Q2 - 1137 013c Ethernet Converged NIC XL710-Q2 + 108e 0000 10 Gb/40 Gb Ethernet Adapter + 108e 7b1b 10 Gb/40 Gb Ethernet Adapter + 1137 0000 Ethernet Converged NIC XL710-QDA2 + 1137 013c Ethernet Converged NIC XL710-QDA2 8086 0000 Ethernet Converged Network Adapter XL710-Q2 8086 0001 Ethernet Converged Network Adapter XL710-Q2 8086 0002 Ethernet Converged Network Adapter XL710-Q2 8086 0003 Ethernet I/O Module XL710-Q2 + 8086 0004 Ethernet Server Adapter XL710-Q2OCP 8086 0006 Ethernet Converged Network Adapter XL710-Q2 1584 Ethernet Controller XL710 for 40GbE QSFP+ 8086 0000 Ethernet Converged Network Adapter XL710-Q1 8086 0001 Ethernet Converged Network Adapter XL710-Q1 8086 0002 Ethernet Converged Network Adapter XL710-Q1 8086 0003 Ethernet I/O Module XL710-Q1 + 8086 0004 Ethernet Server Adapter XL710-Q1OCP 1585 Ethernet Controller X710 for 10GbE QSFP+ 1586 Ethernet Controller X710 for 10GBASE-T 108e 0000 Ethernet Controller X710 for 10GBASE-T @@ -21754,14 +22784,27 @@ 1587 Ethernet Controller XL710 for 20GbE backplane 103c 0000 HP Flex-20 20Gb 2-port 660FLB Adapter 103c 22fe HP Flex-20 20Gb 2-port 660FLB Adapter - 103c 22ff HP Flex-20 20Gb 2-port 660M Adapter 1588 Ethernet Controller XL710 for 20GbE backplane 103c 0000 HP Flex-20 20Gb 2-port 660M Adapter 103c 22ff HP Flex-20 20Gb 2-port 660M Adapter 1589 Ethernet Controller X710/X557-AT 10GBASE-T + 108e 0000 Quad Port 10GBase-T Adapter + 108e 7b1c Quad Port 10GBase-T Adapter 8086 0000 Ethernet Converged Network Adapter X710-T 8086 0001 Ethernet Converged Network Adapter X710-T4 8086 0002 Ethernet Converged Network Adapter X710-T4 + 8086 1003 Ethernet Converged Network Adapter X710-T + 158a Ethernet Controller XXV710 for 25GbE backplane + 158b Ethernet Controller XXV710 for 25GbE SFP28 + 8086 0000 Ethernet Network Adapter XXV710 + 8086 0001 Ethernet Network Adapter XXV710-2 + 8086 0002 Ethernet Network Adapter XXV710-2 + 8086 0003 Ethernet Network Adapter XXV710-1 + 8086 0004 Ethernet Network 
Adapter XXV710-1 + 8086 0005 Ethernet Network Adapter OCP XXV710-2 + 8086 0006 Ethernet Network Adapter OCP XXV710-2 + 8086 0007 Ethernet Network Adapter OCP XXV710-1 + 8086 0008 Ethernet Network Adapter OCP XXV710-1 15a0 Ethernet Connection (2) I218-LM 15a1 Ethernet Connection (2) I218-V 15a2 Ethernet Connection (3) I218-LM @@ -21770,18 +22813,40 @@ 15a5 Ethernet Switch FM10000 Host Virtual Interface 15a8 Ethernet Connection X552 Virtual Function 15aa Ethernet Connection X552 10 GbE Backplane + 1059 0120 T4008 10GbE interface 15ab Ethernet Connection X552 10 GbE Backplane 15ac Ethernet Connection X552 10 GbE SFP+ 15ad Ethernet Connection X552/X557-AT 10GBASE-T + 15ae Ethernet Connection X552 1000BASE-T + 15b5 DSL6340 USB 3.1 Controller [Alpine Ridge] + 15b6 DSL6540 USB 3.1 Controller [Alpine Ridge] 15b7 Ethernet Connection (2) I219-LM 15b8 Ethernet Connection (2) I219-V + 15b9 Ethernet Connection (3) I219-LM + 15bf JHL6240 Thunderbolt 3 NHI (Low Power) [Alpine Ridge LP 2016] + 15c0 JHL6240 Thunderbolt 3 Bridge (Low Power) [Alpine Ridge LP 2016] + 15d0 Ethernet SDI Adapter FM10420-100GbE-QDA2 + 15d1 Ethernet Controller 10G X550T + 8086 0002 Ethernet Converged Network Adapter X550-T1 + 8086 0021 Ethernet Converged Network Adapter X550-T1 + 8086 00a2 Ethernet Converged Network Adapter X550-T1 + 15d2 JHL6540 Thunderbolt 3 NHI (C step) [Alpine Ridge 4C 2016] + 15d3 JHL6540 Thunderbolt 3 Bridge (C step) [Alpine Ridge 4C 2016] + 15d5 Ethernet SDI Adapter FM10420-25GbE-DA2 + 8086 0001 Intel(R) Ethernet SDI Adapter FM10420-25GbE-DA2 + 15d6 Ethernet Connection (5) I219-V + 15d7 Ethernet Connection (4) I219-LM + 15d8 Ethernet Connection (4) I219-V + 15d9 JHL6340 Thunderbolt 3 NHI (C step) [Alpine Ridge 2C 2016] + 15da JHL6340 Thunderbolt 3 Bridge (C step) [Alpine Ridge 2C 2016] + 15e3 Ethernet Connection (5) I219-LM 1600 Broadwell-U Host Bridge -OPI 1601 Broadwell-U PCI Express x16 Controller 1602 Broadwell-U Integrated Graphics - 1603 Broadwell-U Camarillo Device + 1603 
Broadwell-U Processor Thermal Subsystem 1604 Broadwell-U Host Bridge -OPI 1605 Broadwell-U PCI Express x8 Controller - 1606 Broadwell-U Integrated Graphics + 1606 HD Graphics 1607 Broadwell-U CHAPS Device 1608 Broadwell-U Host Bridge -OPI 1609 Broadwell-U x4 PCIe @@ -21792,19 +22857,19 @@ 160e Broadwell-U Integrated Graphics 160f Broadwell-U SoftSKU 1610 Broadwell-U Host Bridge - DMI - 1612 Broadwell-U Integrated Graphics + 1612 HD Graphics 5600 1614 Broadwell-U Host Bridge - DMI - 1616 Broadwell-U Integrated Graphics + 1616 HD Graphics 5500 103c 2216 ZBook 15u G2 Mobile Workstation 1618 Broadwell-U Host Bridge - DMI 161a Broadwell-U Integrated Graphics 161b Broadwell-U Integrated Graphics 161d Broadwell-U Integrated Graphics - 161e Broadwell-U Integrated Graphics - 1622 Broadwell-U Integrated Graphics - 1626 Broadwell-U Integrated Graphics - 162a Broadwell-U Integrated Graphics - 162b Broadwell-U Integrated Graphics + 161e HD Graphics 5300 + 1622 Iris Pro Graphics 6200 + 1626 HD Graphics 6000 + 162a Iris Pro Graphics P6300 + 162b Iris Graphics 6100 162d Broadwell-U Integrated Graphics 162e Broadwell-U Integrated Graphics 1632 Broadwell-U Integrated Graphics @@ -21813,25 +22878,40 @@ 163b Broadwell-U Integrated Graphics 163d Broadwell-U Integrated Graphics 163e Broadwell-U Integrated Graphics - 1900 Sky Lake Host Bridge/DRAM Registers - 1901 Sky Lake PCIe Controller (x16) - 1904 Sky Lake Host Bridge/DRAM Registers - 1905 Sky Lake PCIe Controller (x8) - 1908 Sky Lake Host Bridge/DRAM Registers - 1909 Sky Lake PCIe Controller (x4) - 190c Sky Lake Host Bridge/DRAM Registers - 190f Sky Lake Host Bridge/DRAM Registers - 1910 Sky Lake Host Bridge/DRAM Registers - 1911 Sky Lake Gaussian Mixture Model - 1912 Sky Lake Integrated Graphics - 1916 Sky Lake Integrated Graphics - 1918 Sky Lake Host Bridge/DRAM Registers - 1919 Sky Lake Imaging Unit - 191e Sky Lake Integrated Graphics - 191f Sky Lake Host Bridge/DRAM Registers - 1926 Sky Lake Integrated Graphics - 1932 Sky Lake 
Integrated Graphics - 193b Sky Lake Integrated Graphics + 1900 Skylake Host Bridge/DRAM Registers + 1901 Skylake PCIe Controller (x16) + 1902 HD Graphics 510 + 1903 Skylake Processor Thermal Subsystem + 1904 Skylake Host Bridge/DRAM Registers + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 1905 Skylake PCIe Controller (x8) + 1906 HD Graphics 510 + 17aa 382a B51-80 Laptop + 1908 Skylake Host Bridge/DRAM Registers + 1909 Skylake PCIe Controller (x4) + 190c Skylake Host Bridge/DRAM Registers + 190f Skylake Host Bridge/DRAM Registers + 1910 Skylake Host Bridge/DRAM Registers + 1911 Skylake Gaussian Mixture Model + 1912 HD Graphics 530 + 1916 HD Graphics 520 + 1028 06f3 Latitude 3570 + 1918 Skylake Host Bridge/DRAM Registers + 1919 Skylake Imaging Unit + 191b HD Graphics 530 + 191d HD Graphics P530 + 191e HD Graphics 515 + 191f Skylake Host Bridge/DRAM Registers + 1921 HD Graphics 520 + 1926 Iris Graphics 540 + 1927 Iris Graphics 550 + 192b Iris Graphics 555 + 192d Iris Graphics P555 + 1932 Iris Pro Graphics 580 + 193a Iris Pro Graphics P580 + 193b Iris Pro Graphics 580 + 193d Iris Pro Graphics P580 1960 80960RP (i960RP) Microprocessor 101e 0431 MegaRAID 431 RAID Controller 101e 0438 MegaRAID 438 Ultra2 LVD RAID Controller @@ -21859,6 +22939,7 @@ e4bf 3100 CX1-BAND 1962 80960RM (i960RM) Microprocessor 105a 0000 SuperTrak SX6000 I2O CPU + 19df DNV SMBus controller 1a21 82840 840 [Carmel] Chipset Host Bridge (Hub A) 1a23 82840 840 [Carmel] Chipset AGP Bridge 1a24 82840 840 [Carmel] Chipset PCI Bridge (Hub B) @@ -22071,30 +23152,39 @@ 1d76 C600/X79 series chipset Multi-Function Glue 1e00 7 Series/C210 Series Chipset Family 4-port SATA Controller [IDE mode] 1e01 7 Series Chipset Family 4-port SATA Controller [IDE mode] + 144d c652 NP300E5C series laptop 1e02 7 Series/C210 Series Chipset Family 6-port SATA Controller [AHCI mode] 1043 84ca P8 series motherboard 1849 1e02 Motherboard 1e03 7 Series Chipset Family 6-port SATA Controller [AHCI mode] + 1043 108d VivoBook 
X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A + 144d c652 NP300E5C series laptop 1e04 7 Series/C210 Series Chipset Family SATA Controller [RAID mode] 1e05 7 Series Chipset SATA Controller [RAID mode] 1e06 7 Series/C210 Series Chipset Family SATA Controller [RAID mode] 1e07 7 Series Chipset Family SATA Controller [RAID mode] 1e08 7 Series/C210 Series Chipset Family 2-port SATA Controller [IDE mode] 1e09 7 Series Chipset Family 2-port SATA Controller [IDE mode] + 144d c652 NP300E5C series laptop 1e0e 7 Series/C210 Series Chipset Family SATA Controller [RAID mode] 1e10 7 Series/C210 Series Chipset Family PCI Express Root Port 1 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8H77-I Motherboard + 144d c652 NP300E5C series laptop 1849 1e10 Motherboard 1e12 7 Series/C210 Series Chipset Family PCI Express Root Port 2 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1e14 7 Series/C210 Series Chipset Family PCI Express Root Port 3 1e16 7 Series/C210 Series Chipset Family PCI Express Root Port 4 + 1043 108d VivoBook X202EV 1043 1477 N56VZ + 144d c652 NP300E5C series laptop 1849 1618 Z77 Extreme4 motherboard 1e18 7 Series/C210 Series Chipset Family PCI Express Root Port 5 1043 84ca P8H77-I Motherboard @@ -22105,40 +23195,52 @@ 1e1e 7 Series/C210 Series Chipset Family PCI Express Root Port 8 1849 1e1e Motherboard 1e20 7 Series/C210 Series Chipset Family High Definition Audio Controller + 1028 054b Dell XPS One 2710 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 8415 P8H77-I Motherboard 1043 8445 ASUS P8Z77-V LX Motherboard + 144d c652 NP300E5C series laptop 1849 1898 Z77 Extreme4 motherboard 1e22 7 Series/C210 Series Chipset Family SMBus Controller + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8 series motherboard + 144d c652 NP300E5C series laptop 1849 1e22 Motherboard 1e24 7 Series/C210 Series Chipset Family Thermal Management Controller 
1043 1517 Zenbook Prime UX31A 1e25 7 Series/C210 Series Chipset Family DMI to PCI Bridge 1e26 7 Series/C210 Series Chipset Family USB Enhanced Host Controller #1 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8 series motherboard + 144d c652 NP300E5C series laptop 1849 1e26 Motherboard 1e2d 7 Series/C210 Series Chipset Family USB Enhanced Host Controller #2 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8 series motherboard + 144d c652 NP300E5C series laptop 1849 1e2d Motherboard 1e31 7 Series/C210 Series Chipset Family USB xHCI Host Controller 103c 17ab ProBook 6570b + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8 series motherboard 1849 1e31 Motherboard 1e33 7 Series/C210 Series Chipset Family LAN Controller 1e3a 7 Series/C210 Series Chipset Family MEI Controller #1 + 1043 108d VivoBook X202EV 1043 1477 N56VZ 1043 1517 Zenbook Prime UX31A 1043 84ca P8 series motherboard + 144d c652 NP300E5C series laptop 1849 1e3a Motherboard 1e3b 7 Series/C210 Series Chipset Family MEI Controller #2 1e3c 7 Series/C210 Series Chipset Family IDE-r Controller @@ -22177,7 +23279,9 @@ 1e5b UM77 Express Chipset LPC Controller 1e5c 7 Series Chipset Family LPC Controller 1e5d HM75 Express Chipset LPC Controller + 144d c652 NP300E5C series laptop 1e5e 7 Series Chipset Family LPC Controller + 1043 108d VivoBook X202EV 1e5f 7 Series Chipset Family LPC Controller 1f00 Atom processor C2000 SoC Transaction Router 1f01 Atom processor C2000 SoC Transaction Router @@ -22276,6 +23380,37 @@ 225c Xeon Phi coprocessor SE10/7120 series 225d Xeon Phi coprocessor 3120 series 225e Xeon Phi coprocessor 31S1 + 2280 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series SoC Transaction Register + 2284 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series High Definition Audio Controller + 2286 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO1 DMA Controller + 228a 
Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO1 HSUART Controller #1 + 228c Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO1 HSUART Controller #2 + 2292 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx SMBus Controller + 2294 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series MMC Controller + 2295 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series SDIO Controller + 2296 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series SD Controller + 2298 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series Trusted Execution Engine + 229c Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCU + 22a3 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series SATA Controller + 22a4 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series SATA AHCI Controller + 22a8 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series Low Power Engine Audio + 22b0 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCI Configuration Registers + 22b1 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Integrated Graphics Controller + 22b5 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series USB xHCI Controller + 22b8 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series Imaging Unit + 22c0 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 DMA Controller + 22c1 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #1 + 22c2 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #2 + 22c3 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #3 + 22c4 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #4 + 22c5 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #5 + 22c6 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #6 + 22c7 Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series LPIO2 I2C Controller #7 + 22c8 
Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCI Express Port #1 + 22ca Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCI Express Port #2 + 22cc Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCI Express Port #3 + 22ce Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series PCI Express Port #4 + 22dc Atom/Celeron/Pentium Processor x5-E8000/J3xxx/N3xxx Series Power Management Controller 2310 DH89xxCC LPC Controller 2323 DH89xxCC 4 Port SATA AHCI Controller 2330 DH89xxCC SMBus Controller @@ -22351,7 +23486,7 @@ 104d 80df Vaio PCG-FX403 147b 0505 BL7 motherboard 147b 0507 TH7II-RAID - 8086 4532 D815EEA2 mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 4557 D815EGEW Mainboard 8086 5744 S845WD1-E mainboard 2443 82801BA/BAM SMBus Controller @@ -22366,7 +23501,7 @@ 147b 0505 BL7 motherboard 147b 0507 TH7II-RAID 15d9 3280 Supermicro P4SBE Mainboard - 8086 4532 D815EEA2 mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 4557 D815EGEW Mainboard 8086 5744 S845WD1-E mainboard 2444 82801BA/BAM UHCI USB 1.1 Controller #2 @@ -22379,7 +23514,7 @@ 104d 80df Vaio PCG-FX403 147b 0505 BL7 motherboard 147b 0507 TH7II-RAID - 8086 4532 D815EEA2 mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 5744 S845WD1-E mainboard 2445 82801BA/BAM AC'97 Audio Controller 0e11 000b Compaq Deskpro EN Audio @@ -22393,12 +23528,14 @@ 147b 0505 BL7 motherboard 147b 0507 TH7II-RAID 8086 4557 D815EGEW Mainboard + 8086 4656 Desktop Board D815EFV 2446 82801BA/BAM AC'97 Modem Controller 1025 1016 Travelmate 612 TX 104d 80df Vaio PCG-FX403 2448 82801 Mobile PCI Bridge + 1028 040a Latitude E6410 1028 040b Latitude E6510 - 103c 0934 HP Compaq nw8240 Mobile Workstation + 103c 0934 Compaq nw8240 Mobile Workstation 103c 099c NX6110/NC6120 103c 309f Compaq nx9420 Notebook 103c 30a3 Compaq nw8440 @@ -22411,6 +23548,7 @@ 144d c072 Notebook N150P 1458 5000 GA-D525TUD 1734 1055 Amilo M1420 + 17aa 2013 ThinkPad R60e 17aa 20ae ThinkPad T61/R61 17c0 10d2 
Medion Akoya E7214 Notebook PC [MD98410] 17c0 4083 Medion WIM 2210 Notebook PC [MD96850] @@ -22461,7 +23599,7 @@ 147b 0505 BL7 motherboard 147b 0507 TH7II-RAID 15d9 3280 Supermicro P4SBE Mainboard - 8086 4532 D815EEA2 mainboard + 8086 4532 Desktop Board D815EEA2/D815EFV 8086 4557 D815EGEW Mainboard 8086 5744 S845WD1-E mainboard 244c 82801BAM ISA Bridge (LPC) @@ -22555,6 +23693,7 @@ 1025 005a TravelMate 290 1025 0064 Extensa 3000 series laptop: Intel 82801DBM (ICH4-M) 1028 0126 Optiplex GX260 + 1028 0160 Dimension 2400 1028 0163 Latitude D505 1028 018d Inspiron 700m/710m 1028 0196 Inspiron 5160 @@ -22582,6 +23721,7 @@ 1025 0064 Extensa 3000 series laptop: Intel 82801DBM (ICH4-M) 1028 0126 Optiplex GX260 1028 014f Latitude X300 + 1028 0160 Dimension 2400 1028 018d Inspiron 700m/710m 103c 088c NC8000 laptop 103c 0890 NC6000 laptop @@ -22603,6 +23743,7 @@ 1025 005a TravelMate 290 1025 0064 Extensa 3000 series laptop: Intel 82801DBM (ICH4-M) 1028 0126 Optiplex GX260 + 1028 0160 Dimension 2400 1028 0163 Latitude D505 1028 018d Inspiron 700m/710m 1028 0196 Inspiron 5160 @@ -22630,6 +23771,7 @@ 1028 0139 Latitude D400 1028 014f Latitude X300 1028 0152 Latitude D500 + 1028 0160 Dimension 2400 1028 0163 Latitude D505 1028 018d Inspiron 700m/710m [SigmaTel STAC9750,51] 1028 0196 Inspiron 5160 @@ -22646,7 +23788,6 @@ 1462 5800 845PE Max (MS-6580) 1734 1005 D1451 (SCENIC N300, i845GV) Sigmatel STAC9750T 1734 1055 Amilo M1420 - 8086 24c5 Dell Dimension 2400 24c6 82801DB/DBL/DBM (ICH4/ICH4-L/ICH4-M) AC'97 Modem Controller 1014 0524 ThinkPad T4x Series 1014 0525 ThinkPad @@ -22672,6 +23813,7 @@ 1025 005a TravelMate 290 1025 0064 Extensa 3000 series laptop: Intel 82801DBM (ICH4-M) 1028 0126 Optiplex GX260 + 1028 0160 Dimension 2400 1028 0163 Latitude D505 1028 018d Inspiron 700m/710m 1028 0196 Inspiron 5160 @@ -22708,6 +23850,7 @@ 24cb 82801DB (ICH4) IDE Controller 1014 0267 NetVista A30p 1028 0126 Optiplex GX260 + 1028 0160 Dimension 2400 1043 8089 P4B533 114a 0582 PC8 onboard IDE 
1458 24c2 GA-8PE667 Ultra @@ -22728,6 +23871,7 @@ 1028 0126 Optiplex GX260 1028 0139 Latitude D400 1028 0152 Latitude D500 + 1028 0160 Dimension 2400 1028 0163 Latitude D505 1028 018d Inspiron 700m/710m 1028 0196 Inspiron 5160 @@ -22935,6 +24079,20 @@ 24df 82801ER (ICH5R) SATA Controller 1028 0168 Precision Workstation 670 Mainboard 24f0 Omni-Path HFI Silicon 100 Series [discrete] + 10a9 802e Omni-path HFI 100 Series, 1-port A-board + 10a9 802f Omni-path HFI 100 Series, 2-port A-board + 10a9 8030 Omni-path HFI 100 Series, 1-port B-board + 10a9 8031 Omni-path HFI 100 Series, 2-port B-board + 1590 00e7 100Gb 1-port OP101 QSFP28 x8 PCIe Gen3 with Intel Omni-Path Adapter + 1590 00e8 100Gb 1-port OP101 QSFP28 x16 PCIe Gen3 with Intel Omni-Path Adapter + 1590 021c Apollo 100Gb 1-port Intel Omni-Path Architecture 860z Mezzanine FIO Adapter + 15d9 0934 Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x16, SIOM Module + 1cb8 0001 Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x16, TC4600 QSFP28 + 1cb8 0002 Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x16, TC6600 Fixed Port + 8086 2628 Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x16 + 8086 2629 Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x8 + 8086 262a Omni-Path HFI Adapter 100 Series, 2 Ports, Split PCIe x16 + 8086 262d Omni-Path HFI Adapter 100 Series, 1 Port, PCIe x16, IO Module AHWKPTP100HF 24f1 Omni-Path HFI Silicon 100 Series [integrated] 24f3 Wireless 8260 # Snow Field Peak AC @@ -22989,6 +24147,7 @@ 2562 82845G/GL[Brookdale-G]/GE Chipset Integrated Graphics Device 0e11 00b9 Evo D510 SFF 1014 0267 NetVista A30p + 1028 0160 Dimension 2400 1734 1003 D1521 Mainboard (Fujitsu-Siemens) 1734 1004 D1451 Mainboard (SCENIC N300, i845GV) 2570 82865G/PE/P DRAM Controller/Host-Hub Interface @@ -23033,7 +24192,7 @@ 2589 E7220/E7221 PCI Express Root Port 258a E7221 Integrated Graphics Controller 2590 Mobile 915GM/PM/GMS/910GML Express Processor to DRAM Controller - 1014 0575 ThinkPad Z60t + 1014 0575 ThinkPad X41 / Z60t 1028 
0182 Dell Latitude C610 103c 0934 Compaq nw8240/nx8220 103c 099c NX6110/NC6120 @@ -23043,8 +24202,9 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 2591 Mobile 915GM/PM Express PCI Express Root Port - 103c 0934 HP Compaq nw8240 Mobile Workstation + 103c 0934 Compaq nw8240 Mobile Workstation 2592 Mobile 915GM/GMS/910GML Express Graphics Controller + 1014 0582 ThinkPad X41 103c 099c NX6110/NC6120 103c 308a NC6220 1043 1881 GMA 900 915GM Integrated Graphics @@ -23217,6 +24377,7 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 2641 82801FBM (ICH6M) LPC Interface Bridge + 1014 0568 ThinkPad X41 103c 0934 Compaq nw8240/nx8220 103c 099c NX6110/NC6120 2642 82801FW/FRW (ICH6W/ICH6RW) LPC Interface Bridge @@ -23232,7 +24393,9 @@ 1028 0177 Dimension 8400 1462 7028 915P/G Neo2 2653 82801FBM (ICH6M) SATA Controller + 1014 056a ThinkPad X41 2658 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB UHCI #1 + 1014 0565 ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 103c 0934 Compaq nw8240/nx8220 @@ -23245,6 +24408,7 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 2659 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB UHCI #2 + 1014 0565 ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 103c 0934 Compaq nw8240/nx8220 @@ -23257,6 +24421,7 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 265a 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB UHCI #3 + 1014 0565 ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 103c 0934 Compaq nw8240/nx8220 @@ -23269,6 +24434,7 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 265b 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB UHCI #4 + 1014 0565 ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 103c 099c NX6110/NC6120 @@ -23280,6 +24446,7 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 265c 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB2 EHCI Controller + 1014 0566 ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 103c 0934 Compaq nw8240/nx8220 @@ -23293,13 +24460,13 @@ e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 2660 82801FB/FBM/FR/FW/FRW (ICH6 Family) PCI Express Port 1 - 103c 0934 HP Compaq nw8240 Mobile 
Workstation + 103c 0934 Compaq nw8240 Mobile Workstation 103c 099c NX6110/NC6120 e4bf 0ccd CCD-CALYPSO e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 2662 82801FB/FBM/FR/FW/FRW (ICH6 Family) PCI Express Port 2 - 103c 0934 HP Compaq nw8240 Mobile Workstation + 103c 0934 Compaq nw8240 Mobile Workstation e4bf 0ccd CCD-CALYPSO e4bf 0cd3 CD3-JIVE e4bf 58b1 XB1 @@ -23320,6 +24487,7 @@ 1462 7028 915P/G Neo2 1af4 1100 QEMU Virtual Machine 266a 82801FB/FBM/FR/FW/FRW (ICH6 Family) SMBus Controller + 1014 056b ThinkPad X41 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 1043 80a6 P5GD1-VW Mainboard @@ -23335,6 +24503,7 @@ 103c 0934 Compaq nw8240/nx8220 103c 099c NX6110/NC6120 266e 82801FB/FBM/FR/FW/FRW (ICH6 Family) AC'97 Audio Controller + 1014 0581 ThinkPad X41 (Analog Devices AD1981B codec) 1025 006a Realtek ALC 655 codec (in Acer TravelMate 2410 serie laptop) 1028 0177 Dimension 8400 1028 0179 Optiplex GX280 @@ -23451,6 +24620,7 @@ 1043 2582 P5GD1-VW Mainboard 1734 105b Scenic W620 2792 Mobile 915GM/GMS/910GML Express Graphics Controller + 1014 0582 ThinkPad X41 103c 099c NX6110/NC6120 1043 1881 GMA 900 915GM Integrated Graphics e4bf 0ccd CCD-CALYPSO @@ -23462,6 +24632,7 @@ 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 1043 1237 A6J-Q008 1071 8209 Medion MIM 2240 Notebook PC [MD98100] 17aa 2015 ThinkPad T60 @@ -23472,10 +24643,12 @@ 1071 8209 Medion MIM 2240 Notebook PC [MD98100] 27a2 Mobile 945GM/GMS, 943/940GML Express Integrated Graphics Controller 103c 30a1 NC2400 + 103c 30d5 530 Laptop 17aa 201a ThinkPad R60/T60/X60 series 9902 1584 CCE MPL-D10H120F 27a6 Mobile 945GM/GMS/GME, 943/940GML Express Integrated Graphics Controller 103c 30a1 NC2400 + 103c 30d5 530 Laptop 1775 11cc CC11/CL11 integrated graphics (secondary) 17aa 201a ThinkPad R60/T60/X60 series 27ac Mobile 945GSE Express Memory Controller Hub @@ -23488,6 +24661,7 @@ 8086 544e DeskTop Board D945GTP 27b8 82801GB/GR (ICH7 Family) LPC Interface Bridge 1028 01e6 PowerEdge 860 
+ 103c 2a8c Compaq 500B Microtower 1043 8179 P5KPL-VM Motherboard 107b 5048 E4500 1462 7418 Wind PC MS-7418 @@ -23498,6 +24672,7 @@ 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 1071 8209 Medion MIM 2240 Notebook PC [MD98100] 10f7 8338 Panasonic CF-Y5 laptop 17aa 2009 ThinkPad R60/T60/X60 series @@ -23513,6 +24688,7 @@ 1028 01ad OptiPlex GX620 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 + 103c 2a8c Compaq 500B Microtower 1043 8179 P5KPL-VM Motherboard 107b 5048 E4500 1462 2310 MSI Hetis 945 @@ -23541,6 +24717,7 @@ 27c5 82801GBM/GHM (ICH7-M Family) SATA Controller [AHCI mode] 103c 309f Compaq nx9420 Notebook 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 17aa 200d ThinkPad R60/T60/X60 series 27c6 82801GHM (ICH7-M DH) SATA Controller [RAID mode] 27c8 NM10/ICH7 Family USB UHCI Controller #1 @@ -23550,9 +24727,11 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 1043 1237 A6J-Q008 1043 8179 P5KPL-VM,P5LD2-VM Mainboard 105b 0d7c D270S/D250S Motherboard @@ -23573,6 +24752,7 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 @@ -23596,6 +24776,7 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 @@ -23617,6 +24798,7 @@ 1028 01d7 XPS M1210 1028 01df PowerEdge SC440 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 @@ -23639,9 +24821,11 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 
Compaq nw8440 + 103c 30d5 530 Laptop 1043 1237 A6J-Q008 1043 8179 P5KPL-VM,P5LD2-VM Mainboard 105b 0d7c D270S/D250S Motherboard @@ -23662,6 +24846,7 @@ 1458 5001 GA-D525TUD 1462 7418 Wind PC MS-7418 1775 11cc CC11/CL11 + 17aa 2011 ThinkPad R60e 8086 544b Desktop Board D425KT 27d2 NM10/ICH7 Family PCI Express Port 2 103c 309f Compaq nx9420 Notebook @@ -23670,12 +24855,14 @@ 144d c072 Notebook N150P 1462 7418 Wind PC MS-7418 1775 11cc CC11/CL11 + 17aa 2011 ThinkPad R60e 8086 544b Desktop Board D425KT 27d4 NM10/ICH7 Family PCI Express Port 3 1071 8209 Medion MIM 2240 Notebook PC [MD98100] 144d c072 Notebook N150P 1462 7418 Wind PC MS-7418 1775 11cc CC11/CL11 + 17aa 2011 ThinkPad R60e 8086 544b Desktop Board D425KT 27d6 NM10/ICH7 Family PCI Express Port 4 103c 30a3 Compaq nw8440 @@ -23683,14 +24870,17 @@ 144d c072 Notebook N150P 1462 7418 Wind PC MS-7418 1775 11cc CC11/CL11 + 17aa 2011 ThinkPad R60e 8086 544b Desktop Board D425KT 27d8 NM10/ICH7 Family High Definition Audio Controller 1025 006c 9814 WKMI 1028 01d7 XPS M1210 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 1043 1123 A6J-Q008 1043 13c4 Asus G2P 1043 817f P5LD2-VM Mainboard (Realtek ALC 882 codec) @@ -23722,6 +24912,7 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 1043 8179 P5KPL-VM Motherboard 105b 0d7c D270S/D250S Motherboard 1071 8209 Medion MIM 2240 Notebook PC [MD98100] @@ -23747,9 +24938,11 @@ 1028 01df PowerEdge SC440 1028 01e6 PowerEdge 860 103c 2a3b Pavilion A1512X + 103c 2a8c Compaq 500B Microtower 103c 309f Compaq nx9420 Notebook 103c 30a1 NC2400 103c 30a3 Compaq nw8440 + 103c 30d5 530 Laptop 1043 1237 A6J-Q008 1043 8179 P5KPL-VM Motherboard 107b 5048 E4500 @@ -23794,6 +24987,12 @@ 1028 01da OptiPlex 745 1462 7235 P965 Neo MS-7235 mainboard 2826 C600/X79 series chipset SATA RAID Controller + 1d49 0100 ThinkSystem RAID 331 
+ 1d49 0101 ThinkSystem RAID 331 + 1d49 0102 ThinkSystem RAID 331 + 1d49 0103 ThinkSystem RAID 331 + 1d49 0104 ThinkSystem RAID 331 + 1d49 0105 ThinkSystem RAID 331 2827 C610/X99 series chipset sSATA Controller [RAID mode] 2828 82801HM/HEM (ICH8M/ICH8M-E) SATA Controller [IDE mode] 1028 01f3 Inspiron 1420 @@ -23960,7 +25159,7 @@ 1028 01f9 Dell Latitude D630 1028 01ff Dell Precision M4300 1028 0256 Studio 1735 - 103c 2802 HP Compaq dc7700p + 103c 2802 Compaq dc7700p 103c 30c0 Compaq 6710b 103c 30c1 Compaq 6910p 103c 30cc Pavilion dv6700 @@ -24551,8 +25750,10 @@ 2e27 4 Series Chipset Serial KT Controller 2e29 4 Series Chipset PCI Express Root Port 2e30 4 Series Chipset DRAM Controller + 103c 2a8c Compaq 500B Microtower 2e31 4 Series Chipset PCI Express Root Port 2e32 4 Series Chipset Integrated Graphics Controller + 103c 2a8c Compaq 500B Microtower 2e33 4 Series Chipset Integrated Graphics Controller 2e34 4 Series Chipset HECI Controller 2e35 4 Series Chipset HECI Controller @@ -24761,6 +25962,8 @@ 8086 4010 Dual Band Wireless AC 3165 # Stone Peak 1x1 8086 4210 Dual Band Wireless AC 3165 +# Stone Peak 1x1 + 3166 Intel Dual Band Wireless-AC 3165 Plus Bluetooth 3200 GD31244 PCI-X SATA HBA 1775 c200 C2K onboard SATA host bus adapter 3310 IOP348 I/O Processor @@ -25028,6 +26231,16 @@ 372b Xeon C5500/C3500 Core 372c Xeon C5500/C3500 Reserved 373f Xeon C5500/C3500 IOxAPIC + 37cd X722 Virtual Function + 37ce Ethernet Connection X722 for 10GbE backplane + 1590 0215 Ethernet 10Gb 2-port 568i Adapter + 37cf Ethernet Connection X722 for 10GbE QSFP+ + 37d0 Ethernet Connection X722 for 10GbE SFP+ + 37d1 Ethernet Connection X722 for 1GbE + 37d2 Ethernet Connection X722 for 10GBASE-T + 37d3 Ethernet Connection X722 for 10GbE SFP+ + 37d4 Ethernet Connection X722 for 10GbE QSFP+ + 37d9 X722 Hyper-V Virtual Function 3a00 82801JD/DO (ICH10 Family) 4-port SATA IDE Controller 3a02 82801JD/DO (ICH10 Family) SATA AHCI Controller 3a05 82801JD/DO (ICH10 Family) SATA RAID Controller @@ 
-25156,34 +26369,35 @@ 3a7c 82801JD/DO (ICH10 Family) Gigabit Ethernet Controller 3b00 5 Series/3400 Series Chipset LPC Interface Controller 3b01 Mobile 5 Series Chipset LPC Interface Controller - 3b02 5 Series Chipset LPC Interface Controller - 3b03 Mobile 5 Series Chipset LPC Interface Controller + 3b02 P55 Chipset LPC Interface Controller + 3b03 PM55 Chipset LPC Interface Controller 3b04 5 Series Chipset LPC Interface Controller 3b05 Mobile 5 Series Chipset LPC Interface Controller - 3b06 5 Series Chipset LPC Interface Controller - 3b07 Mobile 5 Series Chipset LPC Interface Controller + 3b06 H55 Chipset LPC Interface Controller + 3b07 QM57 Chipset LPC Interface Controller + 1028 040a Latitude E6410 1028 040b Latitude E6510 e4bf 50c1 PC1-GROOVE - 3b08 5 Series Chipset LPC Interface Controller - 3b09 Mobile 5 Series Chipset LPC Interface Controller + 3b08 H57 Chipset LPC Interface Controller + 3b09 HM55 Chipset LPC Interface Controller 1025 0347 Aspire 7740G 144d c06a R730 Laptop 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] - 3b0a 5 Series Chipset LPC Interface Controller + 3b0a Q57 Chipset LPC Interface Controller 1028 02da OptiPlex 980 15d9 060d C7SIM-Q Motherboard - 3b0b Mobile 5 Series Chipset LPC Interface Controller + 3b0b HM57 Chipset LPC Interface Controller 3b0c 5 Series Chipset LPC Interface Controller 3b0d 5 Series/3400 Series Chipset LPC Interface Controller 3b0e 5 Series/3400 Series Chipset LPC Interface Controller - 3b0f 5 Series/3400 Series Chipset LPC Interface Controller + 3b0f QS57 Chipset LPC Interface Controller 3b10 5 Series/3400 Series Chipset LPC Interface Controller 3b11 5 Series/3400 Series Chipset LPC Interface Controller 3b12 3400 Series Chipset LPC Interface Controller 3b13 5 Series/3400 Series Chipset LPC Interface Controller - 3b14 3400 Series Chipset LPC Interface Controller + 3b14 3420 Chipset LPC Interface Controller 3b15 5 Series/3400 Series Chipset LPC Interface Controller - 3b16 3400 Series Chipset LPC Interface Controller 
+ 3b16 3450 Chipset LPC Interface Controller 3b17 5 Series/3400 Series Chipset LPC Interface Controller 3b18 5 Series/3400 Series Chipset LPC Interface Controller 3b19 5 Series/3400 Series Chipset LPC Interface Controller @@ -25200,7 +26414,7 @@ 15d9 060d C7SIM-Q Motherboard 3b23 5 Series/3400 Series Chipset 4 port SATA AHCI Controller 3b25 5 Series/3400 Series Chipset SATA RAID Controller - 103c 3118 HP Smart Array B110i SATA RAID Controller + 103c 3118 Smart Array B110i SATA RAID Controller 3b26 5 Series/3400 Series Chipset 2 port SATA IDE Controller 3b28 5 Series/3400 Series Chipset 4 port SATA IDE Controller 144d c06a R730 Laptop @@ -25217,11 +26431,13 @@ 3b2e 5 Series/3400 Series Chipset 4 port SATA IDE Controller e4bf 50c1 PC1-GROOVE 3b2f 5 Series/3400 Series Chipset 6 port SATA AHCI Controller + 1028 040a Latitude E6410 1028 040b Latitude E6510 e4bf 50c1 PC1-GROOVE 3b30 5 Series/3400 Series Chipset SMBus Controller 1025 0347 Aspire 7740G 1028 02da OptiPlex 980 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 15d9 060d C7SIM-Q Motherboard @@ -25229,11 +26445,13 @@ e4bf 50c1 PC1-GROOVE 3b32 5 Series/3400 Series Chipset Thermal Subsystem 1025 0347 Aspire 7740G + 1028 040a Latitude E6410 144d c06a R730 Laptop 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] 3b34 5 Series/3400 Series Chipset USB2 Enhanced Host Controller 1025 0347 Aspire 7740G 1028 02da OptiPlex 980 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 15d9 060d C7SIM-Q Motherboard @@ -25248,6 +26466,7 @@ 3b3c 5 Series/3400 Series Chipset USB2 Enhanced Host Controller 1025 0347 Aspire 7740G 1028 02da OptiPlex 980 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 15d9 060d C7SIM-Q Motherboard @@ -25259,19 +26478,23 @@ 3b41 5 Series/3400 Series Chipset LAN Controller 3b42 5 Series/3400 Series Chipset PCI Express Root Port 1 1028 02da OptiPlex 980 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 15d9 060d 
C7SIM-Q Motherboard 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] 3b44 5 Series/3400 Series Chipset PCI Express Root Port 2 + 1028 040a Latitude E6410 1028 040b Latitude E6510 15d9 060d C7SIM-Q Motherboard 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] 3b46 5 Series/3400 Series Chipset PCI Express Root Port 3 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] 3b48 5 Series/3400 Series Chipset PCI Express Root Port 4 + 1028 040a Latitude E6410 1028 040b Latitude E6510 144d c06a R730 Laptop 3b4a 5 Series/3400 Series Chipset PCI Express Root Port 5 @@ -25284,7 +26507,9 @@ 3b56 5 Series/3400 Series Chipset High Definition Audio 1025 0347 Aspire 7740G 1028 02da OptiPlex 980 + 1028 040a Latitude E6410 1028 040b Latitude E6510 + 1043 1373 ASUSTek G73-series gaming laptop 144d c06a R730 Laptop 15d9 060d C7SIM-Q Motherboard 17c0 10d2 Medion Akoya E7214 Notebook PC [MD98410] @@ -25431,6 +26656,7 @@ 1351 103c Compaq NC6220 4224 PRO/Wireless 2915ABG [Calexico2] Network Connection 4227 PRO/Wireless 3945ABG [Golan] Network Connection + 8086 1010 ThinkPad R60e 8086 1011 ThinkPad T60/R60e/X60s 8086 1014 PRO/Wireless 3945BG Network Connection 4229 PRO/Wireless 4965 AG or AGN [Kedron] Network Connection @@ -25549,6 +26775,41 @@ 530d 80310 (IOP) IO Processor 5845 QEMU NVM Express Controller 1af4 1100 QEMU Virtual Machine + 5a84 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Integrated Graphics Controller + 5a88 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Imaging Unit + 5a98 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Audio Cluster + 5a9a Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Trusted Execution Engine + 5aa2 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Integrated Sensor Hub + 5aa8 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series USB xHCI + 5aac Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #1 + 5aae 
Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #2 + 5ab0 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #3 + 5ab2 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #4 + 5ab4 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #5 + 5ab6 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #6 + 5ab8 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #7 + 5aba Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series I2C Controller #8 + 5abc Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series HSUART Controller #1 + 5abe Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series HSUART Controller #2 + 5ac0 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series HSUART Controller #3 + 5ac2 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SPI Controller #1 + 5ac4 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SPI Controller #2 + 5ac6 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SPI Controller #3 + 5ac8 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PWM Pin Controller + 5aca Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SDXC/MMC Host Controller + 5acc Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series eMMC Controller + 5ad0 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SDIO Controller + 5ad4 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SMBus Controller + 5ad6 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port B #1 + 5ad7 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port B #2 + 5ad8 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port A #1 + 5ad9 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port A #2 + 5ada Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port A #3 + 5adb Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series PCI Express Port A #4 + 5ae3 
Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series SATA AHCI Controller + 5ae8 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Low Pin Count Interface + 5aee Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series HSUART Controller #4 + 5af0 Atom/Celeron/Pentium Processor N4200/N3350/E3900 Series Host Bridge 65c0 5100 Chipset Memory Controller Hub 65e2 5100 Chipset PCI Express x4 Port 2 65e3 5100 Chipset PCI Express x4 Port 3 @@ -25626,6 +26887,10 @@ 6f45 Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D QPI Link 2 Debug 6f46 Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D QPI Link 2 Debug 6f47 Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D QPI Link 2 Debug + 6f50 Xeon Processor D Family QuickData Technology Register DMA Channel 0 + 6f51 Xeon Processor D Family QuickData Technology Register DMA Channel 1 + 6f52 Xeon Processor D Family QuickData Technology Register DMA Channel 2 + 6f53 Xeon Processor D Family QuickData Technology Register DMA Channel 3 6f60 Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D Home Agent 1 6f68 Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D Target Address/Thermal/RAS 6f6a Xeon E7 v4/Xeon E5 v4/Xeon E3 v4/Xeon D Channel Target Address Decoder @@ -26119,10 +27384,12 @@ 9c23 8 Series CHAP Counters 9c24 8 Series Thermal 9c26 8 Series USB EHCI #1 + 17aa 220c T440s 17aa 2214 ThinkPad X240 9c2d 8 Series USB EHCI #2 9c31 8 Series USB xHCI HC 17aa 2214 ThinkPad X240 + 8086 7270 Apple MacBookAir6,2 / MacBookPro11,1 9c35 8 Series SDIO Controller 9c36 8 Series Audio DSP Controller 9c3a 8 Series HECI #0 @@ -26181,6 +27448,53 @@ 9ce4 Wildcat Point-LP Serial IO UART Controller #1 9ce5 Wildcat Point-LP Serial IO GSPI Controller #0 9ce6 Wildcat Point-LP Serial IO GSPI Controller #1 + 9d03 Sunrise Point-LP SATA Controller [AHCI mode] + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d14 Sunrise Point-LP PCI Express Root Port #5 + 17aa 382a B51-80 Laptop + 9d15 Sunrise Point-LP PCI Express Root Port #6 + 17aa 382a B51-80 Laptop + 9d16 Sunrise Point-LP PCI Express Root Port #7 + 
9d17 Sunrise Point-LP PCI Express Root Port #8 + 9d18 Sunrise Point-LP PCI Express Root Port #9 + 17aa 382a B51-80 Laptop + 9d21 Sunrise Point-LP PMC + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d23 Sunrise Point-LP SMBus + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d27 Sunrise Point-LP Serial IO UART Controller #0 + 9d28 Sunrise Point-LP Serial IO UART Controller #1 + 9d29 Sunrise Point-LP Serial IO SPI Controller #0 + 9d2a Sunrise Point-LP Serial IO SPI Controller #1 + 9d2d Sunrise Point-LP Secure Digital IO Controller + 9d2f Sunrise Point-LP USB 3.0 xHCI Controller + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d31 Sunrise Point-LP Thermal subsystem + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d3a Sunrise Point-LP CSME HECI #1 + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop + 9d43 Sunrise Point-LP LPC Controller + 17aa 382a B51-80 Laptop + 9d48 Sunrise Point-LP LPC Controller + 1028 06f3 Latitude 3570 + 9d60 Sunrise Point-LP Serial IO I2C Controller #0 + 1028 06f3 Latitude 3570 + 8086 9d60 100 Series PCH/Sunrise Point PCH I2C0 [Skylake/Kaby Lake LPSS I2C] + 9d61 Sunrise Point-LP Serial IO I2C Controller #1 + 9d62 Sunrise Point-LP Serial IO I2C Controller #2 + 9d63 Sunrise Point-LP Serial IO I2C Controller #3 + 9d64 Sunrise Point-LP Serial IO I2C Controller #4 + 9d65 Sunrise Point-LP Serial IO I2C Controller #5 + 9d66 Sunrise Point-LP Serial IO UART Controller #2 + 9d70 Sunrise Point-LP HD Audio + 1028 06f3 Latitude 3570 + 17aa 382a B51-80 Laptop a000 Atom Processor D4xx/D5xx/N4xx/N5xx DMI Bridge 1458 5000 GA-D525TUD 8086 4f4d DeskTop Board D510MO @@ -26198,6 +27512,7 @@ a012 Atom Processor D4xx/D5xx/N4xx/N5xx Integrated Graphics Controller 144d c072 Notebook N150P a013 Atom Processor D4xx/D5xx/N4xx/N5xx CHAPS counter + a102 Sunrise Point-H SATA controller [AHCI mode] a103 Sunrise Point-H SATA Controller [AHCI mode] a105 Sunrise Point-H SATA Controller [RAID mode] a107 Sunrise Point-H SATA Controller [RAID mode] @@ 
-26225,10 +27540,10 @@ a124 Sunrise Point-H SPI Controller a125 Sunrise Point-H Gigabit Ethernet Controller a126 Sunrise Point-H Northpeak - a127 Sunrise Point-H LPSS UART #0 - a128 Sunrise Point-H LPSS UART #1 - a129 Sunrise Point-H LPSS SPI #0 - a12a Sunrise Point-H LPSS SPI #1 + a127 Sunrise Point-H Serial IO UART #0 + a128 Sunrise Point-H Serial IO UART #1 + a129 Sunrise Point-H Serial IO SPI #0 + a12a Sunrise Point-H Serial IO SPI #1 a12f Sunrise Point-H USB 3.0 xHCI Controller a130 Sunrise Point-H USB Device Controller (OTG) a131 Sunrise Point-H Thermal subsystem @@ -26271,9 +27586,9 @@ a15d Sunrise Point-H LPC Controller a15e Sunrise Point-H LPC Controller a15f Sunrise Point-H LPC Controller - a160 Sunrise Point-H LPSS I2C Controller #0 - a161 Sunrise Point-H LPSS I2C Controller #1 - a166 Sunrise Point-H LPSS UART Controller #2 + a160 Sunrise Point-H Serial IO I2C Controller #0 + a161 Sunrise Point-H Serial IO I2C Controller #1 + a166 Sunrise Point-H Serial IO UART Controller #2 a167 Sunrise Point-H PCI Root Port #17 a168 Sunrise Point-H PCI Root Port #18 a169 Sunrise Point-H PCI Root Port #19 @@ -26370,9 +27685,10 @@ 8384 SigmaTel 8401 TRENDware International Inc. 8686 ScaleMP - 1010 vSMPowered system controller [vSMP CTL] + 1010 vSMP Foundation controller [vSMP CTL] + 1011 vSMP Foundation MEX/FLX controller [vSMP CTL] 8800 Trigem Computer Inc. - 2008 Video assistent component + 2008 Video assistant component 8866 T-Square Design Inc. 
8888 Silicon Magic 8912 TRX @@ -26875,6 +28191,11 @@ bdbd Blackmagic Design a137 DeckLink Studio 4K a138 Decklink SDI 4K a139 Intensity Pro 4K + a13b DeckLink Micro Recorder + a13d DeckLink 4K Pro + a13e UltraStudio 4K Extreme + a13f DeckLink Quad 2 + a140 DeckLink Duo 2 c001 TSI Telsys c0a9 Micron/Crucial Technology c0de Motorola @@ -26950,6 +28271,7 @@ dada Datapath Limited 1153 VisionDVI-DL 1154 VisionSDI2 db10 Diablo Technologies +dc93 Dawicontrol GmbH dcba Dynamic Engineering 0046 PCIe Altera Cyclone IV # VPX format Receiver Controller Board @@ -26985,6 +28307,9 @@ deaf Middle Digital Inc. 9052 PC Weasel Watchdog Timer # formerly SoftHard Technology Ltd. deda XIMEA + 4001 Camera CB +# Thunderbolt based camera MT family + 4021 Camera MT e000 Winbond e000 W89C940 e159 Tiger Jet Network Inc. diff --git a/sys/boot/i386/pxeldr/pxeboot.8 b/sys/boot/i386/pxeldr/pxeboot.8 index d2ca703..7f57d5a 100644 --- a/sys/boot/i386/pxeldr/pxeboot.8 +++ b/sys/boot/i386/pxeldr/pxeboot.8 @@ -85,6 +85,14 @@ expects to fetch .Pa /boot/loader.rc from the specified server before loading any other files. .Pp +.Nm +defaults to a conservative 1024 byte NFS data packet size. +This may be changed by setting the +.Va nfs.read_size +variable in +.Pa /boot/loader.conf . +Valid values range from 1024 to 4096 bytes. +.Pp In all other respects, .Nm acts just like diff --git a/sys/cam/ata/ata_xpt.c b/sys/cam/ata/ata_xpt.c index 3429bb29a6..584b75d 100644 --- a/sys/cam/ata/ata_xpt.c +++ b/sys/cam/ata/ata_xpt.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include <sys/interrupt.h> #include <sys/sbuf.h> +#include <sys/eventhandler.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/sysctl.h> @@ -827,12 +828,24 @@ noerror: { struct ccb_pathinq cpi; int16_t *ptr; + int veto = 0; ident_buf = &softc->ident_data; for (ptr = (int16_t *)ident_buf; ptr < (int16_t *)ident_buf + sizeof(struct ata_params)/2; ptr++) { *ptr = le16toh(*ptr); } + + /* + * Allow others to veto this ATA disk attachment. 
This + * is mainly used by VMs, whose disk controllers may + * share the disks with the simulated ATA controllers. + */ + EVENTHANDLER_INVOKE(ada_probe_veto, path, ident_buf, &veto); + if (veto) { + goto device_fail; + } + if (strncmp(ident_buf->model, "FX", 2) && strncmp(ident_buf->model, "NEC", 3) && strncmp(ident_buf->model, "Pioneer", 7) && diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 44057d5..c85f9f3 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -713,6 +713,14 @@ static struct da_quirk_entry da_quirk_table[] = {T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*", "*"}, /*quirks*/ DA_Q_NO_RC16 }, + { + /* + * I-O Data USB Flash Disk + * PR: usb/211716 + */ + {T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*", + "*"}, /*quirks*/ DA_Q_NO_RC16 + }, /* ATA/SATA devices over SAS/USB/... */ { /* Hitachi Advanced Format (4k) drives */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 1700194..d6ccf3d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -120,9 +120,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. 
The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. + * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. 
When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. 
+ * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + #include <sys/spa.h> #include <sys/zio.h> +#include <sys/spa_impl.h> #include <sys/zio_compress.h> +#include <sys/zio_checksum.h> #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> @@ -154,10 +279,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - uint_t arc_reduce_dnlc_percent = 3; /* @@ -229,13 +350,14 @@ uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. 
*/ static uint64_t arc_abs_min = 16 << 20; +boolean_t zfs_compressed_arc_enabled = B_TRUE; + static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); @@ -268,6 +390,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, + &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); /* * We don't have a tunable for arc_free_target due to the dependency on @@ -349,7 +473,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. @@ -415,6 +539,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). 
+ */ + kstat_named_t arcstat_overhead_size; + /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures @@ -559,16 +703,12 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_l2_padding_needed; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; @@ -583,9 +723,6 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; @@ -628,6 +765,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -661,16 +801,12 @@ static arc_stats_t arc_stats = { { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { 
"l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "l2_padding_needed", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, @@ -685,9 +821,6 @@ static arc_stats_t arc_stats = { { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, @@ -760,8 +893,12 @@ static arc_state_t *arc_l2c_only; #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; @@ -821,6 +958,7 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing @@ -831,9 +969,10 @@ typedef struct l1arc_buf_hdr { #endif arc_buf_t *b_buf; - uint32_t 
b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; @@ -846,8 +985,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed or padded data */ - void *b_tmp_cdata; + void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; @@ -856,9 +994,6 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -867,20 +1002,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. - */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. 
Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; @@ -984,9 +1136,6 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) } #endif -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; - #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) @@ -995,25 +1144,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) 
(!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -1066,16 +1225,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). 
- */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -1110,41 +1259,47 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, - &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, - &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, - &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, - &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 
0, "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, - &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, - &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, @@ -1177,12 +1332,10 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; 
/* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ void *l2rcb_data; /* temporary buffer */ } l2arc_read_callback_t; @@ -1195,7 +1348,7 @@ typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -1203,21 +1356,22 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static void l2arc_trim(const arc_buf_hdr_t *hdr) { @@ -1226,13 +1380,9 @@ l2arc_trim(const arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) - return; - if (hdr->b_l2hdr.b_asize != 0) { + if (HDR_GET_PSIZE(hdr) != 0) { 
trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, - hdr->b_l2hdr.b_asize, 0); - } else { - ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY); + HDR_GET_PSIZE(hdr), 0); } } @@ -1253,14 +1403,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -1282,7 +1432,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -1320,13 +1470,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -1354,12 +1504,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = 
&fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1445,7 +1595,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1459,7 +1609,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1532,166 +1682,138 @@ retry: } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. - */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. 
- */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. - */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); +#define ARC_MINTIME (hz>>4) /* 62 ms */ -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); +} - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. 
Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - 
zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); + + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; - return (equal); + void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. 
+ */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. + */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - NULL, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + hdr->b_l1hdr.b_freeze_cksum); + 
mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif @@ -1733,7 +1855,7 @@ arc_buf_watch(arc_buf_t *buf) procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = buf->b_hdr->b_size; + ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); @@ -1745,11 +1867,14 @@ arc_buf_watch(arc_buf_t *buf) static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1771,29 +1896,29 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_thawed != NULL) - kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); - buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); + if (hdr->b_l1hdr.b_thawed != NULL) + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); 
@@ -1803,53 +1928,246 @@ arc_buf_thaw(arc_buf_t *buf) void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 
+ * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. + */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. 
+ */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. 
+ */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_remove(list, hdr); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { @@ -1866,15 +2184,9 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_insert(list, hdr); - - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } @@ -1889,8 +2201,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, { arc_state_t *old_state; int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; + uint32_t bufcnt; + boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* @@ -1903,20 +2215,20 @@ arc_change_state(arc_state_t *new_state, 
arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1924,25 +2236,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. 
- */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; /* * An L1 header always exists here, since if we're @@ -1953,38 +2257,38 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
*/ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1993,34 +2297,53 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. 
*/ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { - ASSERT3P(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2029,9 +2352,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3P(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -2109,39 +2452,85 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr, subsequent buffers will + * use arc_buf_clone(). 
+ */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC write) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing it's + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. 
+ */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} +/* + * Used when allocating additional buffers. + */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -2158,7 +2547,7 @@ arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -2172,12 +2561,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - 
atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -2186,179 +2575,106 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - atomic_add_64(&arc_loaned_bytes, hdr->b_size); -} - -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - - /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); - } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); -} - -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. 
- */ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); - return; - } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - l2arc_data_free_t *df; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; - df->l2df_func = free_func; + df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } -/* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. 
- */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *hdr = buf->b_hdr; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); } +/* + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. + */ static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - size_t align, asize, len; + arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. 
*/ - if (!HDR_HAS_L1HDR(hdr)) - return; + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); +} - /* - * The bufer has been chosen for writing to L2ARC, but it's - * not being written just yet. In other words, - * b_tmp_cdata points to exactly the same buffer as b_data, - * l2arc_transform_buf hasn't been called. - */ - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, - hdr->b_l1hdr.b_buf->b_data); - ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF); - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; - } +static void +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } + refcount_transfer_ownership(&state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; /* - * Nothing to do if the temporary buffer was not required. 
+ * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. */ - if (hdr->b_l1hdr.b_tmp_cdata == NULL) - return; - - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - len = hdr->b_size; - align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize, - zio_data_buf_free); - hdr->b_l1hdr.b_tmp_cdata = NULL; + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* @@ -2366,56 +2682,43 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); - /* free up data associated with the buf */ + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); } - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; - - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } - - (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; - /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. 
- */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ @@ -2423,13 +2726,53 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); - ASSERT(buf->b_efunc == NULL); + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; @@ -2437,54 +2780,224 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) } static void -arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. 
+ */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ ASSERT(HDR_HAS_L2HDR(hdr)); - list_remove(&dev->l2ad_buflist, hdr); + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. 
+ */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() + * The header has been reallocated so we need to re-insert it into any + * lists it was on. */ - arc_buf_l2_cdata_free(hdr); + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. - * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. Thus, we must be careful - * not to decrement them for this header either. 
+ * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. 
+ */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); +} + +static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); + + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2492,13 +3005,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2524,40 +3040,22 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - 
mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } - } #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); + } } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -2571,133 +3069,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. 
- */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } int32_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. 
Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. - */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2724,11 +3124,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. it's b_tmp_cdata field) during it's write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. @@ -2739,11 +3139,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. 
@@ -2756,6 +3158,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2775,7 +3178,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2783,37 +3185,39 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. 
+ * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). + */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -3057,12 +3461,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. 
if the ARC isn't shutting down), this function might @@ -3074,7 +3478,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -3098,8 +3502,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -3362,36 +3766,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. 
*/ @@ -3411,9 +3792,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3618,7 +3996,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t @@ -3711,6 +4089,20 @@ arc_reclaim_thread(void *dummy __unused) int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. 
+ */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3782,57 +4174,12 @@ arc_reclaim_thread(void *dummy __unused) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } -static void -arc_user_evicts_thread(void *dummy __unused) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_user_evicts_cv, - &arc_user_evicts_lock, hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - thread_exit(); -} - /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -3917,18 +4264,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. 
If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. */ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -3968,12 +4314,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -3981,11 +4328,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
*/ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -3998,9 +4343,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -4011,6 +4357,37 @@ arc_get_data_buf(arc_buf_t *buf) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); + return (datap); +} + +/* + * Free the arc data buffer. + */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -4054,7 +4431,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } 
hdr->b_l1hdr.b_arc_access = now; @@ -4088,7 +4465,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4157,8 +4534,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -4167,7 +4544,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -4176,17 +4553,29 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + +static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -4206,33 +4595,32 @@ arc_read_done(zio_t *zio) arc_buf_hdr_t *found = 
buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot_byteswap[bswap].ob_func; - func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); -#ifdef illumos - arc_buf_watch(buf); -#endif + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -4246,31 +4634,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. 
This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. + */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -4340,7 +4747,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4357,8 +4763,8 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4390,7 +4796,8 @@ top: 
ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4411,10 +4818,9 @@ top: acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } @@ -4437,34 +4843,36 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4474,20 +4882,21 @@ top: if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4497,26 +4906,9 @@ top: /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If 
there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4527,54 +4919,60 @@ top: hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). 
*/ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + /* + * If compression is enabled on the hdr, then will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; + } + + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. 
*/ @@ -4583,6 +4981,11 @@ top: vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4590,9 +4993,10 @@ top: * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), @@ -4601,11 +5005,6 @@ top: curthread->td_ru.ru_inblock++; #endif - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4627,23 +5026,20 @@ top: cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; - if (b_asize > hdr->b_size) { - ASSERT3U(b_compress, ==, - ZIO_COMPRESS_OFF); - b_data = zio_data_buf_alloc(b_asize); + uint64_t asize = vdev_psize_to_asize(vd, size); + if (asize != size) { + b_data = zio_data_buf_alloc(asize); cb->l2rcb_data = b_data; } else { - b_data = buf->b_data; + b_data = hdr->b_l1hdr.b_pdata; } ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + asize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* @@ -4652,27 +5048,19 @@ top: * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
*/ - if (b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3U(b_asize, ==, 0); - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + asize, b_data, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4702,8 +5090,8 @@ top: } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -4714,20 +5102,6 @@ top: return (0); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. 
*/ @@ -4743,85 +5117,38 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); - } else { + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); mutex_exit(hash_lock); - } - -} - -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. 
- */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); + mutex_exit(hash_lock); } - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); } /* @@ -4855,15 +5182,18 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4906,48 +5236,87 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? 
*/ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); + arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. 
+ */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); - uint64_t *size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); - } - hdr->b_l1hdr.b_datacnt -= 1; + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); @@ -4955,25 +5324,25 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_datacnt = 1; - nhdr->b_l1hdr.b_state = arc_anon; - nhdr->b_l1hdr.b_arc_access = 0; - nhdr->b_l1hdr.b_tmp_cdata = NULL; - nhdr->b_freeze_cksum = NULL; - + nhdr->b_l1hdr.b_bufcnt = 1; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); - (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -4987,8 +5356,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; } int @@ -5022,28 +5389,85 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. - * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback as invoked. 
*/ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + enum zio_compress compress; + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -5074,9 +5498,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -5084,7 +5510,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -5093,7 +5519,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). 
*/ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -5127,19 +5553,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5149,9 +5575,8 @@ arc_write_done(zio_t *zio) } zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, arc_done_func_t *ready, +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) @@ -5160,16 +5585,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), 
KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5178,7 +5601,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? 
arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -5275,12 +5721,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); @@ -5292,8 +5740,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -5346,7 +5796,7 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. 
*/ - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given @@ -5388,6 +5838,117 @@ arc_lowmem(void *arg __unused, int howto __unused) } #endif +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, 
arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + 
refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + void arc_init(void) { @@ -5397,9 +5958,6 @@ arc_init(void) cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -5440,14 +5998,17 @@ arc_init(void) * Allow the tunables to override our calculations if they are * reasonable. 
*/ - if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) + if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) { arc_c_max = zfs_arc_max; + arc_c_min = MIN(arc_c_min, arc_c_max); + } if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; @@ -5492,68 +6053,10 @@ arc_init(void) zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - 
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - - refcount_create(&arc_anon->arcs_size); - refcount_create(&arc_mru->arcs_size); - refcount_create(&arc_mru_ghost->arcs_size); - refcount_create(&arc_mfu->arcs_size); - refcount_create(&arc_mfu_ghost->arcs_size); - refcount_create(&arc_l2c_only->arcs_size); - + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; - arc_eviction_list = NULL; - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + arc_reclaim_thread_exit = B_FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -5572,10 +6075,7 @@ arc_init(void) EVENTHANDLER_PRI_FIRST); #endif - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5634,10 +6134,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. 
*/ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5645,22 +6145,10 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. - */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); - - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5671,27 +6159,7 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -5998,9 +6466,13 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; 
df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6023,13 +6495,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -6087,33 +6559,27 @@ top: */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); l2arc_trim(hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. 
*/ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -6140,23 +6606,21 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* @@ -6164,10 +6628,11 @@ l2arc_read_done(zio_t *zio) * move it and free the buffer. */ if (cb->l2rcb_data != NULL) { - ASSERT3U(hdr->b_size, <, zio->io_size); - ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF); - if (zio->io_error == 0) - bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size); + ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); + if (zio->io_error == 0) { + bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr)); + } /* * The following must be done regardless of whether @@ -6181,28 +6646,23 @@ l2arc_read_done(zio_t *zio) * needs real data. */ zio_data_buf_free(cb->l2rcb_data, zio->io_size); - zio->io_size = zio->io_orig_size = hdr->b_size; - zio->io_data = zio->io_orig_data = buf->b_data; + zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata; } - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -6215,7 +6675,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -6228,9 +6688,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -6380,12 +6841,11 @@ top: */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -6406,40 +6866,26 @@ top: * the delta by which the device hand has changed due to alignment). 
*/ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, - buf_compress_minsz; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); - const boolean_t do_headroom_boost = *headroom_boost; int try; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. */ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; - - /* * Copy buffers for L2ARC writing. 
*/ for (try = 0; try <= 3; try++) { @@ -6462,20 +6908,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; - size_t align; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, + HDR_GET_LSIZE(hdr)); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { @@ -6486,7 +6930,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. @@ -6501,16 +6945,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - align = (size_t)1 << dev->l2ad_vdev->vdev_ashift; - buf_a_sz = P2ROUNDUP(buf_sz, align); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); @@ -6536,63 +6971,75 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_write_pios); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. 
This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing). - */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). - * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
*/ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + ASSERT(HDR_HAS_L1HDR(hdr)); - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t size = arc_hdr_size(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + size); - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. */ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); + void *to_write; + if (!HDR_SHARED_DATA(hdr) && size == asize) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(asize); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(asize); + } + + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + if (asize != size) + bzero(to_write + size, asize - size); + l2arc_free_data_on_write(to_write, asize, type); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, asize, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + write_sz += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + + write_asize += size; + write_psize += asize; + dev->l2ad_hand += asize; mutex_exit(hash_lock); - write_sz += buf_sz; - write_asize += buf_a_sz; + (void) 
zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -6609,89 +7056,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (0); } - mutex_enter(&dev->l2ad_mtx); - - /* - * Now start writing the buffers. We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. - */ - write_asize = 0; - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - boolean_t compress; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - /* - * Save a pointer to the original buffer data we had previously - * stashed away. - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - - compress = HDR_L2COMPRESS(hdr) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz; - if (l2arc_transform_buf(hdr, compress)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. - */ - *headroom_boost = B_TRUE; - } - - /* - * Get the new buffer size that accounts for compression - * and padding. - */ - buf_sz = hdr->b_l2hdr.b_asize; - - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); - - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - /* - * If the data was padded or compressed, then it - * it is in a new buffer. 
- */ - if (hdr->b_l1hdr.b_tmp_cdata != NULL) - buf_data = hdr->b_l1hdr.b_tmp_cdata; - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - (void) zio_nowait(wzio); - - write_asize += buf_sz; - dev->l2ad_hand += buf_sz; - } - } - - mutex_exit(&dev->l2ad_mtx); - ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); @@ -6716,203 +7080,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } /* - * Transforms, possibly compresses and pads, an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The buffer size was already ashift aligned. - * The original hdr contents were left untouched except for b_tmp_cdata, - * which is reset to NULL. The caller must keep a pointer to the original - * data. - * *) The buffer was incompressible. The buffer size was not ashift aligned. - * b_tmp_cdata was replaced with a temporary data buffer which holds a padded - * (aligned) copy of the data. Once writing is done, invoke - * l2arc_release_cdata_buf on this hdr to free the temporary buffer. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. 
Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). - */ -static boolean_t -l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress) -{ - void *cdata; - size_t align, asize, csize, len, rounded; - - ASSERT(HDR_HAS_L2HDR(hdr)); - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - cdata = zio_data_buf_alloc(asize); - ASSERT3P(cdata, !=, NULL); - if (compress) - csize = zio_compress_data(ZIO_COMPRESS_LZ4, - hdr->b_l1hdr.b_tmp_cdata, cdata, len); - else - csize = len; - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, asize); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } - - rounded = P2ROUNDUP(csize, align); - ASSERT3U(rounded, <=, asize); - if (rounded < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression did not save space. - */ - if (P2PHASE(len, align) != 0) { - /* - * Use compression buffer for a copy of data padded to - * the proper size. Compression algorithm remains set - * to ZIO_COMPRESS_OFF. 
- */ - ASSERT3U(len, <, asize); - bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len); - bzero((char *)cdata + len, asize - len); - l2hdr->b_asize = asize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_padding_needed); - } else { - ASSERT3U(len, ==, asize); - /* - * The original buffer is good as is, - * release the compressed buffer. - * l2hdr will be left unmodified except for b_tmp_cdata. - */ - zio_data_buf_free(cdata, asize); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - if (compress) - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. - * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. 
- */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - uint64_t csize; - void *cdata; - - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. */ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. - * This buffer serves as a temporary holder of compressed or padded data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. 
- */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - size_t align, asize, len; - enum zio_compress comp = hdr->b_l2hdr.b_compress; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (hdr->b_l1hdr.b_tmp_cdata != NULL) { - ASSERT(comp != ZIO_COMPRESS_EMPTY); - len = hdr->b_size; - align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift; - asize = P2ROUNDUP(len, align); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else { - ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY); - } -} - -/* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ @@ -6924,7 +7091,6 @@ l2arc_feed_thread(void *dummy __unused) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -6962,7 +7128,7 @@ l2arc_feed_thread(void *dummy __unused) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -6995,7 +7161,7 @@ l2arc_feed_thread(void *dummy __unused) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. @@ -7090,7 +7256,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 9d5c398..e27aa18 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -45,6 +45,9 @@ #include <sys/zfeature.h> #include <sys/blkptr.h> #include <sys/range_tree.h> +#include <sys/callb.h> + +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing @@ -52,16 +55,82 @@ */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs are added to the dbuf cache once the last hold is released. If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. 
*/ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. If the cache size continues to grow and hits the high + * water mark, then callers adding elments to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. 
+ */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) @@ -71,6 +140,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); @@ -83,6 +153,7 @@ dbuf_dest(void *vdb, void *unused) dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -112,8 +183,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -124,7 +193,7 @@ dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; @@ -175,7 +244,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; @@ -207,7 +276,7 @@ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; @@ -232,8 +301,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db) atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t 
dbuf_do_evict; - typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING @@ -318,15 +385,181 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } -void -dbuf_evict(dmu_buf_impl_t *db) +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); + dmu_buf_impl_t *db = obj; + + /* + * The assumption here, is the hash value for a given + * dmu_buf_impl_t will remain constant throughout it's lifetime + * (i.e. it's objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
+ */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; - dbuf_clear(db); - dbuf_destroy(db); + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + dmu_buf_impl_t *db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached it's maximum size, dbufs are removed + * and destroyed. 
The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. + */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the callers context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. 
To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; + + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } void @@ -354,7 +587,7 @@ retry: goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); @@ -362,10 +595,30 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. 
*/ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -377,8 +630,23 @@ dbuf_fini(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -536,7 +804,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -551,8 +819,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -563,6 +829,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -574,6 +841,7 @@ 
dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -642,7 +910,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -691,7 +959,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -728,8 +996,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -846,7 +1112,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -902,9 +1168,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1026,7 +1293,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -1131,7 +1398,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1142,7 +1409,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1388,7 +1655,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); - } else if (do_free_accounting) { + } + + /* + * The dn_struct_rwlock prevents db_blkptr from changing + * due to a write from syncing context completing + * while we are running, so we want to acquire it before + * looking at db_blkptr. 
+ */ + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; @@ -1404,11 +1684,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dnode_willuse_space(dn, -willfree, tx); } - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); @@ -1540,7 +1815,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1549,12 +1824,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); return (B_TRUE); } @@ -1718,7 +1989,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1736,10 +2007,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, 
db); - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1751,59 +2022,62 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. 
- */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1814,15 +2088,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* * If this dbuf is referenced from an indirect dbuf, @@ -1915,7 +2199,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -1964,7 +2248,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -1989,76 +2273,12 @@ dbuf_create(dnode_t *dn, uint8_t 
level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. - */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. 
*/ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2096,10 +2316,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2128,7 +2375,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2222,6 +2470,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + 
dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2302,18 +2551,8 @@ top: return (SET_ERROR(ENOENT)); } - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } + if (db->db_buf != NULL) ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); - } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); @@ -2331,13 +2570,19 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(dn->dn_objset->os_spa, + arc_alloc_buf(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(refcount_is_zero(&db->db_holds)); + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -2411,7 +2656,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); + ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2482,8 +2727,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2528,55 +2775,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -2660,6 +2896,28 @@ dmu_buf_get_blkptr(dmu_buf_t *db) return (dbi->db_blkptr); } +objset_t * +dmu_buf_get_objset(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_objset); +} + +dnode_t * +dmu_buf_dnode_enter(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_ENTER(dbi); + return (DB_DNODE(dbi)); +} + +void +dmu_buf_dnode_exit(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_EXIT(dbi); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -2864,7 +3122,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3130,10 +3388,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if 
(!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3149,8 +3404,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3327,8 +3580,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, + &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 9ce9665..75b4cf8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -131,6 +131,26 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { }; int +dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + uint64_t blkid; + dmu_buf_impl_t *db; + + blkid = dbuf_whichblock(dn, 0, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (0); +} +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { @@ -158,6 +178,29 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, } int +dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & 
DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; + } + } + + return (err); +} + +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { @@ -1398,7 +1441,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1746,8 +1789,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2121,11 +2163,11 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); zio_compress_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c index e88968b..e7bfdaa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #include <sys/dmu.h> @@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index fe0c0db..b6ae968 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 874a1ca..21ea6ef 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -160,11 +160,16 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type != DRR_BEGIN) { + if (dsp->dsa_drr->drr_type == DRR_BEGIN) { + dsp->dsa_sent_begin = B_TRUE; + } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 
drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } + if (dsp->dsa_drr->drr_type == DRR_END) { + dsp->dsa_sent_end = B_TRUE; + } fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); @@ -634,7 +639,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -646,7 +651,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -670,7 +675,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; @@ -700,7 +705,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); @@ -912,6 +917,8 @@ out: list_remove(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); + VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); + kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); @@ -3106,6 +3113,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; + drc->drc_newsnapobj = + dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + 
dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); @@ -3141,8 +3151,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); } + drc->drc_newsnapobj = + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } - drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, @@ -3184,8 +3195,6 @@ static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { - int error; - #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if @@ -3196,23 +3205,30 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) zfs_destroy_unmount_origin(name); #endif - error = dsl_sync_task(drc->drc_tofs, + return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); - - if (error != 0) - dmu_recv_cleanup_ds(drc); - return (error); + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { + return (dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) +{ int error; - error = dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); + drc->drc_owner = owner; + + if (drc->drc_newfs) + error = dmu_recv_new_end(drc); + else + error = dmu_recv_existing_end(drc); if (error != 0) { dmu_recv_cleanup_ds(drc); @@ -3224,17 +3240,6 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) return (error); } -int -dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) -{ - drc->drc_owner = owner; - - if (drc->drc_newfs) - return (dmu_recv_new_end(drc)); - else - return 
(dmu_recv_existing_end(drc)); -} - /* * Return TRUE if this objset is currently being received into. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index dd0644a..a76e74b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 63af9e3..6838148 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -808,15 +808,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. 
*/ - err = zap_lookup(dn->dn_objset, dn->dn_object, name, - 8, 0, NULL); + err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; return; } } - err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + err = zap_count_write_by_dnode(dn, name, add, &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 39bef75..d599ed3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 7179c41..daf539e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); - dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); @@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 8d97789..eb203fe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1060,19 +1060,6 @@ 
dsl_dataset_get_blkptr(dsl_dataset_t *ds) return (&dsl_dataset_phys(ds)->ds_bp); } -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_bp = *bp; - } -} - spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 033c2f3..fee1eac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -688,7 +688,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -714,7 +714,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -746,7 +746,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 0e75746..9518ab7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -38,17 +38,8 @@ SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, 
"ZFS metaslab"); -/* - * Allow allocations to switch to gang blocks quickly. We do this to - * avoid having to load lots of space_maps in a given txg. There are, - * however, some cases where we want to avoid "fast" ganging and instead - * we want to do an exhaustive search of all metaslabs on this device. - * Currently we don't allow any gang, slog, or dump device related allocations - * to "fast" gang. - */ -#define CAN_FASTGANG(flags) \ - (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ - METASLAB_GANG_AVOID))) +#define GANG_ALLOCATION(flags) \ + ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) @@ -277,6 +268,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + refcount_create_tracked(&mc->mc_alloc_slots); return (mc); } @@ -290,6 +283,8 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + refcount_destroy(&mc->mc_alloc_slots); + mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -489,7 +484,13 @@ metaslab_class_expandable_space(metaslab_class_t *mc) continue; } - space += tvd->vdev_max_asize - tvd->vdev_asize; + /* + * Calculate if we have enough space to add additional + * metaslabs. We report the expandable space in terms + * of the metaslab size since that's the unit of expansion. + */ + space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); @@ -527,9 +528,10 @@ metaslab_compare(const void *x1, const void *x2) /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below - * the zfs_mg_noalloc_threshold. 
If a metaslab group transitions - * from allocatable to non-allocatable or vice versa then the metaslab - * group's class is updated to reflect the transition. + * the zfs_mg_noalloc_threshold or has a fragmentation value that is + * greater than zfs_mg_fragmentation_threshold. If a metaslab group + * transitions from allocatable to non-allocatable or vice versa then the + * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) @@ -538,22 +540,45 @@ metaslab_group_alloc_update(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; + boolean_t was_initialized; ASSERT(vd == vd->vdev_top); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). 
*/ - mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); @@ -576,6 +601,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -592,6 +618,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + refcount_create_tracked(&mg->mg_alloc_queue_depth); mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -614,6 +643,7 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); + refcount_destroy(&mg->mg_alloc_queue_depth); kmem_free(mg, sizeof (metaslab_group_t)); } @@ -685,6 +715,15 @@ metaslab_group_passivate(metaslab_group_t *mg) metaslab_class_minblocksize_update(mc); } +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + uint64_t metaslab_group_get_space(metaslab_group_t *mg) { @@ -854,30 +893,97 @@ metaslab_group_fragmentation(metaslab_group_t *mg) * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. + * that can still handle allocations. 
If the allocation throttle is enabled + * then we skip allocations to devices that have reached their maximum + * allocation queue depth unless the selected metaslab group is the only + * eligible group remaining. */ static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg) +metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, + uint64_t psize) { - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* - * We use two key metrics to determine if a metaslab group is - * considered allocatable -- free space and fragmentation. If - * the free space is greater than the free space threshold and - * the fragmentation is less than the fragmentation threshold then - * consider the group allocatable. There are two case when we will - * not consider these key metrics. The first is if the group is - * associated with a slog device and the second is if all groups - * in this metaslab class have already been consider ineligible + * We can only consider skipping this metaslab group if it's + * in the normal metaslab class and there are other metaslab + * groups to select from. Otherwise, we always consider it eligible * for allocations. */ - return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && - (mg->mg_fragmentation == ZFS_FRAG_INVALID || - mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || - mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); + if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + return (B_TRUE); + + /* + * If the metaslab group's mg_allocatable flag is set (see comments + * in metaslab_group_alloc_update() for more information) and + * the allocation throttle is disabled then allow allocations to this + * device. However, if the allocation throttle is enabled then + * check if we have reached our allocation limit (mg_alloc_queue_depth) + * to determine if we should allow allocations to this metaslab group. 
+ * If all metaslab groups are no longer considered allocatable + * (mc_alloc_groups == 0) or we're trying to allocate the smallest + * gang block size then we allow allocations on this metaslab group + * regardless of the mg_allocatable or throttle settings. + */ + if (mg->mg_allocatable) { + metaslab_group_t *mgp; + int64_t qdepth; + uint64_t qmax = mg->mg_max_alloc_queue_depth; + + if (!mc->mc_alloc_throttle_enabled) + return (B_TRUE); + + /* + * If this metaslab group does not have any free space, then + * there is no point in looking further. + */ + if (mg->mg_no_free_space) + return (B_FALSE); + + qdepth = refcount_count(&mg->mg_alloc_queue_depth); + + /* + * If this metaslab group is below its qmax or it's + * the only allocatable metasable group, then attempt + * to allocate from it. + */ + if (qdepth < qmax || mc->mc_alloc_groups == 1) + return (B_TRUE); + ASSERT3U(mc->mc_alloc_groups, >, 1); + + /* + * Since this metaslab group is at or over its qmax, we + * need to determine if there are metaslab groups after this + * one that might be able to handle this allocation. This is + * racy since we can't hold the locks for all metaslab + * groups at the same time when we make this check. + */ + for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { + qmax = mgp->mg_max_alloc_queue_depth; + + qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + + /* + * If there is another metaslab group that + * might be able to handle the allocation, then + * we return false so that we skip this group. + */ + if (qdepth < qmax && !mgp->mg_no_free_space) + return (B_FALSE); + } + + /* + * We didn't find another group to handle the allocation + * so we can't skip this metaslab group even though + * we are at or over our qmax. 
+ */ + return (B_TRUE); + + } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { + return (B_TRUE); + } + return (B_FALSE); } /* @@ -2145,8 +2251,57 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) return (0); } +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_add(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + for (int d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + } +#endif +} + static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, +metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { spa_t *spa = mg->mg_vd->vdev_spa; @@ -2173,10 +2328,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, if (msp->ms_weight < asize) { spa_dbgmsg(spa, "%s: failed to meet weight " "requirement: vdev %llu, txg %llu, mg %p, 
" - "msp %p, psize %llu, asize %llu, " + "msp %p, asize %llu, " "weight %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, - mg, msp, psize, asize, msp->ms_weight); + mg, msp, asize, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } @@ -2258,7 +2413,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, msp->ms_access_txg = txg + metaslab_unload_delay; mutex_exit(&msp->ms_lock); - return (offset); } @@ -2275,7 +2429,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, int all_zero; int zio_lock = B_FALSE; boolean_t allocatable; - uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -2345,7 +2498,6 @@ top: all_zero = B_TRUE; do { ASSERT(mg->mg_activation_count == 1); - vd = mg->mg_vd; /* @@ -2361,24 +2513,23 @@ top: /* * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging or have requested - * an allocation for the smallest gang block size - * then we don't want to avoid allocating to the this - * metaslab group. If we're in this condition we should - * try to allocate from any device possible so that we - * don't inadvertently return ENOSPC and suspend the pool + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ - if (allocatable && CAN_FASTGANG(flags) && - psize > SPA_GANGBLOCKSIZE) - allocatable = metaslab_group_allocatable(mg); + if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize); + } if (!allocatable) goto next; + ASSERT(mg->mg_initialized); + /* - * Avoid writing single-copy data to a failing vdev - * unless the user instructs us that it is okay. + * Avoid writing single-copy data to a failing vdev. 
*/ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && @@ -2398,8 +2549,32 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, psize, asize, txg, distance, - dva, d); + uint64_t offset = metaslab_group_alloc(mg, asize, txg, + distance, dva, d); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be + * out of space. We must notify the allocation + * throttle to start skipping allocation + * attempts to this metaslab group until more + * space becomes available. + * + * Note: this failure cannot be caused by the + * allocation throttle since the allocation + * throttle is only responsible for skipping + * devices and not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -2580,9 +2755,57 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) return (0); } +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. 
+ */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, + int flags) +{ + uint64_t available_slots = 0; + boolean_t slot_reserved = B_FALSE; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); + if (reserved_slots < mc->mc_alloc_max_slots) + available_slots = mc->mc_alloc_max_slots - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags)) { + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. + */ + for (int d = 0; d < slots; d++) { + reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +{ + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (int d = 0; d < slots; d++) { + (void) refcount_remove(&mc->mc_alloc_slots, zio); + } + mutex_exit(&mc->mc_lock); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -2608,11 +2831,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); + } else { + /* + * Update the metaslab group's queue depth + * based on the newly allocated dva. 
+ */ + metaslab_group_alloc_increment(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); } + } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index bae4a78..7e9265a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -74,6 +74,13 @@ refcount_create(refcount_t *rc) } void +refcount_create_tracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + +void refcount_create_untracked(refcount_t *rc) { refcount_create(rc); @@ -232,4 +239,84 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. 
+ */ +boolean_t +refcount_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. + */ +boolean_t +refcount_not_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} #endif /* ZFS_DEBUG */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index c8cadc2..c94f63d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -1333,7 +1333,6 @@ spa_unload(spa_t *spa) ddt_unload(spa); - /* * Drop and purge level 2 cache */ @@ -3638,6 +3637,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; /* * Create "The Godfather" zio to hold all async IOs @@ -3823,6 +3823,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); @@ -5522,7 +5523,7 @@ 
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) + nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; @@ -6635,6 +6636,8 @@ spa_sync(spa_t *spa, uint64_t txg) vdev_t *vd; dmu_tx_t *tx; int error; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; VERIFY(spa_writeable(spa)); @@ -6646,6 +6649,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. @@ -6705,6 +6712,38 @@ spa_sync(spa_t *spa, uint64_t txg) } /* + * Set the top-level vdev's max queue depth. Evaluate each + * top-level's async write queue depth in case it changed. + * The max queue depth will not change in the middle of syncing + * out this txg. + */ + uint64_t queue_depth_total = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != spa_normal_class(spa) || + !metaslab_group_initialized(mg)) + continue; + + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). 
+ */ + ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + mg->mg_max_alloc_queue_depth = max_queue_depth; + queue_depth_total += mg->mg_max_alloc_queue_depth; + } + metaslab_class_t *mc = spa_normal_class(spa); + ASSERT0(refcount_count(&mc->mc_alloc_slots)); + mc->mc_alloc_max_slots = queue_depth_total; + mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + + ASSERT3U(mc->mc_alloc_max_slots, <=, + max_queue_depth * rvd->vdev_children); + + /* * Iterate to convergence. */ do { @@ -6846,6 +6885,10 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * Update usable space statistics. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index ee37dec..2fd7f49 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -657,6 +657,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -713,6 +714,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_active_count++; } + avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + /* * Every pool starts with the default cachefile */ @@ -791,6 +795,7 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } + avl_destroy(&spa->spa_alloc_tree); list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); @@ -824,6 +829,7 @@ spa_remove(spa_t *spa) 
cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index 714c528..5bf6ddd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -43,51 +43,83 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef int arc_evict_func_t(void *priv); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. 
*/ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. 
*/ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { @@ -95,11 +127,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -119,19 +150,17 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t 
*child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv); -boolean_t arc_clear_callback(arc_buf_t *buf); - void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 4964126..6862599 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -36,6 +36,7 @@ #include <sys/zfs_context.h> #include <sys/refcount.h> #include <sys/zrlock.h> +#include <sys/multilist.h> #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. 
*/ @@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 4de0b8e..e076088 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. 
@@ -78,6 +78,7 @@ struct file; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; +typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, @@ -418,7 +419,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 -void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, +void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. @@ -444,7 +445,7 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); -int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, +int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); @@ -464,6 +465,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); +int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags); /* * Add a reference to a dmu buffer that has already been held via @@ -616,6 +619,10 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); */ void *dmu_buf_get_user(dmu_buf_t *db); +objset_t *dmu_buf_get_objset(dmu_buf_t *db); +dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); +void dmu_buf_dnode_exit(dmu_buf_t *db); + /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -798,7 +805,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. 
*/ -void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 8cb6341..6f913e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -301,6 +301,8 @@ typedef struct dmu_sendarg { uint64_t dsa_last_data_offset; uint64_t dsa_resume_object; uint64_t dsa_resume_offset; + boolean_t dsa_sent_begin; + boolean_t dsa_sent_end; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 98780b2..e55faf3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -149,7 +149,7 @@ typedef struct dnode_phys { blkptr_t dn_spill; } dnode_phys_t; -typedef struct dnode { +struct dnode { /* * Protects the structure of the dnode, including the number of levels * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* @@ -247,7 +247,7 @@ typedef struct dnode { /* holds prefetch structure */ struct zfetch dn_zfetch; -} dnode_t; +}; /* * Adds a level of indirection between the dbuf and the dnode to avoid diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 17d15d7..54c63b9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -86,13 +86,6 @@ struct dsl_pool; #define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" /* - * This field is present (with value=0) if this dataset may contain large - * blocks (>128KB). If it is present, then this dataset - * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature. - */ -#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" - -/* * These fields are set on datasets that are in the middle of a resumable * receive, and allow the sender to resume the send if it is interrupted. */ @@ -272,7 +265,6 @@ int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); -void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 74031f1..592aea5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); -#define METASLAB_HINTBP_FAVOR 0x0 -#define METASLAB_HINTBP_AVOID 0x1 -#define METASLAB_GANG_HEADER 0x2 -#define METASLAB_GANG_CHILD 0x4 -#define METASLAB_GANG_AVOID 0x8 +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_ASYNC_ALLOC 0x8 +#define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); @@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, + zio_t *, int); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, int64_t, int64_t); @@ -86,10 +90,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); +boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t 
*); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int); +void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index eb7c932..071a6d5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -59,11 +59,42 @@ extern "C" { * to use a block allocator that best suits that class. */ struct metaslab_class { + kmutex_t mc_lock; spa_t *mc_spa; metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; uint64_t mc_aliquot; + + /* + * Track the number of metaslab groups that have been initialized + * and can accept allocations. An initialized metaslab group is + * one has been completely added to the config (i.e. we have + * updated the MOS config and the space has been added to the pool). + */ + uint64_t mc_groups; + + /* + * Toggle to enable/disable the allocation throttle. + */ + boolean_t mc_alloc_throttle_enabled; + + /* + * The allocation throttle works on a reservation system. Whenever + * an asynchronous zio wants to perform an allocation it must + * first reserve the number of blocks that it wants to allocate. + * If there aren't sufficient slots available for the pending zio + * then that I/O is throttled until more slots free up. The current + * number of reserved allocations is maintained by the mc_alloc_slots + * refcount. The mc_alloc_max_slots value determines the maximum + * number of allocations that the system allows. 
Gang blocks are + * allowed to reserve slots even if we've reached the maximum + * number of allocations allowed. + */ + uint64_t mc_alloc_max_slots; + refcount_t mc_alloc_slots; + uint64_t mc_alloc_groups; /* # of allocatable groups */ + uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ @@ -86,6 +117,15 @@ struct metaslab_group { avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ + + /* + * A metaslab group is considered to be initialized only after + * we have updated the MOS config and added the space to the pool. + * We only allow allocation attempts to a metaslab group if it + * has been initialized. + */ + boolean_t mg_initialized; + uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; @@ -94,6 +134,27 @@ struct metaslab_group { taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; + + /* + * Each metaslab group can handle mg_max_alloc_queue_depth allocations + * which are tracked by mg_alloc_queue_depth. It's possible for a + * metaslab group to handle more allocations than its max. This + * can occur when gang blocks are required or when other groups + * are unable to handle their share of allocations. + */ + uint64_t mg_max_alloc_queue_depth; + refcount_t mg_alloc_queue_depth; + + /* + * A metaslab group that can no longer allocate the minimum block + * size will set mg_no_free_space. Once a metaslab group is out + * of space then its share of work must be distributed to other + * groups.
+ */ + boolean_t mg_no_free_space; + + uint64_t mg_allocations; + uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index 3423645..11baa58 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_REFCOUNT_H @@ -64,6 +64,7 @@ typedef struct refcount { void refcount_create(refcount_t *rc); void refcount_create_untracked(refcount_t *rc); +void refcount_create_tracked(refcount_t *rc); void refcount_destroy(refcount_t *rc); void refcount_destroy_many(refcount_t *rc, uint64_t number); int refcount_is_zero(refcount_t *rc); @@ -73,6 +74,9 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); +boolean_t refcount_held(refcount_t *, void *); +boolean_t refcount_not_held(refcount_t *, void *); void refcount_sysinit(void); void refcount_fini(void); @@ -85,6 +89,7 @@ typedef struct refcount { #define refcount_create(rc) ((rc)->rc_count = 0) #define refcount_create_untracked(rc) ((rc)->rc_count = 0) +#define refcount_create_tracked(rc) ((rc)->rc_count = 0) #define refcount_destroy(rc) ((rc)->rc_count = 0) #define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) #define refcount_is_zero(rc) ((rc)->rc_count == 0) @@ -100,6 +105,9 @@ typedef struct refcount { 
atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) +#define refcount_held(rc, holder) ((rc)->rc_count > 0) +#define refcount_not_held(rc, holder) (B_TRUE) #define refcount_sysinit() #define refcount_fini() diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 105f889..984c174 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. @@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index b454b4e..f8afa18 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -159,6 +159,8 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ + 
kmutex_t spa_alloc_lock; + avl_tree_t spa_alloc_tree; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 77e291b..6f3013d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +extern int zfs_vdev_queue_depth_pct; +extern uint32_t zfs_vdev_async_write_max_active; + /* * Virtual device operations */ @@ -190,7 +193,18 @@ struct vdev { uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ - boolean_t vdev_ishole; /* is a hole in the namespace */ + boolean_t vdev_ishole; /* is a hole in the namespace */ + kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ + + /* + * The queue depth parameters determine how many async writes are + * still pending (i.e. allocated but not yet issued to disk) per + * top-level (vdev_async_write_queue_depth) and the maximum allowed + * (vdev_max_async_write_queue_depth). These values only apply to + * top-level vdevs. + */ + uint64_t vdev_async_write_queue_depth; + uint64_t vdev_max_async_write_queue_depth; /* * Leaf vdev state. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index 2f5ce88..18da25c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZAP_H @@ -216,8 +216,14 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); +int zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp); -int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, +int zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, refcount_t *towrite, refcount_t *tooverwrite); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h index 99123a6..dc522f9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -197,8 +197,8 @@ typedef struct zap_name { boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); -void zap_unlockdir(zap_t *zap); + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); +void zap_unlockdir(zap_t *zap, void *tag); void zap_evict(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); @@ -217,9 +217,10 @@ void fzap_prefetch(zap_name_t *zn); int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite, refcount_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); + const void *val, void *tag, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx); int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); @@ -229,7 +230,7 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx); + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index fc7be5b..741c673 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -175,6 +175,7 @@ enum zio_flag { ZIO_FLAG_DONT_CACHE = 1 << 11, ZIO_FLAG_NODATA = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, + 
ZIO_FLAG_IO_ALLOCATING = 1 << 14, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -182,27 +183,27 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 15, - ZIO_FLAG_TRYHARD = 1 << 16, - ZIO_FLAG_OPTIONAL = 1 << 17, + ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 16, + ZIO_FLAG_TRYHARD = 1 << 17, + ZIO_FLAG_OPTIONAL = 1 << 18, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 19, - ZIO_FLAG_IO_BYPASS = 1 << 20, - ZIO_FLAG_IO_REWRITE = 1 << 21, - ZIO_FLAG_RAW = 1 << 22, - ZIO_FLAG_GANG_CHILD = 1 << 23, - ZIO_FLAG_DDT_CHILD = 1 << 24, - ZIO_FLAG_GODFATHER = 1 << 25, - ZIO_FLAG_NOPWRITE = 1 << 26, - ZIO_FLAG_REEXECUTED = 1 << 27, - ZIO_FLAG_DELEGATED = 1 << 28, + ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 20, + ZIO_FLAG_IO_BYPASS = 1 << 21, + ZIO_FLAG_IO_REWRITE = 1 << 22, + ZIO_FLAG_RAW = 1 << 23, + ZIO_FLAG_GANG_CHILD = 1 << 24, + ZIO_FLAG_DDT_CHILD = 1 << 25, + ZIO_FLAG_GODFATHER = 1 << 26, + ZIO_FLAG_NOPWRITE = 1 << 27, + ZIO_FLAG_REEXECUTED = 1 << 28, + ZIO_FLAG_DELEGATED = 1 << 29, }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -243,6 +244,7 @@ enum zio_wait_type { typedef void zio_done_func_t(zio_t *zio); +extern boolean_t zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; /* @@ -430,7 +432,6 @@ struct zio { blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; - zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -456,9 +457,11 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; + hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; avl_node_t io_queue_node; avl_node_t io_offset_node; + avl_node_t 
io_alloc_node; /* Internal pipeline state */ enum zio_flag io_flags; @@ -467,6 +470,7 @@ struct zio { enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; + enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; @@ -492,6 +496,8 @@ struct zio { list_node_t io_trim_link; }; +extern int zio_timestamp_compare(const void *, const void *); + extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, enum zio_flag flags); @@ -554,8 +560,8 @@ extern void zio_interrupt(zio_t *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); -extern zio_t *zio_walk_parents(zio_t *cio); -extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); +extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); @@ -564,6 +570,10 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index 0a9d772..f4b0faa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; #endif +extern int zio_checksum_equal(spa_t 
*, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index 33b8edb..96b3b01 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _ZIO_IMPL_H @@ -108,35 +108,37 @@ enum zio_stage { ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ - ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE = 
1 << 11, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ - ZIO_STAGE_READY = 1 << 16, /* RWFCI */ + ZIO_STAGE_READY = 1 << 18, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */ + ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ - ZIO_STAGE_DONE = 1 << 21 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -187,22 +189,27 @@ enum zio_stage { #define ZIO_REWRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_COMPRESS | \ ZIO_STAGE_WRITE_BP_INIT) #define ZIO_WRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) #define ZIO_DDT_CHILD_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) #define ZIO_DDT_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_COMPRESS | \ ZIO_STAGE_CHECKSUM_GENERATE | \ ZIO_STAGE_DDT_WRITE) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 28ff6c9..35b4ae3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -441,6 +441,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); @@ -757,6 +758,7 @@ vdev_free(vdev_t *vd) } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -1071,7 +1073,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); - while ((pio = zio_walk_parents(zio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); @@ -2780,7 +2783,8 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole); + !vd->vdev_cant_write && !vd->vdev_ishole && + vd->vdev_mg->mg_initialized); } boolean_t @@ -2808,6 +2812,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd = vd->vdev_top; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); @@ -2818,8 +2823,15 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - if (vd->vdev_max_asize != 0) - vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; + /* + * Report expandable space on top-level, non-auxiliary devices only. 
+ * The expandable space is reported in terms of metaslab sized units + * since that determines how much space the pool can expand. + */ + if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { + vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); + } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index 50d8593..37309a1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -250,7 +250,8 @@ vdev_cache_fill(zio_t *fio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. 
*/ - while ((pio = zio_walk_parents(fio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(fio, &zl)) != NULL) vdev_cache_hit(vc, ve, pio); if (fio->io_error || ve->ve_missed_update) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 681a670..cd3444e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -241,34 +241,6 @@ vdev_disk_rele(vdev_t *vd) } } -static uint64_t -vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) -{ - ASSERT(vd->vdev_wholedisk); - - vdev_disk_t *dvd = vd->vdev_tsd; - dk_efi_t dk_ioc; - efi_gpt_t *efi; - uint64_t avail_space = 0; - int efisize = EFI_LABEL_SIZE * 2; - - dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); - dk_ioc.dki_lba = 1; - dk_ioc.dki_length = efisize; - dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; - efi = dk_ioc.dki_data; - - if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, - FKIOCTL, kcred, NULL) == 0) { - uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); - - if (capacity > efi_altern_lba) - avail_space = (capacity - efi_altern_lba) * blksz; - } - kmem_free(dk_ioc.dki_data, efisize); - return (avail_space); -} - /* * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when * even a fallback to DKIOCGMEDIAINFO fails. @@ -559,10 +531,7 @@ skip_open: * Adjust max_psize upward accordingly since we know * we own the whole disk now. 
*/ - *max_psize += vdev_disk_get_space(vd, capacity, blksz); - zfs_dbgmsg("capacity change: vdev %s, psize %llu, " - "max_psize %llu", vd->vdev_path, *psize, - *max_psize); + *max_psize = capacity * blksz; } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 0ac5fb7..ac994f5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -730,7 +730,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, if (vd->vdev_spa->spa_splitting_newspa || (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && - vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) { + vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 70e0be9..f159d3e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -299,9 +299,10 @@ vdev_mirror_scrub_done(zio_t *zio) if (zio->io_error == 0) { zio_t *pio; + zio_link_t *zl = NULL; mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio)) != NULL) { + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); bcopy(zio->io_data, pio->io_data, pio->io_size); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 55405b7..bec3f84 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -34,6 +34,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/metaslab_impl.h> /* * ZFS I/O Scheduler @@ -175,6 +176,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; +/* + * Define the queue depth percentage for each top-level. This percentage is + * used in conjunction with zfs_vdev_async_write_max_active to determine how many + * allocations a specific top-level vdev should handle. Once the queue depth + * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 + * then the allocator will stop allocating blocks on that top-level device. + * The default kernel setting is 1000% which will yield 100 allocations per + * device. For userland testing, the default setting is 300% which equates + * to 30 allocations per device. 
+ */ +#ifdef _KERNEL +int zfs_vdev_queue_depth_pct = 1000; +#else +int zfs_vdev_queue_depth_pct = 300; +#endif + + #ifdef __FreeBSD__ #ifdef _KERNEL SYSCTL_DECL(_vfs_zfs_vdev); @@ -245,6 +263,9 @@ TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_write_gap_limit, 0, "Acceptable gap between two writes being aggregated"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, + &zfs_vdev_queue_depth_pct, 0, + "Queue depth percentage for each top-level"); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) @@ -402,6 +423,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; + ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); @@ -423,6 +445,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; + ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); @@ -492,7 +515,8 @@ vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; - while ((pio = zio_walk_parents(aio)) != NULL) { + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(aio, &zl)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 4dc931b..bf68c9f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. 
All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk, off; int err; dmu_buf_t *db; + dnode_t *dn; int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); @@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + /* + * Note: this is equivalent to dmu_buf_hold(), but we use + * _dnode_enter / _by_dnode because it's faster because we don't + * have to hold the dnode. + */ + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); if (err) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -505,8 +515,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); if (err) return (err); @@ -589,7 +601,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) } static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { 
zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; @@ -611,9 +624,9 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) uint64_t object = zap->zap_object; zap_put_leaf(l); - zap_unlockdir(zap); + zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, &zn->zn_zap); + FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err) return (err); @@ -676,7 +689,8 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) } static void -zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; @@ -696,9 +710,9 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; - zap_unlockdir(zap); + zap_unlockdir(zap, tag); err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, &zn->zn_zap); + RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err) return; @@ -788,7 +802,7 @@ fzap_lookup(zap_name_t *zn, int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx) + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -817,7 +831,7 @@ retry: if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tx, &l); + err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; @@ -825,26 +839,27 @@ retry: out: if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) + const void *val, void *tag, dmu_tx_t *tx) { int err = 
fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tx)); + val, ZAP_NEED_CD, tag, tx)); } int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err, create; @@ -874,14 +889,14 @@ retry: } if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tx, &l); + err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index fbd7e02..f013101 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -43,7 +43,8 @@ extern inline mzap_phys_t *zap_m_phys(zap_t *zap); -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +static int mzap_upgrade(zap_t **zapp, + void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) @@ -459,20 +460,18 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) return (zap); } -int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, +static int +zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; - dmu_buf_t *db; krw_t lt; - int err; - *zapp = NULL; + ASSERT0(db->db_offset); + objset_t *os = dmu_buf_get_objset(db); + uint64_t obj = db->db_object; - err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH); - if (err) - return (err); + *zapp = NULL; #ifdef ZFS_DEBUG { @@ -520,10 +519,12 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, tx, 0)); + int err = mzap_upgrade(zapp, tag, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); } - err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); - ASSERT0(err); + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } @@ -532,15 +533,49 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, return (0); } +static int +zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + return (err); + } + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { + dmu_buf_rele(db, tag); + } + return (err); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t 
fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + return (err); +} + void -zap_unlockdir(zap_t *zap) +zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf, NULL); + dmu_buf_rele(zap->zap_dbuf, tag); } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) +mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; int i, sz, nchunks; @@ -578,7 +613,8 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, + tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err) @@ -617,9 +653,9 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, zap_t *zap; /* Only fat zap supports flags; upgrade immediately. 
*/ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, - B_FALSE, B_FALSE, &zap)); - VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); - zap_unlockdir(zap); + B_FALSE, B_FALSE, FTAG, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags)); + zap_unlockdir(zap, FTAG); } } @@ -714,7 +750,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) zap_t *zap; int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); if (!zap->zap_ismicro) { @@ -722,7 +758,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) } else { *count = zap->zap_m.zap_num_entries; } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -779,25 +815,19 @@ zap_lookup(objset_t *os, uint64_t zapobj, const char *name, num_integers, buf, MT_EXACT, NULL, 0, NULL)); } -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, +static int +zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { - zap_t *zap; - int err; + int err = 0; mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); - if (err) - return (err); zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) { - zap_unlockdir(zap); + if (zn == NULL) return (SET_ERROR(ENOTSUP)); - } if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, @@ -824,7 +854,51 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } } zap_name_free(zn); - zap_unlockdir(zap); + return (err); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + 
err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, MT_EXACT, NULL, 0, NULL)); +} + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); return (err); } @@ -836,18 +910,18 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -859,19 +933,19 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -894,12 +968,12 @@ 
zap_length(objset_t *os, uint64_t zapobj, const char *name, mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { @@ -916,7 +990,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, } } zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -928,17 +1002,17 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -997,22 +1071,24 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, const uint64_t *intval = val; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { - err = mzap_upgrade(&zn->zn_zap, tx, 0); - if (err == 0) - err = fzap_add(zn, integer_size, num_integers, val, tx); + err 
= mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, + FTAG, tx); + } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { mze = mze_find(zn); @@ -1025,7 +1101,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1038,19 +1114,19 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } - err = fzap_add(zn, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1074,25 +1150,27 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, + FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu 
name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx, 0); - if (err == 0) + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { err = fzap_update(zn, integer_size, num_integers, - val, tx); + val, FTAG, tx); + } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { mze = mze_find(zn); @@ -1106,7 +1184,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1119,19 +1197,19 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_name_t *zn; int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1150,12 +1228,12 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, mt); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { @@ -1172,7 +1250,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, } } zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1184,17 +1262,17 
@@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1226,7 +1304,7 @@ zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap); + zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { @@ -1273,7 +1351,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, &zc->zc_zap); + RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err) return (err); @@ -1340,7 +1418,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, &zc->zc_zap); + RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); if (err) return (err); } else { @@ -1377,7 +1455,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); @@ -1390,12 +1468,12 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) } else { fzap_get_stats(zap, zs); } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (0); } int -zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, +zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, refcount_t *towrite, refcount_t *tooverwrite) { zap_t *zap; @@ -1409,7 
+1487,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks * - * This also accomodates the case where an add operation to a fairly + * This also accommodates the case where an add operation to a fairly * large microzap results in a promotion to fatzap. */ if (name == NULL) { @@ -1422,10 +1500,11 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation - * and hence we donot want to trigger an upgrade. + * and hence we do not want to trigger an upgrade. */ - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); - if (err) + err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) return (err); if (!zap->zap_ismicro) { @@ -1471,6 +1550,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, } } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 66e53f2..29cf371 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -256,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -293,7 +294,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 98fd449..9a57fd4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -41,6 +41,7 @@ #include <sys/trim_map.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/metaslab_impl.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -80,6 +81,10 @@ const char *zio_type_name[ZIO_TYPES] = { "zio_ioctl" }; +boolean_t zio_dva_throttle_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, + &zio_dva_throttle_enabled, 0, ""); + /* * ========================================================================== * I/O kmem caches @@ -141,6 +146,8 @@ int zio_buf_debug_limit = 0; #endif #endif +static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); + void zio_init(void) { @@ -334,7 +341,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -352,7 +359,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -401,52 +408,39 
@@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ -/* - * NOTE - Callers to zio_walk_parents() and zio_walk_children must - * continue calling these functions until they return NULL. - * Otherwise, the next caller will pick up the list walk in - * some indeterminate state. (Otherwise every caller would - * have to pass in a cookie to keep the state represented by - * io_walk_link, which gets annoying.) - */ zio_t * -zio_walk_parents(zio_t *cio) +zio_walk_parents(zio_t *cio, zio_link_t **zl) { - zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; - zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); - cio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_child == cio); - return (zl->zl_parent); + ASSERT((*zl)->zl_child == cio); + return ((*zl)->zl_parent); } zio_t * -zio_walk_children(zio_t *pio) +zio_walk_children(zio_t *pio, zio_link_t **zl) { - zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; - zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); - pio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_parent == pio); - return (zl->zl_child); + ASSERT((*zl)->zl_parent == pio); + return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { - zio_t *pio = zio_walk_parents(cio); + zio_link_t *zl = NULL; + zio_t *pio = zio_walk_parents(cio, &zl); - VERIFY(zio_walk_parents(cio) == NULL); + VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } @@ -515,6 +509,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; } @@ -538,9 +533,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) (*countp)--; if (*countp == 0 && pio->io_stall == countp) { + zio_taskq_type_t type = + pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : + ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); - zio_execute(pio); + /* + * Dispatch the parent zio in its own taskq so that + * the child can continue to make progress. This also + * prevents overflowing the stack when we have deeply nested + * parent-child relationships. 
+ */ + zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } @@ -553,6 +557,30 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) zio->io_error = zio->io_child_error[c]; } +int +zio_timestamp_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_queued_timestamp < z2->io_queued_timestamp) + return (-1); + if (z1->io_queued_timestamp > z2->io_queued_timestamp) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -621,6 +649,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -818,7 +847,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); @@ -939,6 +968,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ASSERT0(zio->io_queued_timestamp); return (zio); } @@ -1027,8 +1057,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t 
offset, - void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + void *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -1063,9 +1093,30 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; + /* + * If we're creating a child I/O that is not associated with a + * top-level vdev, then the child zio is not an allocating I/O. + * If this is a retried I/O then we ignore it since we will + * have already processed the original allocating I/O. + */ + if (flags & ZIO_FLAG_IO_ALLOCATING && + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + metaslab_class_t *mc = spa_normal_class(pio->io_spa); + + ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(type == ZIO_TYPE_WRITE); + ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || + pio->io_child_type == ZIO_CHILD_GANG); + + flags &= ~ZIO_FLAG_IO_ALLOCATING; + } + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) @@ -1172,38 +1223,15 @@ zio_read_bp_init(zio_t *zio) static int zio_write_bp_init(zio_t *zio) { - spa_t *spa = zio->io_spa; - zio_prop_t *zp = &zio->io_prop; - enum zio_compress compress = zp->zp_compress; - blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_size; - uint64_t psize = lsize; - int pass = 1; - - /* - * If our children haven't all reached the ready stage, - * wait for them and then repeat this pipeline stage. 
- */ - if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || - zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); - if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); - if (zio->io_children_ready != NULL) { - /* - * Now that all our children are ready, run the callback - * associated with this zio in case it wants to modify the - * data to be written. - */ - ASSERT3U(zp->zp_level, >, 0); - zio->io_children_ready(zio); - } - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { + blkptr_t *bp = zio->io_bp; + zio_prop_t *zp = &zio->io_prop; + ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); @@ -1220,6 +1248,7 @@ zio_write_bp_init(zio_t *zio) */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } @@ -1237,10 +1266,54 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ zio->io_bp_override = NULL; - BP_ZERO(bp); + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_write_compress(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_prop_t *zp = &zio->io_prop; + enum zio_compress compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + uint64_t lsize = zio->io_size; + uint64_t psize = lsize; + int pass = 1; + + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. 
+ */ + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + if (zio->io_children_ready != NULL) { + /* + * Now that all our children are ready, run the callback + * associated with this zio in case it wants to modify the + * data to be written. + */ + ASSERT3U(zp->zp_level, >, 0); + zio->io_children_ready(zio); } + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + ASSERT(zio->io_bp_override == NULL); + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're @@ -1308,6 +1381,14 @@ zio_write_bp_init(zio_t *zio) psize, lsize, NULL); } } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; } /* @@ -1360,7 +1441,6 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); } @@ -1537,6 +1617,8 @@ zio_execute(zio_t *zio) { zio->io_executor = curthread; + ASSERT3U(zio->io_queued_timestamp, >, 0); + while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; @@ -1570,6 +1652,7 @@ zio_execute(zio_t *zio) } zio->io_stage = stage; + zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) @@ -1593,6 +1676,8 @@ zio_wait(zio_t *zio) ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); zio_execute(zio); @@ -1624,6 +1709,8 @@ zio_nowait(zio_t *zio) zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); zio_execute(zio); } @@ -1648,6 +1735,7 @@ zio_reexecute(zio_t *pio) 
pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; + pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; @@ -1664,8 +1752,9 @@ zio_reexecute(zio_t *pio) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ - for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio); + zio_link_t *zl = NULL; + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; @@ -1678,8 +1767,10 @@ zio_reexecute(zio_t *pio) * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ - if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { + pio->io_queued_timestamp = gethrtime(); zio_execute(pio); + } } void @@ -2073,6 +2164,7 @@ static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2086,10 +2178,43 @@ zio_write_gang_block(zio_t *pio) zio_prop_t zp; int error; - error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, - METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + flags |= METASLAB_ASYNC_ALLOC; + VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + + /* + * The logical zio has already placed a reservation for + * 'copies' allocation slots but gang blocks may require + * additional copies. 
These additional copies + * (i.e. gbh_copies - copies) are guaranteed to succeed + * since metaslab_class_throttle_reserve() always allows + * additional reservations for gang blocks. + */ + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, + pio, flags)); + } + + error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); if (error) { + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * If we failed to allocate the gang block header then + * we remove any additional allocation reservations that + * we placed here. The original reservation will + * be removed when the logical I/O goes to the ready + * stage. + */ + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio); + } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } @@ -2128,11 +2253,25 @@ zio_write_gang_block(zio_t *pio) zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; - zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * Gang children won't throttle but we should + * account for their work, so reserve an allocation + * slot for them here. 
+ */ + VERIFY(metaslab_class_throttle_reserve(mc, + zp.zp_copies, cio, flags)); + } + zio_nowait(cio); } /* @@ -2361,7 +2500,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); @@ -2390,7 +2529,8 @@ zio_ddt_child_write_ready(zio_t *zio) ddt_phys_fill(ddp, zio->io_bp); - while ((pio = zio_walk_parents(zio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); @@ -2411,7 +2551,8 @@ zio_ddt_child_write_done(zio_t *zio) dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { - while (zio_walk_parents(zio) != NULL) + zio_link_t *zl = NULL; + while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); @@ -2589,6 +2730,97 @@ zio_ddt_free(zio_t *zio) * Allocate and free blocks * ========================================================================== */ + +static zio_t * +zio_io_to_allocate(spa_t *spa) +{ + zio_t *zio; + + ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + + zio = avl_first(&spa->spa_alloc_tree); + if (zio == NULL) + return (NULL); + + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Try to place a reservation for this zio. If we're unable to + * reserve then we throttle. 
+ */ + if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + zio->io_prop.zp_copies, zio, 0)) { + return (NULL); + } + + avl_remove(&spa->spa_alloc_tree, zio); + ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + + return (zio); +} + +static int +zio_dva_throttle(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *nio; + + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || + !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + zio->io_child_type == ZIO_CHILD_GANG || + zio->io_flags & ZIO_FLAG_NODATA) { + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + ASSERT3U(zio->io_queued_timestamp, >, 0); + ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); + + mutex_enter(&spa->spa_alloc_lock); + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + avl_add(&spa->spa_alloc_tree, zio); + + nio = zio_io_to_allocate(zio->io_spa); + mutex_exit(&spa->spa_alloc_lock); + + if (nio == zio) + return (ZIO_PIPELINE_CONTINUE); + + if (nio != NULL) { + ASSERT3U(nio->io_queued_timestamp, <=, + zio->io_queued_timestamp); + ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); + /* + * We are passing control to a new zio so make sure that + * it is processed by a different thread. We do this to + * avoid stack overflows that can occur when parents are + * throttled and children are making progress. We allow + * it to go to the head of the taskq since it's already + * been waiting. 
+ */ + zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); + } + return (ZIO_PIPELINE_STOP); +} + +void +zio_allocate_dispatch(spa_t *spa) +{ + zio_t *zio; + + mutex_enter(&spa->spa_alloc_lock); + zio = zio_io_to_allocate(spa); + mutex_exit(&spa->spa_alloc_lock); + if (zio == NULL) + return; + + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); +} + static int zio_dva_allocate(zio_t *zio) { @@ -2609,18 +2841,20 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - /* - * The dump device does not support gang blocks so allocation on - * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid - * the "fast" gang feature. - */ - flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; - flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? - METASLAB_GANG_CHILD : 0; + if (zio->io_flags & ZIO_FLAG_NODATA) { + flags |= METASLAB_DONT_THROTTLE; + } + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { + flags |= METASLAB_GANG_CHILD; + } + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { + flags |= METASLAB_ASYNC_ALLOC; + } + error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); - if (error) { + if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); @@ -2685,21 +2919,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); - /* - * ZIL blocks are always contiguous (i.e. not gang blocks) so we - * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" - * when allocating them. 
- */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); } if (error == 0) { @@ -2775,6 +3002,8 @@ zio_vdev_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } + ASSERT3P(zio->io_logical, !=, zio); + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -3193,6 +3422,7 @@ zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; + zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) @@ -3210,12 +3440,26 @@ zio_ready(zio_t *zio) if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; - if (zio->io_error) + if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + /* + * We were unable to allocate anything, unreserve and + * issue the next I/O to allocate. + */ + metaslab_class_throttle_unreserve( + spa_normal_class(zio->io_spa), + zio->io_prop.zp_copies, zio); + zio_allocate_dispatch(zio->io_spa); + } + } + mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; - pio = zio_walk_parents(zio); + pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* @@ -3226,7 +3470,7 @@ zio_ready(zio_t *zio) * all parents must wait for us to be done before they can be done. 
*/ for (; pio != NULL; pio = pio_next) { - pio_next = zio_walk_parents(zio); + pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } @@ -3246,6 +3490,66 @@ zio_ready(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +/* + * Update the allocation throttle accounting. + */ +static void +zio_dva_throttle_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + vdev_t *vd = zio->io_vd; + int flags = METASLAB_ASYNC_ALLOC; + + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT(vd != NULL); + ASSERT3P(vd, ==, vd->vdev_top); + ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); + ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); + + /* + * Parents of gang children can have two flavors -- ones that + * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) + * and ones that allocated the constituent blocks. The allocation + * throttle needs to know the allocating parent zio so we must find + * it here. + */ + if (pio->io_child_type == ZIO_CHILD_GANG) { + /* + * If our parent is a rewrite gang child then our grandparent + * would have been the one that performed the allocation. 
+ */ + if (pio->io_flags & ZIO_FLAG_IO_REWRITE) + pio = zio_unique_parent(pio); + flags |= METASLAB_GANG_CHILD; + } + + ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT3P(zio, !=, zio->io_logical); + ASSERT(zio->io_logical != NULL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + + mutex_enter(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + mutex_exit(&pio->io_lock); + + metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), + 1, pio); + + /* + * Call into the pipeline to see if there is more work that + * needs to be done. If there is work to be done it will be + * dispatched to another taskq thread. + */ + zio_allocate_dispatch(zio->io_spa); +} + static int zio_done(zio_t *zio) { @@ -3255,6 +3559,8 @@ zio_done(zio_t *zio) vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; + metaslab_class_t *mc = spa_normal_class(spa); + zio_link_t *zl = NULL; /* * If our children haven't all completed, @@ -3266,6 +3572,30 @@ zio_done(zio_t *zio) zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); + /* + * If the allocation throttle is enabled, then update the accounting. + * We only track child I/Os that are part of an allocating async + * write. We must do this since the allocation is performed + * by the logical I/O but the actual write is done by child I/Os. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + zio->io_child_type == ZIO_CHILD_VDEV) { + ASSERT(mc->mc_alloc_throttle_enabled); + zio_dva_throttle_done(zio); + } + + /* + * If the allocation throttle is enabled, verify that + * we have decremented the refcounts for every I/O that was throttled. 
+ */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(bp != NULL); + metaslab_group_alloc_verify(spa, zio->io_bp, zio); + VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); + } + for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); @@ -3435,13 +3765,15 @@ zio_done(zio_t *zio) * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; + pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { - zio_remove_child(pio, zio, zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } @@ -3505,10 +3837,11 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); - zio_remove_child(pio, zio, zl); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } @@ -3532,9 +3865,10 @@ zio_done(zio_t *zio) static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, + zio_write_bp_init, zio_free_bp_init, zio_issue_async, - zio_write_bp_init, + zio_write_compress, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, @@ -3543,6 +3877,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_ddt_free, zio_gang_assemble, zio_gang_issue, + zio_dva_throttle, 
zio_dva_allocate, zio_dva_free, zio_dva_claim, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c index dac118a..ddd13fe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; - spa_t *spa = zio->io_spa; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); @@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) spa->spa_cksum_tmpls[checksum], &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } - info->zbc_expected = 
expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 73a30a1..81cf1d7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -900,7 +900,8 @@ typedef enum { SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ - SPA_LOAD_ERROR /* load failed */ + SPA_LOAD_ERROR, /* load failed */ + SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 7005665..4625ab8 100644 --- a/sys/conf/files.amd64 +++ 
b/sys/conf/files.amd64 @@ -269,7 +269,6 @@ dev/hwpmc/hwpmc_x86.c optional hwpmc dev/hyperv/netvsc/hv_net_vsc.c optional hyperv dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv -dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv @@ -283,6 +282,7 @@ dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv +dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/amd64/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/amd64/vmbus_vector.S optional hyperv dev/lindev/full.c optional lindev diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index 8fa32d4..7640410 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -246,7 +246,6 @@ dev/hwpmc/hwpmc_x86.c optional hwpmc dev/hyperv/netvsc/hv_net_vsc.c optional hyperv dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv -dev/hyperv/stordisengage/hv_ata_pci_disengage.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv @@ -260,6 +259,7 @@ dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv +dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/i386/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/i386/vmbus_vector.S optional hyperv dev/ichwd/ichwd.c optional ichwd diff --git a/sys/dev/hyperv/include/vmbus.h b/sys/dev/hyperv/include/vmbus.h index bf72a9b..8cf13fa 100644 --- a/sys/dev/hyperv/include/vmbus.h +++ b/sys/dev/hyperv/include/vmbus.h @@ -84,12 
+84,18 @@ struct vmbus_chanpkt_hdr { #define VMBUS_CHANPKT_TYPE_GPA 0x0009 #define VMBUS_CHANPKT_TYPE_COMP 0x000b +#define VMBUS_CHANPKT_FLAG_NONE 0 #define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ #define VMBUS_CHANPKT_CONST_DATA(pkt) \ (const void *)((const uint8_t *)(pkt) + \ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) +/* Include padding */ +#define VMBUS_CHANPKT_DATALEN(pkt) \ + (VMBUS_CHANPKT_GETLEN((pkt)->cph_tlen) -\ + VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) + struct vmbus_rxbuf_desc { uint32_t rb_len; uint32_t rb_ofs; @@ -103,8 +109,16 @@ struct vmbus_chanpkt_rxbuf { struct vmbus_rxbuf_desc cp_rxbuf[]; } __packed; +struct vmbus_chan_br { + void *cbr; + bus_addr_t cbr_paddr; + int cbr_txsz; + int cbr_rxsz; +}; + struct vmbus_channel; struct hyperv_guid; +struct task; typedef void (*vmbus_chan_callback_t)(struct vmbus_channel *, void *); @@ -117,7 +131,13 @@ vmbus_get_channel(device_t dev) int vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg); +int vmbus_chan_open_br(struct vmbus_channel *chan, + const struct vmbus_chan_br *cbr, const void *udata, + int udlen, vmbus_chan_callback_t cb, void *cbarg); void vmbus_chan_close(struct vmbus_channel *chan); +void vmbus_chan_intr_drain(struct vmbus_channel *chan); +void vmbus_chan_run_task(struct vmbus_channel *chan, + struct task *task); int vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl); @@ -156,5 +176,9 @@ uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan); bool vmbus_chan_is_primary(const struct vmbus_channel *chan); const struct hyperv_guid * vmbus_chan_guid_inst(const struct vmbus_channel *chan); +int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, + int dlen_max); +bool vmbus_chan_rx_empty(const struct vmbus_channel *chan); +bool vmbus_chan_tx_empty(const struct vmbus_channel *chan); #endif /* !_VMBUS_H_ */ diff --git 
a/sys/dev/hyperv/include/vmbus_xact.h b/sys/dev/hyperv/include/vmbus_xact.h new file mode 100644 index 0000000..62fda01 --- /dev/null +++ b/sys/dev/hyperv/include/vmbus_xact.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMBUS_XACT_H_ +#define _VMBUS_XACT_H_ + +#include <sys/param.h> +#include <sys/bus.h> +#include <machine/bus.h> + +struct vmbus_xact; +struct vmbus_xact_ctx; + +struct vmbus_xact_ctx *vmbus_xact_ctx_create(bus_dma_tag_t dtag, + size_t req_size, size_t resp_size, + size_t priv_size); +void vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx); +struct vmbus_xact *vmbus_xact_get(struct vmbus_xact_ctx *ctx, + size_t req_len); +void vmbus_xact_put(struct vmbus_xact *xact); + +void *vmbus_xact_req_data(const struct vmbus_xact *xact); +bus_addr_t vmbus_xact_req_paddr(const struct vmbus_xact *xact); +void *vmbus_xact_priv(const struct vmbus_xact *xact, + size_t priv_len); +void vmbus_xact_activate(struct vmbus_xact *xact); +void vmbus_xact_deactivate(struct vmbus_xact *xact); +const void *vmbus_xact_wait(struct vmbus_xact *xact, + size_t *resp_len); +void vmbus_xact_wakeup(struct vmbus_xact *xact, + const void *data, size_t dlen); +void vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, + const void *data, size_t dlen); + +#endif /* !_VMBUS_XACT_H_ */ diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.c b/sys/dev/hyperv/netvsc/hv_net_vsc.c index 5b9281f..66ff860 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.c +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.c @@ -37,6 +37,7 @@ #include <sys/param.h> #include <sys/kernel.h> #include <sys/socket.h> +#include <sys/limits.h> #include <sys/lock.h> #include <net/if.h> #include <net/if_arp.h> @@ -44,118 +45,130 @@ #include <machine/atomic.h> #include <dev/hyperv/include/hyperv.h> -#include "hv_net_vsc.h" -#include "hv_rndis.h" -#include "hv_rndis_filter.h" +#include <dev/hyperv/include/vmbus_xact.h> +#include <dev/hyperv/netvsc/hv_net_vsc.h> +#include <dev/hyperv/netvsc/hv_rndis_filter.h> +#include <dev/hyperv/netvsc/if_hnreg.h> +#include <dev/hyperv/netvsc/if_hnvar.h> MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper-V netvsc driver"); /* * Forward declarations */ -static void hv_nv_on_channel_callback(struct 
vmbus_channel *chan, - void *xrxr); -static int hv_nv_init_send_buffer_with_net_vsp(struct hn_softc *sc); -static int hv_nv_init_rx_buffer_with_net_vsp(struct hn_softc *); -static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); -static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); -static int hv_nv_connect_to_vsp(struct hn_softc *sc); -static void hv_nv_on_send_completion(netvsc_dev *net_dev, - struct vmbus_channel *, const struct vmbus_chanpkt_hdr *pkt); -static void hv_nv_on_receive_completion(struct vmbus_channel *chan, - uint64_t tid, uint32_t status); -static void hv_nv_on_receive(netvsc_dev *net_dev, - struct hn_rx_ring *rxr, struct vmbus_channel *chan, - const struct vmbus_chanpkt_hdr *pkt); - -/* - * - */ -static inline netvsc_dev * -hv_nv_alloc_net_device(struct hn_softc *sc) +static int hn_nvs_conn_chim(struct hn_softc *sc); +static int hn_nvs_conn_rxbuf(struct hn_softc *); +static int hn_nvs_disconn_chim(struct hn_softc *sc); +static int hn_nvs_disconn_rxbuf(struct hn_softc *sc); +static void hn_nvs_sent_none(struct hn_send_ctx *sndc, + struct hn_softc *, struct vmbus_channel *chan, + const void *, int); + +struct hn_send_ctx hn_send_ctx_none = + HN_SEND_CTX_INITIALIZER(hn_nvs_sent_none, NULL); + +static const uint32_t hn_nvs_version[] = { + HN_NVS_VERSION_5, + HN_NVS_VERSION_4, + HN_NVS_VERSION_2, + HN_NVS_VERSION_1 +}; + +uint32_t +hn_chim_alloc(struct hn_softc *sc) { - netvsc_dev *net_dev; + int i, bmap_cnt = sc->hn_chim_bmap_cnt; + u_long *bmap = sc->hn_chim_bmap; + uint32_t ret = HN_NVS_CHIM_IDX_INVALID; - net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_WAITOK | M_ZERO); + for (i = 0; i < bmap_cnt; ++i) { + int idx; - net_dev->sc = sc; - net_dev->destroy = FALSE; - sc->net_dev = net_dev; + idx = ffsl(~bmap[i]); + if (idx == 0) + continue; - return (net_dev); -} + --idx; /* ffsl is 1-based */ + KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, + ("invalid i %d and idx %d", i, idx)); -/* - * XXX unnecessary; nuke it. 
- */ -static inline netvsc_dev * -hv_nv_get_outbound_net_device(struct hn_softc *sc) -{ - return sc->net_dev; -} + if (atomic_testandset_long(&bmap[i], idx)) + continue; -/* - * XXX unnecessary; nuke it. - */ -static inline netvsc_dev * -hv_nv_get_inbound_net_device(struct hn_softc *sc) -{ - return sc->net_dev; + ret = i * LONG_BIT + idx; + break; + } + return (ret); } -int -hv_nv_get_next_send_section(netvsc_dev *net_dev) +static const void * +hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, + void *req, int reqlen, size_t *resplen0, uint32_t type) { - unsigned long bitsmap_words = net_dev->bitsmap_words; - unsigned long *bitsmap = net_dev->send_section_bitsmap; - unsigned long idx; - int ret = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - int i; - - for (i = 0; i < bitsmap_words; i++) { - idx = ffsl(~bitsmap[i]); - if (0 == idx) - continue; + struct hn_send_ctx sndc; + size_t resplen, min_resplen = *resplen0; + const struct hn_nvs_hdr *hdr; + int error; - idx--; - KASSERT(i * BITS_PER_LONG + idx < net_dev->send_section_count, - ("invalid i %d and idx %lu", i, idx)); + KASSERT(min_resplen >= sizeof(*hdr), + ("invalid minimum response len %zu", min_resplen)); - if (atomic_testandset_long(&bitsmap[i], idx)) - continue; + /* + * Execute the xact setup by the caller. + */ + hn_send_ctx_init_simple(&sndc, hn_nvs_sent_xact, xact); - ret = i * BITS_PER_LONG + idx; - break; + vmbus_xact_activate(xact); + error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, + req, reqlen, &sndc); + if (error) { + vmbus_xact_deactivate(xact); + return (NULL); } + hdr = vmbus_xact_wait(xact, &resplen); - return (ret); + /* + * Check this NVS response message. + */ + if (resplen < min_resplen) { + if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); + return (NULL); + } + if (hdr->nvs_type != type) { + if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " + "expect 0x%08x\n", hdr->nvs_type, type); + return (NULL); + } + /* All pass! 
*/ + *resplen0 = resplen; + return (hdr); } -/* - * Net VSC initialize receive buffer with net VSP - * - * Net VSP: Network virtual services client, also known as the - * Hyper-V extensible switch and the synthetic data path. - */ -static int -hv_nv_init_rx_buffer_with_net_vsp(struct hn_softc *sc) +static __inline int +hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) { - netvsc_dev *net_dev; - nvsp_msg *init_pkt; - int ret = 0; - net_dev = hv_nv_get_outbound_net_device(sc); - if (!net_dev) { - return (ENODEV); - } + return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, + req, reqlen, &hn_send_ctx_none)); +} - net_dev->rx_buf = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), - PAGE_SIZE, 0, net_dev->rx_buf_size, &net_dev->rxbuf_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); - if (net_dev->rx_buf == NULL) { - device_printf(sc->hn_dev, "allocate rxbuf failed\n"); - return ENOMEM; - } +static int +hn_nvs_conn_rxbuf(struct hn_softc *sc) +{ + struct vmbus_xact *xact = NULL; + struct hn_nvs_rxbuf_conn *conn; + const struct hn_nvs_rxbuf_connresp *resp; + size_t resp_len; + uint32_t status; + int error, rxbuf_size; + + /* + * Limit RXBUF size for old NVS. + */ + if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) + rxbuf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; + else + rxbuf_size = NETVSC_RECEIVE_BUFFER_SIZE; /* * Connect the RXBUF GPADL to the primary channel. @@ -164,97 +177,67 @@ hv_nv_init_rx_buffer_with_net_vsp(struct hn_softc *sc) * Only primary channel has RXBUF connected to it. Sub-channels * just share this RXBUF. 
*/ - ret = vmbus_chan_gpadl_connect(sc->hn_prichan, - net_dev->rxbuf_dma.hv_paddr, net_dev->rx_buf_size, - &net_dev->rx_buf_gpadl_handle); - if (ret != 0) { - device_printf(sc->hn_dev, "rxbuf gpadl connect failed: %d\n", - ret); + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", + error); goto cleanup; } - - /* sema_wait(&ext->channel_init_sema); KYS CHECK */ - - /* Notify the NetVsp of the gpadl handle */ - init_pkt = &net_dev->channel_init_packet; - - memset(init_pkt, 0, sizeof(nvsp_msg)); - - init_pkt->hdr.msg_type = nvsp_msg_1_type_send_rx_buf; - init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = - net_dev->rx_buf_gpadl_handle; - init_pkt->msgs.vers_1_msgs.send_rx_buf.id = - NETVSC_RECEIVE_BUFFER_ID; - /* Send the gpadl notification request */ + /* + * Connect RXBUF to NVS. + */ - ret = vmbus_chan_send(sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, - init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); - if (ret != 0) { + xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); + error = ENXIO; goto cleanup; } - - sema_wait(&net_dev->channel_init_sema); - - /* Check the response */ - if (init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.status - != nvsp_status_success) { - ret = EINVAL; + conn = vmbus_xact_req_data(xact); + conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; + conn->nvs_gpadl = sc->hn_rxbuf_gpadl; + conn->nvs_sig = HN_NVS_RXBUF_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, + HN_NVS_TYPE_RXBUF_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); + error = EIO; goto cleanup; } - net_dev->rx_section_count = - init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; + status = resp->nvs_status; + vmbus_xact_put(xact); + xact = 
NULL; - net_dev->rx_sections = malloc(net_dev->rx_section_count * - sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_WAITOK); - memcpy(net_dev->rx_sections, - init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, - net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); - - - /* - * For first release, there should only be 1 section that represents - * the entire receive buffer - */ - if (net_dev->rx_section_count != 1 - || net_dev->rx_sections->offset != 0) { - ret = EINVAL; + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); + error = EIO; goto cleanup; } + sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; - goto exit; + return (0); cleanup: - hv_nv_destroy_rx_buffer(net_dev); - -exit: - return (ret); + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_rxbuf(sc); + return (error); } -/* - * Net VSC initialize send buffer with net VSP - */ static int -hv_nv_init_send_buffer_with_net_vsp(struct hn_softc *sc) +hn_nvs_conn_chim(struct hn_softc *sc) { - netvsc_dev *net_dev; - nvsp_msg *init_pkt; - int ret = 0; - - net_dev = hv_nv_get_outbound_net_device(sc); - if (!net_dev) { - return (ENODEV); - } - - net_dev->send_buf = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), - PAGE_SIZE, 0, net_dev->send_buf_size, &net_dev->txbuf_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); - if (net_dev->send_buf == NULL) { - device_printf(sc->hn_dev, "allocate chimney txbuf failed\n"); - return ENOMEM; - } + struct vmbus_xact *xact = NULL; + struct hn_nvs_chim_conn *chim; + const struct hn_nvs_chim_connresp *resp; + size_t resp_len; + uint32_t status, sectsz; + int error; /* * Connect chimney sending buffer GPADL to the primary channel. @@ -263,533 +246,420 @@ hv_nv_init_send_buffer_with_net_vsp(struct hn_softc *sc) * Only primary channel has chimney sending buffer connected to it. * Sub-channels just share this chimney sending buffer. 
*/ - ret = vmbus_chan_gpadl_connect(sc->hn_prichan, - net_dev->txbuf_dma.hv_paddr, net_dev->send_buf_size, - &net_dev->send_buf_gpadl_handle); - if (ret != 0) { - device_printf(sc->hn_dev, "chimney sending buffer gpadl " - "connect failed: %d\n", ret); + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_chim_dma.hv_paddr, NETVSC_SEND_BUFFER_SIZE, + &sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); goto cleanup; } - /* Notify the NetVsp of the gpadl handle */ - - init_pkt = &net_dev->channel_init_packet; - - memset(init_pkt, 0, sizeof(nvsp_msg)); - - init_pkt->hdr.msg_type = nvsp_msg_1_type_send_send_buf; - init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = - net_dev->send_buf_gpadl_handle; - init_pkt->msgs.vers_1_msgs.send_rx_buf.id = - NETVSC_SEND_BUFFER_ID; - - /* Send the gpadl notification request */ + /* + * Connect chimney sending buffer to NVS + */ - ret = vmbus_chan_send(sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, - init_pkt, sizeof(nvsp_msg), (uint64_t)init_pkt); - if (ret != 0) { + xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); + error = ENXIO; + goto cleanup; + } + chim = vmbus_xact_req_data(xact); + chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; + chim->nvs_gpadl = sc->hn_chim_gpadl; + chim->nvs_sig = HN_NVS_CHIM_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, + HN_NVS_TYPE_CHIM_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); + error = EIO; goto cleanup; } - sema_wait(&net_dev->channel_init_sema); + status = resp->nvs_status; + sectsz = resp->nvs_sectsz; + vmbus_xact_put(xact); + xact = NULL; - /* Check the response */ - if (init_pkt->msgs.vers_1_msgs.send_send_buf_complete.status - != nvsp_status_success) { - ret = EINVAL; + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs chim conn 
failed: %x\n", status); + error = EIO; goto cleanup; } + if (sectsz == 0) { + if_printf(sc->hn_ifp, "zero chimney sending buffer " + "section size\n"); + return (0); + } + + sc->hn_chim_szmax = sectsz; + sc->hn_chim_cnt = NETVSC_SEND_BUFFER_SIZE / sc->hn_chim_szmax; + if (NETVSC_SEND_BUFFER_SIZE % sc->hn_chim_szmax != 0) { + if_printf(sc->hn_ifp, "chimney sending sections are " + "not properly aligned\n"); + } + if (sc->hn_chim_cnt % LONG_BIT != 0) { + if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", + sc->hn_chim_cnt % LONG_BIT); + } - net_dev->send_section_size = - init_pkt->msgs.vers_1_msgs.send_send_buf_complete.section_size; - net_dev->send_section_count = - net_dev->send_buf_size / net_dev->send_section_size; - net_dev->bitsmap_words = howmany(net_dev->send_section_count, - BITS_PER_LONG); - net_dev->send_section_bitsmap = - malloc(net_dev->bitsmap_words * sizeof(long), M_NETVSC, - M_WAITOK | M_ZERO); + sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; + sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), + M_NETVSC, M_WAITOK | M_ZERO); - goto exit; + /* Done! 
*/ + sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; + if (bootverbose) { + if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", + sc->hn_chim_szmax, sc->hn_chim_cnt); + } + return (0); cleanup: - hv_nv_destroy_send_buffer(net_dev); - -exit: - return (ret); + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_chim(sc); + return (error); } -/* - * Net VSC destroy receive buffer - */ static int -hv_nv_destroy_rx_buffer(netvsc_dev *net_dev) +hn_nvs_disconn_rxbuf(struct hn_softc *sc) { - nvsp_msg *revoke_pkt; - int ret = 0; - - /* - * If we got a section count, it means we received a - * send_rx_buf_complete msg - * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, - * we need to send a revoke msg here - */ - if (net_dev->rx_section_count) { - /* Send the revoke receive buffer */ - revoke_pkt = &net_dev->revoke_packet; - memset(revoke_pkt, 0, sizeof(nvsp_msg)); + int error; - revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_rx_buf; - revoke_pkt->msgs.vers_1_msgs.revoke_rx_buf.id = - NETVSC_RECEIVE_BUFFER_ID; - - ret = vmbus_chan_send(net_dev->sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, 0, revoke_pkt, sizeof(nvsp_msg), - (uint64_t)(uintptr_t)revoke_pkt); + if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { + struct hn_nvs_rxbuf_disconn disconn; /* - * If we failed here, we might as well return and have a leak - * rather than continue and a bugchk + * Disconnect RXBUF from NVS. */ - if (ret != 0) { - return (ret); + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; + disconn.nvs_sig = HN_NVS_RXBUF_SIG; + + /* NOTE: No response. 
*/ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs rxbuf disconn failed: %d\n", error); + return (error); } - } - - /* Tear down the gpadl on the vsp end */ - if (net_dev->rx_buf_gpadl_handle) { - ret = vmbus_chan_gpadl_disconnect(net_dev->sc->hn_prichan, - net_dev->rx_buf_gpadl_handle); + sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; + /* - * If we failed here, we might as well return and have a leak - * rather than continue and a bugchk + * Wait for the hypervisor to receive this NVS request. */ - if (ret != 0) { - return (ret); - } - net_dev->rx_buf_gpadl_handle = 0; - } - - if (net_dev->rx_buf) { - /* Free up the receive buffer */ - hyperv_dmamem_free(&net_dev->rxbuf_dma, net_dev->rx_buf); - net_dev->rx_buf = NULL; + while (!vmbus_chan_tx_empty(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect RXBUF. + */ + pause("lingtx", (200 * hz) / 1000); } - if (net_dev->rx_sections) { - free(net_dev->rx_sections, M_NETVSC); - net_dev->rx_sections = NULL; - net_dev->rx_section_count = 0; + if (sc->hn_rxbuf_gpadl != 0) { + /* + * Disconnect RXBUF from primary channel. 
+ */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "rxbuf gpadl disconn failed: %d\n", error); + return (error); + } + sc->hn_rxbuf_gpadl = 0; } - - return (ret); + return (0); } -/* - * Net VSC destroy send buffer - */ static int -hv_nv_destroy_send_buffer(netvsc_dev *net_dev) +hn_nvs_disconn_chim(struct hn_softc *sc) { - nvsp_msg *revoke_pkt; - int ret = 0; + int error; + + if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { + struct hn_nvs_chim_disconn disconn; - /* - * If we got a section count, it means we received a - * send_rx_buf_complete msg - * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, - * we need to send a revoke msg here - */ - if (net_dev->send_section_size) { - /* Send the revoke send buffer */ - revoke_pkt = &net_dev->revoke_packet; - memset(revoke_pkt, 0, sizeof(nvsp_msg)); - - revoke_pkt->hdr.msg_type = - nvsp_msg_1_type_revoke_send_buf; - revoke_pkt->msgs.vers_1_msgs.revoke_send_buf.id = - NETVSC_SEND_BUFFER_ID; - - ret = vmbus_chan_send(net_dev->sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, 0, - revoke_pkt, sizeof(nvsp_msg), - (uint64_t)(uintptr_t)revoke_pkt); /* - * If we failed here, we might as well return and have a leak - * rather than continue and a bugchk + * Disconnect chimney sending buffer from NVS. */ - if (ret != 0) { - return (ret); + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; + disconn.nvs_sig = HN_NVS_CHIM_SIG; + + /* NOTE: No response. 
*/ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs chim disconn failed: %d\n", error); + return (error); } - } - - /* Tear down the gpadl on the vsp end */ - if (net_dev->send_buf_gpadl_handle) { - ret = vmbus_chan_gpadl_disconnect(net_dev->sc->hn_prichan, - net_dev->send_buf_gpadl_handle); + sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; /* - * If we failed here, we might as well return and have a leak - * rather than continue and a bugchk + * Wait for the hypervisor to receive this NVS request. */ - if (ret != 0) { - return (ret); - } - net_dev->send_buf_gpadl_handle = 0; + while (!vmbus_chan_tx_empty(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect chimney + * sending buffer. + */ + pause("lingtx", (200 * hz) / 1000); } - if (net_dev->send_buf) { - /* Free up the receive buffer */ - hyperv_dmamem_free(&net_dev->txbuf_dma, net_dev->send_buf); - net_dev->send_buf = NULL; + if (sc->hn_chim_gpadl != 0) { + /* + * Disconnect chimney sending buffer from primary channel. + */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "chim gpadl disconn failed: %d\n", error); + return (error); + } + sc->hn_chim_gpadl = 0; } - if (net_dev->send_section_bitsmap) { - free(net_dev->send_section_bitsmap, M_NETVSC); + if (sc->hn_chim_bmap != NULL) { + free(sc->hn_chim_bmap, M_NETVSC); + sc->hn_chim_bmap = NULL; } - - return (ret); + return (0); } - -/* - * Attempt to negotiate the caller-specified NVSP version - * - * For NVSP v2, Server 2008 R2 does not set - * init_pkt->msgs.init_msgs.init_compl.negotiated_prot_vers - * to the negotiated version, so we cannot rely on that. 
- */ static int -hv_nv_negotiate_nvsp_protocol(struct hn_softc *sc, netvsc_dev *net_dev, - uint32_t nvsp_ver) +hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) { - nvsp_msg *init_pkt; - int ret; - - init_pkt = &net_dev->channel_init_packet; - memset(init_pkt, 0, sizeof(nvsp_msg)); - init_pkt->hdr.msg_type = nvsp_msg_type_init; - - /* - * Specify parameter as the only acceptable protocol version - */ - init_pkt->msgs.init_msgs.init.p1.protocol_version = nvsp_ver; - init_pkt->msgs.init_msgs.init.protocol_version_2 = nvsp_ver; - - /* Send the init request */ - ret = vmbus_chan_send(sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, - init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); - if (ret != 0) - return (-1); - - sema_wait(&net_dev->channel_init_sema); - - if (init_pkt->msgs.init_msgs.init_compl.status != nvsp_status_success) + struct vmbus_xact *xact; + struct hn_nvs_init *init; + const struct hn_nvs_init_resp *resp; + size_t resp_len; + uint32_t status; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs init\n"); + return (ENXIO); + } + init = vmbus_xact_req_data(xact); + init->nvs_type = HN_NVS_TYPE_INIT; + init->nvs_ver_min = nvs_ver; + init->nvs_ver_max = nvs_ver; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, + HN_NVS_TYPE_INIT_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec init failed\n"); + vmbus_xact_put(xact); + return (EIO); + } + + status = resp->nvs_status; + vmbus_xact_put(xact); + + if (status != HN_NVS_STATUS_OK) { + if (bootverbose) { + /* + * Caller may try another NVS version, and will log + * error if there are no more NVS versions to try, + * so don't bark out loud here. + */ + if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", + nvs_ver); + } return (EINVAL); - + } return (0); } /* - * Send NDIS version 2 config packet containing MTU. - * - * Not valid for NDIS version 1. 
+ * Configure MTU and enable VLAN. */ static int -hv_nv_send_ndis_config(struct hn_softc *sc, uint32_t mtu) +hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) { - netvsc_dev *net_dev; - nvsp_msg *init_pkt; - int ret; + struct hn_nvs_ndis_conf conf; + int error; - net_dev = hv_nv_get_outbound_net_device(sc); - if (!net_dev) - return (-ENODEV); + memset(&conf, 0, sizeof(conf)); + conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; + conf.nvs_mtu = mtu; + conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; - /* - * Set up configuration packet, write MTU - * Indicate we are capable of handling VLAN tags - */ - init_pkt = &net_dev->channel_init_packet; - memset(init_pkt, 0, sizeof(nvsp_msg)); - init_pkt->hdr.msg_type = nvsp_msg_2_type_send_ndis_config; - init_pkt->msgs.vers_2_msgs.send_ndis_config.mtu = mtu; - init_pkt-> - msgs.vers_2_msgs.send_ndis_config.capabilities.u1.u2.ieee8021q - = 1; - - /* Send the configuration packet */ - ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, - init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); - if (ret != 0) - return (-EINVAL); + /* NOTE: No response. 
*/ + error = hn_nvs_req_send(sc, &conf, sizeof(conf)); + if (error) { + if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); + return (error); + } + if (bootverbose) + if_printf(sc->hn_ifp, "nvs ndis conf done\n"); + sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; return (0); } -/* - * Net VSC connect to VSP - */ static int -hv_nv_connect_to_vsp(struct hn_softc *sc) +hn_nvs_init_ndis(struct hn_softc *sc) { - netvsc_dev *net_dev; - nvsp_msg *init_pkt; - uint32_t ndis_version; - uint32_t protocol_list[] = { NVSP_PROTOCOL_VERSION_1, - NVSP_PROTOCOL_VERSION_2, - NVSP_PROTOCOL_VERSION_4, - NVSP_PROTOCOL_VERSION_5 }; - int i; - int protocol_number = nitems(protocol_list); - int ret = 0; - device_t dev = sc->hn_dev; - struct ifnet *ifp = sc->arpcom.ac_ifp; - - net_dev = hv_nv_get_outbound_net_device(sc); + struct hn_nvs_ndis_init ndis; + int error; + + memset(&ndis, 0, sizeof(ndis)); + ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; + ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); + ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); + if (error) + if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); + return (error); +} - /* - * Negotiate the NVSP version. Try the latest NVSP first. - */ - for (i = protocol_number - 1; i >= 0; i--) { - if (hv_nv_negotiate_nvsp_protocol(sc, net_dev, - protocol_list[i]) == 0) { - net_dev->nvsp_version = protocol_list[i]; - if (bootverbose) - device_printf(dev, "Netvsc: got version 0x%x\n", - net_dev->nvsp_version); - break; +static int +hn_nvs_init(struct hn_softc *sc) +{ + int i, error; + + if (device_is_attached(sc->hn_dev)) { + /* + * NVS version and NDIS version MUST NOT be changed. 
+ */ + if (bootverbose) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } - } - if (i < 0) { - if (bootverbose) - device_printf(dev, "failed to negotiate a valid " - "protocol.\n"); - return (EPROTO); + error = hn_nvs_doinit(sc, sc->hn_nvs_ver); + if (error) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x " + "failed: %d\n", sc->hn_nvs_ver, error); + } + return (error); } /* - * Set the MTU if supported by this NVSP protocol version - * This needs to be right after the NVSP init message per Haiyang - */ - if (net_dev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) - ret = hv_nv_send_ndis_config(sc, ifp->if_mtu); - - /* - * Send the NDIS version + * Find the supported NVS version and set NDIS version accordingly. */ - init_pkt = &net_dev->channel_init_packet; - - memset(init_pkt, 0, sizeof(nvsp_msg)); - - if (net_dev->nvsp_version <= NVSP_PROTOCOL_VERSION_4) { - ndis_version = NDIS_VERSION_6_1; - } else { - ndis_version = NDIS_VERSION_6_30; - } - - init_pkt->hdr.msg_type = nvsp_msg_1_type_send_ndis_vers; - init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_major_vers = - (ndis_version & 0xFFFF0000) >> 16; - init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_minor_vers = - ndis_version & 0xFFFF; - - /* Send the init request */ - - ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, - init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); - if (ret != 0) { - goto cleanup; + for (i = 0; i < nitems(hn_nvs_version); ++i) { + error = hn_nvs_doinit(sc, hn_nvs_version[i]); + if (!error) { + sc->hn_nvs_ver = hn_nvs_version[i]; + + /* Set NDIS version according to NVS version. 
*/ + sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; + if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) + sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; + + if (bootverbose) { + if_printf(sc->hn_ifp, "NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + } + return (0); + } } - /* - * TODO: BUGBUG - We have to wait for the above msg since the netvsp - * uses KMCL which acknowledges packet (completion packet) - * since our Vmbus always set the VMBUS_CHANPKT_FLAG_RC flag - */ - /* sema_wait(&NetVscChannel->channel_init_sema); */ - - /* Post the big receive buffer to NetVSP */ - if (net_dev->nvsp_version <= NVSP_PROTOCOL_VERSION_2) - net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; - else - net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; - net_dev->send_buf_size = NETVSC_SEND_BUFFER_SIZE; - - ret = hv_nv_init_rx_buffer_with_net_vsp(sc); - if (ret == 0) - ret = hv_nv_init_send_buffer_with_net_vsp(sc); - -cleanup: - return (ret); -} - -/* - * Net VSC disconnect from VSP - */ -static void -hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) -{ - hv_nv_destroy_rx_buffer(net_dev); - hv_nv_destroy_send_buffer(net_dev); -} - -void -hv_nv_subchan_attach(struct vmbus_channel *chan, struct hn_rx_ring *rxr) -{ - KASSERT(rxr->hn_rx_idx == vmbus_chan_subidx(chan), - ("chan%u subidx %u, rxr%d mismatch", - vmbus_chan_id(chan), vmbus_chan_subidx(chan), rxr->hn_rx_idx)); - vmbus_chan_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, - NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, - hv_nv_on_channel_callback, rxr); + if_printf(sc->hn_ifp, "no NVS available\n"); + return (ENXIO); } -/* - * Net VSC on device add - * - * Callback when the device belonging to this driver is added - */ -netvsc_dev * -hv_nv_on_device_add(struct hn_softc *sc, void *additional_info, - struct hn_rx_ring *rxr) +int +hn_nvs_attach(struct hn_softc *sc, int mtu) { - struct vmbus_channel *chan = sc->hn_prichan; - netvsc_dev *net_dev; - int ret = 0; - - 
net_dev = hv_nv_alloc_net_device(sc); - if (net_dev == NULL) - return NULL; + int error; - /* Initialize the NetVSC channel extension */ + /* + * Initialize NVS. + */ + error = hn_nvs_init(sc); + if (error) + return (error); - sema_init(&net_dev->channel_init_sema, 0, "netdev_sema"); + if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { + /* + * Configure NDIS before initializing it. + */ + error = hn_nvs_conf_ndis(sc, mtu); + if (error) + return (error); + } /* - * Open the channel + * Initialize NDIS. */ - KASSERT(rxr->hn_rx_idx == vmbus_chan_subidx(chan), - ("chan%u subidx %u, rxr%d mismatch", - vmbus_chan_id(chan), vmbus_chan_subidx(chan), rxr->hn_rx_idx)); - ret = vmbus_chan_open(chan, - NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, - NULL, 0, hv_nv_on_channel_callback, rxr); - if (ret != 0) - goto cleanup; + error = hn_nvs_init_ndis(sc); + if (error) + return (error); /* - * Connect with the NetVsp + * Connect RXBUF. */ - ret = hv_nv_connect_to_vsp(sc); - if (ret != 0) - goto close; - - return (net_dev); - -close: - /* Now, we can close the channel safely */ - vmbus_chan_close(chan); + error = hn_nvs_conn_rxbuf(sc); + if (error) + return (error); -cleanup: /* - * Free the packet buffers on the netvsc device packet queue. - * Release other resources. + * Connect chimney sending buffer. 
*/ - sema_destroy(&net_dev->channel_init_sema); - free(net_dev, M_NETVSC); - - return (NULL); + error = hn_nvs_conn_chim(sc); + if (error) + return (error); + return (0); } -/* - * Net VSC on device remove - */ -int -hv_nv_on_device_remove(struct hn_softc *sc, boolean_t destroy_channel) +void +hn_nvs_detach(struct hn_softc *sc) { - netvsc_dev *net_dev = sc->net_dev;; - - /* Stop outbound traffic ie sends and receives completions */ - net_dev->destroy = TRUE; - - hv_nv_disconnect_from_vsp(net_dev); - /* At this point, no one should be accessing net_dev except in here */ - - /* Now, we can close the channel safely */ - - vmbus_chan_close(sc->hn_prichan); + /* NOTE: there are no requests to stop the NVS. */ + hn_nvs_disconn_rxbuf(sc); + hn_nvs_disconn_chim(sc); +} - sema_destroy(&net_dev->channel_init_sema); - free(net_dev, M_NETVSC); +void +hn_nvs_sent_xact(struct hn_send_ctx *sndc, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data, int dlen) +{ - return (0); + vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); } -/* - * Net VSC on send completion - */ static void -hv_nv_on_send_completion(netvsc_dev *net_dev, struct vmbus_channel *chan, - const struct vmbus_chanpkt_hdr *pkt) +hn_nvs_sent_none(struct hn_send_ctx *sndc __unused, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data __unused, int dlen __unused) { - const nvsp_msg *nvsp_msg_pkt; - netvsc_packet *net_vsc_pkt; - - nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkt); - - if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_type_init_complete - || nvsp_msg_pkt->hdr.msg_type - == nvsp_msg_1_type_send_rx_buf_complete - || nvsp_msg_pkt->hdr.msg_type - == nvsp_msg_1_type_send_send_buf_complete - || nvsp_msg_pkt->hdr.msg_type - == nvsp_msg5_type_subchannel) { - /* Copy the response back */ - memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, - sizeof(nvsp_msg)); - sema_post(&net_dev->channel_init_sema); - } else if (nvsp_msg_pkt->hdr.msg_type == - 
nvsp_msg_1_type_send_rndis_pkt_complete) { - /* Get the send context */ - net_vsc_pkt = - (netvsc_packet *)(unsigned long)pkt->cph_xactid; - if (NULL != net_vsc_pkt) { - if (net_vsc_pkt->send_buf_section_idx != - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - u_long mask; - int idx; - - idx = net_vsc_pkt->send_buf_section_idx / - BITS_PER_LONG; - KASSERT(idx < net_dev->bitsmap_words, - ("invalid section index %u", - net_vsc_pkt->send_buf_section_idx)); - mask = 1UL << - (net_vsc_pkt->send_buf_section_idx % - BITS_PER_LONG); - - KASSERT(net_dev->send_section_bitsmap[idx] & - mask, - ("index bitmap 0x%lx, section index %u, " - "bitmap idx %d, bitmask 0x%lx", - net_dev->send_section_bitsmap[idx], - net_vsc_pkt->send_buf_section_idx, - idx, mask)); - atomic_clear_long( - &net_dev->send_section_bitsmap[idx], mask); - } - - /* Notify the layer above us */ - net_vsc_pkt->compl.send.on_send_completion(chan, - net_vsc_pkt->compl.send.send_completion_context); + /* EMPTY */ +} - } - } +void +hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) +{ + u_long mask; + uint32_t idx; + + idx = chim_idx / LONG_BIT; + KASSERT(idx < sc->hn_chim_bmap_cnt, + ("invalid chimney index 0x%x", chim_idx)); + + mask = 1UL << (chim_idx % LONG_BIT); + KASSERT(sc->hn_chim_bmap[idx] & mask, + ("index bitmap 0x%lx, chimney index %u, " + "bitmap idx %d, bitmask 0x%lx", + sc->hn_chim_bmap[idx], chim_idx, idx, mask)); + + atomic_clear_long(&sc->hn_chim_bmap[idx], mask); } /* @@ -798,241 +668,75 @@ hv_nv_on_send_completion(netvsc_dev *net_dev, struct vmbus_channel *chan, * Returns 0 on success, non-zero on failure. 
*/ int -hv_nv_on_send(struct vmbus_channel *chan, netvsc_packet *pkt) +hv_nv_on_send(struct vmbus_channel *chan, uint32_t rndis_mtype, + struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { - nvsp_msg send_msg; + struct hn_nvs_rndis rndis; int ret; - send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt; - if (pkt->is_data_pkt) { - /* 0 is RMC_DATA */ - send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 0; - } else { - /* 1 is RMC_CONTROL */ - send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 1; - } - - send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_idx = - pkt->send_buf_section_idx; - send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_size = - pkt->send_buf_section_size; + rndis.nvs_type = HN_NVS_TYPE_RNDIS; + rndis.nvs_rndis_mtype = rndis_mtype; + rndis.nvs_chim_idx = sndc->hn_chim_idx; + rndis.nvs_chim_sz = sndc->hn_chim_sz; - if (pkt->gpa_cnt) { - ret = vmbus_chan_send_sglist(chan, pkt->gpa, pkt->gpa_cnt, - &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); + if (gpa_cnt) { + ret = hn_nvs_send_sglist(chan, gpa, gpa_cnt, + &rndis, sizeof(rndis), sndc); } else { - ret = vmbus_chan_send(chan, - VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, - &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); + ret = hn_nvs_send(chan, VMBUS_CHANPKT_FLAG_RC, + &rndis, sizeof(rndis), sndc); } return (ret); } -/* - * Net VSC on receive - * - * In the FreeBSD Hyper-V virtual world, this function deals exclusively - * with virtual addresses. 
- */ -static void -hv_nv_on_receive(netvsc_dev *net_dev, struct hn_rx_ring *rxr, - struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) -{ - const struct vmbus_chanpkt_rxbuf *pkt; - const nvsp_msg *nvsp_msg_pkt; - netvsc_packet vsc_pkt; - netvsc_packet *net_vsc_pkt = &vsc_pkt; - int count = 0; - int i = 0; - int status = nvsp_status_success; - - nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkthdr); - - /* Make sure this is a valid nvsp packet */ - if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg_1_type_send_rndis_pkt) { - if_printf(rxr->hn_ifp, "packet hdr type %u is invalid!\n", - nvsp_msg_pkt->hdr.msg_type); - return; - } - - pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; - - if (pkt->cp_rxbuf_id != NETVSC_RECEIVE_BUFFER_ID) { - if_printf(rxr->hn_ifp, "rxbuf_id %d is invalid!\n", - pkt->cp_rxbuf_id); - return; - } - - count = pkt->cp_rxbuf_cnt; - - /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ - for (i = 0; i < count; i++) { - net_vsc_pkt->status = nvsp_status_success; - net_vsc_pkt->data = ((uint8_t *)net_dev->rx_buf + - pkt->cp_rxbuf[i].rb_ofs); - net_vsc_pkt->tot_data_buf_len = pkt->cp_rxbuf[i].rb_len; - - hv_rf_on_receive(net_dev, rxr, net_vsc_pkt); - if (net_vsc_pkt->status != nvsp_status_success) { - status = nvsp_status_failure; - } - } - - /* - * Moved completion call back here so that all received - * messages (not just data messages) will trigger a response - * message back to the host. 
- */ - hv_nv_on_receive_completion(chan, pkt->cp_hdr.cph_xactid, status); -} - -/* - * Net VSC on receive completion - * - * Send a receive completion packet to RNDIS device (ie NetVsp) - */ -static void -hv_nv_on_receive_completion(struct vmbus_channel *chan, uint64_t tid, - uint32_t status) -{ - nvsp_msg rx_comp_msg; - int retries = 0; - int ret = 0; - - rx_comp_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt_complete; - - /* Pass in the status */ - rx_comp_msg.msgs.vers_1_msgs.send_rndis_pkt_complete.status = - status; - -retry_send_cmplt: - /* Send the completion */ - ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 0, - &rx_comp_msg, sizeof(nvsp_msg), tid); - if (ret == 0) { - /* success */ - /* no-op */ - } else if (ret == EAGAIN) { - /* no more room... wait a bit and attempt to retry 3 times */ - retries++; - - if (retries < 4) { - DELAY(100); - goto retry_send_cmplt; - } - } -} - -/* - * Net VSC receiving vRSS send table from VSP - */ -static void -hv_nv_send_table(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) -{ - netvsc_dev *net_dev; - const nvsp_msg *nvsp_msg_pkt; - int i; - uint32_t count; - const uint32_t *table; - - net_dev = hv_nv_get_inbound_net_device(sc); - if (!net_dev) - return; - - nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkt); - - if (nvsp_msg_pkt->hdr.msg_type != - nvsp_msg5_type_send_indirection_table) { - printf("Netvsc: !Warning! receive msg type not " - "send_indirection_table. 
type = %d\n", - nvsp_msg_pkt->hdr.msg_type); - return; - } - - count = nvsp_msg_pkt->msgs.vers_5_msgs.send_table.count; - if (count != VRSS_SEND_TABLE_SIZE) { - printf("Netvsc: Received wrong send table size: %u\n", count); - return; - } - - table = (const uint32_t *) - ((const uint8_t *)&nvsp_msg_pkt->msgs.vers_5_msgs.send_table + - nvsp_msg_pkt->msgs.vers_5_msgs.send_table.offset); - - for (i = 0; i < count; i++) - net_dev->vrss_send_table[i] = table[i]; -} - -/* - * Net VSC on channel callback - */ -static void -hv_nv_on_channel_callback(struct vmbus_channel *chan, void *xrxr) +int +hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) { - struct hn_rx_ring *rxr = xrxr; - struct hn_softc *sc = rxr->hn_ifp->if_softc; - netvsc_dev *net_dev; - void *buffer; - int bufferlen = NETVSC_PACKET_SIZE; - - net_dev = hv_nv_get_inbound_net_device(sc); - if (net_dev == NULL) - return; - - buffer = rxr->hn_rdbuf; - do { - struct vmbus_chanpkt_hdr *pkt = buffer; - uint32_t bytes_rxed; - int ret; - - bytes_rxed = bufferlen; - ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); - if (ret == 0) { - if (bytes_rxed > 0) { - switch (pkt->cph_type) { - case VMBUS_CHANPKT_TYPE_COMP: - hv_nv_on_send_completion(net_dev, chan, - pkt); - break; - case VMBUS_CHANPKT_TYPE_RXBUF: - hv_nv_on_receive(net_dev, rxr, chan, pkt); - break; - case VMBUS_CHANPKT_TYPE_INBAND: - hv_nv_send_table(sc, pkt); - break; - default: - if_printf(rxr->hn_ifp, - "unknown chan pkt %u\n", - pkt->cph_type); - break; - } - } - } else if (ret == ENOBUFS) { - /* Handle large packet */ - if (bufferlen > NETVSC_PACKET_SIZE) { - free(buffer, M_NETVSC); - buffer = NULL; - } - - /* alloc new buffer */ - buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT); - if (buffer == NULL) { - if_printf(rxr->hn_ifp, - "hv_cb malloc buffer failed, len=%u\n", - bytes_rxed); - bufferlen = 0; - break; - } - bufferlen = bytes_rxed; - } else { - /* No more packets */ - break; - } - } while (1); - - if (bufferlen > NETVSC_PACKET_SIZE) - 
free(buffer, M_NETVSC); - - hv_rf_channel_rollup(rxr, rxr->hn_txr); + struct vmbus_xact *xact; + struct hn_nvs_subch_req *req; + const struct hn_nvs_subch_resp *resp; + int error, nsubch_req; + uint32_t nsubch; + size_t resp_len; + + nsubch_req = *nsubch0; + KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); + return (ENXIO); + } + req = vmbus_xact_req_data(xact); + req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; + req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; + req->nvs_nsubch = nsubch_req; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, + HN_NVS_TYPE_SUBCH_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); + error = EIO; + goto done; + } + if (resp->nvs_status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", + resp->nvs_status); + error = EIO; + goto done; + } + + nsubch = resp->nvs_nsubch; + if (nsubch > nsubch_req) { + if_printf(sc->hn_ifp, "%u subchans are allocated, " + "requested %d\n", nsubch, nsubch_req); + nsubch = nsubch_req; + } + *nsubch0 = nsubch; + error = 0; +done: + vmbus_xact_put(xact); + return (error); } diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h index cdae2fb..130b543 100644 --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -62,949 +62,12 @@ #include <dev/hyperv/include/hyperv_busdma.h> #include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/netvsc/ndis.h> + #define HN_USE_TXDESC_BUFRING MALLOC_DECLARE(M_NETVSC); -#define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF) - -#define NVSP_PROTOCOL_VERSION_1 2 -#define NVSP_PROTOCOL_VERSION_2 0x30002 -#define NVSP_PROTOCOL_VERSION_4 0x40000 -#define NVSP_PROTOCOL_VERSION_5 0x50000 -#define NVSP_MIN_PROTOCOL_VERSION (NVSP_PROTOCOL_VERSION_1) -#define NVSP_MAX_PROTOCOL_VERSION 
(NVSP_PROTOCOL_VERSION_2) - -#define NVSP_PROTOCOL_VERSION_CURRENT NVSP_PROTOCOL_VERSION_2 - -#define VERSION_4_OFFLOAD_SIZE 22 - -#define NVSP_OPERATIONAL_STATUS_OK (0x00000000) -#define NVSP_OPERATIONAL_STATUS_DEGRADED (0x00000001) -#define NVSP_OPERATIONAL_STATUS_NONRECOVERABLE (0x00000002) -#define NVSP_OPERATIONAL_STATUS_NO_CONTACT (0x00000003) -#define NVSP_OPERATIONAL_STATUS_LOST_COMMUNICATION (0x00000004) - -/* - * Maximun number of transfer pages (packets) the VSP will use on a receive - */ -#define NVSP_MAX_PACKETS_PER_RECEIVE 375 - -/* vRSS stuff */ -#define RNDIS_OBJECT_TYPE_RSS_CAPABILITIES 0x88 -#define RNDIS_OBJECT_TYPE_RSS_PARAMETERS 0x89 - -#define RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2 2 -#define RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2 2 - -struct rndis_obj_header { - uint8_t type; - uint8_t rev; - uint16_t size; -} __packed; - -/* rndis_recv_scale_cap/cap_flag */ -#define RNDIS_RSS_CAPS_MESSAGE_SIGNALED_INTERRUPTS 0x01000000 -#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_ISR 0x02000000 -#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_DPC 0x04000000 -#define RNDIS_RSS_CAPS_USING_MSI_X 0x08000000 -#define RNDIS_RSS_CAPS_RSS_AVAILABLE_ON_PORTS 0x10000000 -#define RNDIS_RSS_CAPS_SUPPORTS_MSI_X 0x20000000 -#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV4 0x00000100 -#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6 0x00000200 -#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6_EX 0x00000400 - -/* RNDIS_RECEIVE_SCALE_CAPABILITIES */ -struct rndis_recv_scale_cap { - struct rndis_obj_header hdr; - uint32_t cap_flag; - uint32_t num_int_msg; - uint32_t num_recv_que; - uint16_t num_indirect_tabent; -} __packed; - -/* rndis_recv_scale_param flags */ -#define RNDIS_RSS_PARAM_FLAG_BASE_CPU_UNCHANGED 0x0001 -#define RNDIS_RSS_PARAM_FLAG_HASH_INFO_UNCHANGED 0x0002 -#define RNDIS_RSS_PARAM_FLAG_ITABLE_UNCHANGED 0x0004 -#define RNDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED 0x0008 -#define RNDIS_RSS_PARAM_FLAG_DISABLE_RSS 0x0010 - -/* Hash info bits */ -#define RNDIS_HASH_FUNC_TOEPLITZ 0x00000001 
-#define RNDIS_HASH_IPV4 0x00000100 -#define RNDIS_HASH_TCP_IPV4 0x00000200 -#define RNDIS_HASH_IPV6 0x00000400 -#define RNDIS_HASH_IPV6_EX 0x00000800 -#define RNDIS_HASH_TCP_IPV6 0x00001000 -#define RNDIS_HASH_TCP_IPV6_EX 0x00002000 - -#define RNDIS_RSS_INDIRECTION_TABLE_MAX_SIZE_REVISION_2 (128 * 4) -#define RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 40 - -#define ITAB_NUM 128 -#define HASH_KEYLEN RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 - -/* RNDIS_RECEIVE_SCALE_PARAMETERS */ -typedef struct rndis_recv_scale_param_ { - struct rndis_obj_header hdr; - - /* Qualifies the rest of the information */ - uint16_t flag; - - /* The base CPU number to do receive processing. not used */ - uint16_t base_cpu_number; - - /* This describes the hash function and type being enabled */ - uint32_t hashinfo; - - /* The size of indirection table array */ - uint16_t indirect_tabsize; - - /* The offset of the indirection table from the beginning of this - * structure - */ - uint32_t indirect_taboffset; - - /* The size of the hash secret key */ - uint16_t hashkey_size; - - /* The offset of the secret key from the beginning of this structure */ - uint32_t hashkey_offset; - - uint32_t processor_masks_offset; - uint32_t num_processor_masks; - uint32_t processor_masks_entry_size; -} rndis_recv_scale_param; - -typedef enum nvsp_msg_type_ { - nvsp_msg_type_none = 0, - - /* - * Init Messages - */ - nvsp_msg_type_init = 1, - nvsp_msg_type_init_complete = 2, - - nvsp_version_msg_start = 100, - - /* - * Version 1 Messages - */ - nvsp_msg_1_type_send_ndis_vers = nvsp_version_msg_start, - - nvsp_msg_1_type_send_rx_buf, - nvsp_msg_1_type_send_rx_buf_complete, - nvsp_msg_1_type_revoke_rx_buf, - - nvsp_msg_1_type_send_send_buf, - nvsp_msg_1_type_send_send_buf_complete, - nvsp_msg_1_type_revoke_send_buf, - - nvsp_msg_1_type_send_rndis_pkt, - nvsp_msg_1_type_send_rndis_pkt_complete, - - /* - * Version 2 Messages - */ - nvsp_msg_2_type_send_chimney_delegated_buf, - 
nvsp_msg_2_type_send_chimney_delegated_buf_complete, - nvsp_msg_2_type_revoke_chimney_delegated_buf, - - nvsp_msg_2_type_resume_chimney_rx_indication, - - nvsp_msg_2_type_terminate_chimney, - nvsp_msg_2_type_terminate_chimney_complete, - - nvsp_msg_2_type_indicate_chimney_event, - - nvsp_msg_2_type_send_chimney_packet, - nvsp_msg_2_type_send_chimney_packet_complete, - - nvsp_msg_2_type_post_chimney_rx_request, - nvsp_msg_2_type_post_chimney_rx_request_complete, - - nvsp_msg_2_type_alloc_rx_buf, - nvsp_msg_2_type_alloc_rx_buf_complete, - - nvsp_msg_2_type_free_rx_buf, - - nvsp_msg_2_send_vmq_rndis_pkt, - nvsp_msg_2_send_vmq_rndis_pkt_complete, - - nvsp_msg_2_type_send_ndis_config, - - nvsp_msg_2_type_alloc_chimney_handle, - nvsp_msg_2_type_alloc_chimney_handle_complete, - - nvsp_msg2_max = nvsp_msg_2_type_alloc_chimney_handle_complete, - - /* - * Version 4 Messages - */ - nvsp_msg4_type_send_vf_association, - nvsp_msg4_type_switch_data_path, - nvsp_msg4_type_uplink_connect_state_deprecated, - - nvsp_msg4_max = nvsp_msg4_type_uplink_connect_state_deprecated, - - /* - * Version 5 Messages - */ - nvsp_msg5_type_oid_query_ex, - nvsp_msg5_type_oid_query_ex_comp, - nvsp_msg5_type_subchannel, - nvsp_msg5_type_send_indirection_table, - - nvsp_msg5_max = nvsp_msg5_type_send_indirection_table, -} nvsp_msg_type; - -typedef enum nvsp_status_ { - nvsp_status_none = 0, - nvsp_status_success, - nvsp_status_failure, - /* Deprecated */ - nvsp_status_prot_vers_range_too_new, - /* Deprecated */ - nvsp_status_prot_vers_range_too_old, - nvsp_status_invalid_rndis_pkt, - nvsp_status_busy, - nvsp_status_max, -} nvsp_status; - -typedef struct nvsp_msg_hdr_ { - uint32_t msg_type; -} __packed nvsp_msg_hdr; - -/* - * Init Messages - */ - -/* - * This message is used by the VSC to initialize the channel - * after the channels has been opened. This message should - * never include anything other then versioning (i.e. this - * message will be the same for ever). - * - * Forever is a long time. 
The values have been redefined - * in Win7 to indicate major and minor protocol version - * number. - */ -typedef struct nvsp_msg_init_ { - union { - struct { - uint16_t minor_protocol_version; - uint16_t major_protocol_version; - } s; - /* Formerly min_protocol_version */ - uint32_t protocol_version; - } p1; - /* Formerly max_protocol_version */ - uint32_t protocol_version_2; -} __packed nvsp_msg_init; - -/* - * This message is used by the VSP to complete the initialization - * of the channel. This message should never include anything other - * then versioning (i.e. this message will be the same forever). - */ -typedef struct nvsp_msg_init_complete_ { - /* Deprecated */ - uint32_t negotiated_prot_vers; - uint32_t max_mdl_chain_len; - uint32_t status; -} __packed nvsp_msg_init_complete; - -typedef union nvsp_msg_init_uber_ { - nvsp_msg_init init; - nvsp_msg_init_complete init_compl; -} __packed nvsp_msg_init_uber; - -/* - * Version 1 Messages - */ - -/* - * This message is used by the VSC to send the NDIS version - * to the VSP. The VSP can use this information when handling - * OIDs sent by the VSC. - */ -typedef struct nvsp_1_msg_send_ndis_version_ { - uint32_t ndis_major_vers; - /* Deprecated */ - uint32_t ndis_minor_vers; -} __packed nvsp_1_msg_send_ndis_version; - -/* - * This message is used by the VSC to send a receive buffer - * to the VSP. The VSP can then use the receive buffer to - * send data to the VSC. - */ -typedef struct nvsp_1_msg_send_rx_buf_ { - uint32_t gpadl_handle; - uint16_t id; -} __packed nvsp_1_msg_send_rx_buf; - -typedef struct nvsp_1_rx_buf_section_ { - uint32_t offset; - uint32_t sub_allocation_size; - uint32_t num_sub_allocations; - uint32_t end_offset; -} __packed nvsp_1_rx_buf_section; - -/* - * This message is used by the VSP to acknowledge a receive - * buffer send by the VSC. This message must be sent by the - * VSP before the VSP uses the receive buffer. 
- */ -typedef struct nvsp_1_msg_send_rx_buf_complete_ { - uint32_t status; - uint32_t num_sections; - - /* - * The receive buffer is split into two parts, a large - * suballocation section and a small suballocation - * section. These sections are then suballocated by a - * certain size. - * - * For example, the following break up of the receive - * buffer has 6 large suballocations and 10 small - * suballocations. - * - * | Large Section | | Small Section | - * ------------------------------------------------------------ - * | | | | | | | | | | | | | | | | | | - * | | - * LargeOffset SmallOffset - */ - nvsp_1_rx_buf_section sections[1]; - -} __packed nvsp_1_msg_send_rx_buf_complete; - -/* - * This message is sent by the VSC to revoke the receive buffer. - * After the VSP completes this transaction, the VSP should never - * use the receive buffer again. - */ -typedef struct nvsp_1_msg_revoke_rx_buf_ { - uint16_t id; -} __packed nvsp_1_msg_revoke_rx_buf; - -/* - * This message is used by the VSC to send a send buffer - * to the VSP. The VSC can then use the send buffer to - * send data to the VSP. - */ -typedef struct nvsp_1_msg_send_send_buf_ { - uint32_t gpadl_handle; - uint16_t id; -} __packed nvsp_1_msg_send_send_buf; - -/* - * This message is used by the VSP to acknowledge a send - * buffer sent by the VSC. This message must be sent by the - * VSP before the VSP uses the sent buffer. - */ -typedef struct nvsp_1_msg_send_send_buf_complete_ { - uint32_t status; - - /* - * The VSC gets to choose the size of the send buffer and - * the VSP gets to choose the sections size of the buffer. - * This was done to enable dynamic reconfigurations when - * the cost of GPA-direct buffers decreases. - */ - uint32_t section_size; -} __packed nvsp_1_msg_send_send_buf_complete; - -/* - * This message is sent by the VSC to revoke the send buffer. - * After the VSP completes this transaction, the vsp should never - * use the send buffer again. 
- */ -typedef struct nvsp_1_msg_revoke_send_buf_ { - uint16_t id; -} __packed nvsp_1_msg_revoke_send_buf; - -/* - * This message is used by both the VSP and the VSC to send - * an RNDIS message to the opposite channel endpoint. - */ -typedef struct nvsp_1_msg_send_rndis_pkt_ { - /* - * This field is specified by RNIDS. They assume there's - * two different channels of communication. However, - * the Network VSP only has one. Therefore, the channel - * travels with the RNDIS packet. - */ - uint32_t chan_type; - - /* - * This field is used to send part or all of the data - * through a send buffer. This values specifies an - * index into the send buffer. If the index is - * 0xFFFFFFFF, then the send buffer is not being used - * and all of the data was sent through other VMBus - * mechanisms. - */ - uint32_t send_buf_section_idx; - uint32_t send_buf_section_size; -} __packed nvsp_1_msg_send_rndis_pkt; - -/* - * This message is used by both the VSP and the VSC to complete - * a RNDIS message to the opposite channel endpoint. At this - * point, the initiator of this message cannot use any resources - * associated with the original RNDIS packet. - */ -typedef struct nvsp_1_msg_send_rndis_pkt_complete_ { - uint32_t status; -} __packed nvsp_1_msg_send_rndis_pkt_complete; - - -/* - * Version 2 Messages - */ - -/* - * This message is used by the VSC to send the NDIS version - * to the VSP. The VSP can use this information when handling - * OIDs sent by the VSC. 
- */ -typedef struct nvsp_2_netvsc_capabilities_ { - union { - uint64_t as_uint64; - struct { - uint64_t vmq : 1; - uint64_t chimney : 1; - uint64_t sriov : 1; - uint64_t ieee8021q : 1; - uint64_t correlationid : 1; - uint64_t teaming : 1; - } u2; - } u1; -} __packed nvsp_2_netvsc_capabilities; - -typedef struct nvsp_2_msg_send_ndis_config_ { - uint32_t mtu; - uint32_t reserved; - nvsp_2_netvsc_capabilities capabilities; -} __packed nvsp_2_msg_send_ndis_config; - -/* - * NvspMessage2TypeSendChimneyDelegatedBuffer - */ -typedef struct nvsp_2_msg_send_chimney_buf_ -{ - /* - * On WIN7 beta, delegated_obj_max_size is defined as a uint32_t - * Since WIN7 RC, it was split into two uint16_t. To have the same - * struct layout, delegated_obj_max_size shall be the first field. - */ - uint16_t delegated_obj_max_size; - - /* - * The revision # of chimney protocol used between NVSC and NVSP. - * - * This revision is NOT related to the chimney revision between - * NDIS protocol and miniport drivers. - */ - uint16_t revision; - - uint32_t gpadl_handle; -} __packed nvsp_2_msg_send_chimney_buf; - - -/* Unsupported chimney revision 0 (only present in WIN7 beta) */ -#define NVSP_CHIMNEY_REVISION_0 0 - -/* WIN7 Beta Chimney QFE */ -#define NVSP_CHIMNEY_REVISION_1 1 - -/* The chimney revision since WIN7 RC */ -#define NVSP_CHIMNEY_REVISION_2 2 - - -/* - * NvspMessage2TypeSendChimneyDelegatedBufferComplete - */ -typedef struct nvsp_2_msg_send_chimney_buf_complete_ { - uint32_t status; - - /* - * Maximum number outstanding sends and pre-posted receives. - * - * NVSC should not post more than SendQuota/ReceiveQuota packets. - * Otherwise, it can block the non-chimney path for an indefinite - * amount of time. - * (since chimney sends/receives are affected by the remote peer). - * - * Note: NVSP enforces the quota restrictions on a per-VMBCHANNEL - * basis. It doesn't enforce the restriction separately for chimney - * send/receive. 
If NVSC doesn't voluntarily enforce "SendQuota", - * it may kill its own network connectivity. - */ - uint32_t send_quota; - uint32_t rx_quota; -} __packed nvsp_2_msg_send_chimney_buf_complete; - -/* - * NvspMessage2TypeRevokeChimneyDelegatedBuffer - */ -typedef struct nvsp_2_msg_revoke_chimney_buf_ { - uint32_t gpadl_handle; -} __packed nvsp_2_msg_revoke_chimney_buf; - - -#define NVSP_CHIMNEY_OBJECT_TYPE_NEIGHBOR 0 -#define NVSP_CHIMNEY_OBJECT_TYPE_PATH4 1 -#define NVSP_CHIMNEY_OBJECT_TYPE_PATH6 2 -#define NVSP_CHIMNEY_OBJECT_TYPE_TCP 3 - -/* - * NvspMessage2TypeAllocateChimneyHandle - */ -typedef struct nvsp_2_msg_alloc_chimney_handle_ { - uint64_t vsc_context; - uint32_t object_type; -} __packed nvsp_2_msg_alloc_chimney_handle; - -/* - * NvspMessage2TypeAllocateChimneyHandleComplete - */ -typedef struct nvsp_2_msg_alloc_chimney_handle_complete_ { - uint32_t vsp_handle; -} __packed nvsp_2_msg_alloc_chimney_handle_complete; - - -/* - * NvspMessage2TypeResumeChimneyRXIndication - */ -typedef struct nvsp_2_msg_resume_chimney_rx_indication { - /* - * Handle identifying the offloaded connection - */ - uint32_t vsp_tcp_handle; -} __packed nvsp_2_msg_resume_chimney_rx_indication; - - -#define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_FIRST_STAGE (0x01u) -#define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_RESERVED (~(0x01u)) - -/* - * NvspMessage2TypeTerminateChimney - */ -typedef struct nvsp_2_msg_terminate_chimney_ { - /* - * Handle identifying the offloaded object - */ - uint32_t vsp_handle; - - /* - * Terminate Offload Flags - * Bit 0: - * When set to 0, terminate the offload at the destination NIC - * Bit 1-31: Reserved, shall be zero - */ - uint32_t flags; - - union { - /* - * This field is valid only when bit 0 of flags is clear. - * It specifies the index into the premapped delegated - * object buffer. The buffer was sent through the - * NvspMessage2TypeSendChimneyDelegatedBuffer - * message at initialization time. 
- * - * NVSP will write the delegated state into the delegated - * buffer upon upload completion. - */ - uint32_t index; - - /* - * This field is valid only when bit 0 of flags is set. - * - * The seqence number of the most recently accepted RX - * indication when VSC sets its TCP context into - * "terminating" state. - * - * This allows NVSP to determines if there are any in-flight - * RX indications for which the acceptance state is still - * undefined. - */ - uint64_t last_accepted_rx_seq_no; - } f0; -} __packed nvsp_2_msg_terminate_chimney; - - -#define NVSP_TERMINATE_CHIMNEY_COMPLETE_FLAG_DATA_CORRUPTED 0x0000001u - -/* - * NvspMessage2TypeTerminateChimneyComplete - */ -typedef struct nvsp_2_msg_terminate_chimney_complete_ { - uint64_t vsc_context; - uint32_t flags; -} __packed nvsp_2_msg_terminate_chimney_complete; - -/* - * NvspMessage2TypeIndicateChimneyEvent - */ -typedef struct nvsp_2_msg_indicate_chimney_event_ { - /* - * When VscTcpContext is 0, event_type is an NDIS_STATUS event code - * Otherwise, EventType is an TCP connection event (defined in - * NdisTcpOffloadEventHandler chimney DDK document). - */ - uint32_t event_type; - - /* - * When VscTcpContext is 0, EventType is an NDIS_STATUS event code - * Otherwise, EventType is an TCP connection event specific information - * (defined in NdisTcpOffloadEventHandler chimney DDK document). - */ - uint32_t event_specific_info; - - /* - * If not 0, the event is per-TCP connection event. This field - * contains the VSC's TCP context. - * If 0, the event indication is global. 
- */ - uint64_t vsc_tcp_context; -} __packed nvsp_2_msg_indicate_chimney_event; - - -#define NVSP_1_CHIMNEY_SEND_INVALID_OOB_INDEX 0xffffu -#define NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX 0xffffffff - -/* - * NvspMessage2TypeSendChimneyPacket - */ -typedef struct nvsp_2_msg_send_chimney_pkt_ { - /* - * Identify the TCP connection for which this chimney send is - */ - uint32_t vsp_tcp_handle; - - /* - * This field is used to send part or all of the data - * through a send buffer. This values specifies an - * index into the send buffer. If the index is - * 0xFFFF, then the send buffer is not being used - * and all of the data was sent through other VMBus - * mechanisms. - */ - uint16_t send_buf_section_index; - uint16_t send_buf_section_size; - - /* - * OOB Data Index - * This an index to the OOB data buffer. If the index is 0xFFFFFFFF, - * then there is no OOB data. - * - * This field shall be always 0xFFFFFFFF for now. It is reserved for - * the future. - */ - uint16_t oob_data_index; - - /* - * DisconnectFlags = 0 - * Normal chimney send. See MiniportTcpOffloadSend for details. - * - * DisconnectFlags = TCP_DISCONNECT_GRACEFUL_CLOSE (0x01) - * Graceful disconnect. See MiniportTcpOffloadDisconnect for details. - * - * DisconnectFlags = TCP_DISCONNECT_ABORTIVE_CLOSE (0x02) - * Abortive disconnect. See MiniportTcpOffloadDisconnect for details. - */ - uint16_t disconnect_flags; - - uint32_t seq_no; -} __packed nvsp_2_msg_send_chimney_pkt; - -/* - * NvspMessage2TypeSendChimneyPacketComplete - */ -typedef struct nvsp_2_msg_send_chimney_pkt_complete_ { - /* - * The NDIS_STATUS for the chimney send - */ - uint32_t status; - - /* - * Number of bytes that have been sent to the peer (and ACKed by the peer). 
- */ - uint32_t bytes_transferred; -} __packed nvsp_2_msg_send_chimney_pkt_complete; - - -#define NVSP_1_CHIMNEY_RECV_FLAG_NO_PUSH 0x0001u -#define NVSP_1_CHIMNEY_RECV_INVALID_OOB_INDEX 0xffffu - -/* - * NvspMessage2TypePostChimneyRecvRequest - */ -typedef struct nvsp_2_msg_post_chimney_rx_request_ { - /* - * Identify the TCP connection which this chimney receive request - * is for. - */ - uint32_t vsp_tcp_handle; - - /* - * OOB Data Index - * This an index to the OOB data buffer. If the index is 0xFFFFFFFF, - * then there is no OOB data. - * - * This field shall be always 0xFFFFFFFF for now. It is reserved for - * the future. - */ - uint32_t oob_data_index; - - /* - * Bit 0 - * When it is set, this is a "no-push" receive. - * When it is clear, this is a "push" receive. - * - * Bit 1-15: Reserved and shall be zero - */ - uint16_t flags; - - /* - * For debugging and diagnoses purpose. - * The SeqNo is per TCP connection and starts from 0. - */ - uint32_t seq_no; -} __packed nvsp_2_msg_post_chimney_rx_request; - -/* - * NvspMessage2TypePostChimneyRecvRequestComplete - */ -typedef struct nvsp_2_msg_post_chimney_rx_request_complete_ { - /* - * The NDIS_STATUS for the chimney send - */ - uint32_t status; - - /* - * Number of bytes that have been sent to the peer (and ACKed by - * the peer). 
- */ - uint32_t bytes_xferred; -} __packed nvsp_2_msg_post_chimney_rx_request_complete; - -/* - * NvspMessage2TypeAllocateReceiveBuffer - */ -typedef struct nvsp_2_msg_alloc_rx_buf_ { - /* - * Allocation ID to match the allocation request and response - */ - uint32_t allocation_id; - - /* - * Length of the VM shared memory receive buffer that needs to - * be allocated - */ - uint32_t length; -} __packed nvsp_2_msg_alloc_rx_buf; - -/* - * NvspMessage2TypeAllocateReceiveBufferComplete - */ -typedef struct nvsp_2_msg_alloc_rx_buf_complete_ { - /* - * The NDIS_STATUS code for buffer allocation - */ - uint32_t status; - - /* - * Allocation ID from NVSP_2_MESSAGE_ALLOCATE_RECEIVE_BUFFER - */ - uint32_t allocation_id; - - /* - * GPADL handle for the allocated receive buffer - */ - uint32_t gpadl_handle; - - /* - * Receive buffer ID that is further used in - * NvspMessage2SendVmqRndisPacket - */ - uint64_t rx_buf_id; -} __packed nvsp_2_msg_alloc_rx_buf_complete; - -/* - * NvspMessage2TypeFreeReceiveBuffer - */ -typedef struct nvsp_2_msg_free_rx_buf_ { - /* - * Receive buffer ID previous returned in - * NvspMessage2TypeAllocateReceiveBufferComplete message - */ - uint64_t rx_buf_id; -} __packed nvsp_2_msg_free_rx_buf; - -/* - * This structure is used in defining the buffers in - * NVSP_2_MESSAGE_SEND_VMQ_RNDIS_PACKET structure - */ -typedef struct nvsp_xfer_page_range_ { - /* - * Specifies the ID of the receive buffer that has the buffer. 
This - * ID can be the general receive buffer ID specified in - * NvspMessage1TypeSendReceiveBuffer or it can be the shared memory - * receive buffer ID allocated by the VSC and specified in - * NvspMessage2TypeAllocateReceiveBufferComplete message - */ - uint64_t xfer_page_set_id; - - /* - * Number of bytes - */ - uint32_t byte_count; - - /* - * Offset in bytes from the beginning of the buffer - */ - uint32_t byte_offset; -} __packed nvsp_xfer_page_range; - -/* - * NvspMessage2SendVmqRndisPacket - */ -typedef struct nvsp_2_msg_send_vmq_rndis_pkt_ { - /* - * This field is specified by RNIDS. They assume there's - * two different channels of communication. However, - * the Network VSP only has one. Therefore, the channel - * travels with the RNDIS packet. It must be RMC_DATA - */ - uint32_t channel_type; - - /* - * Only the Range element corresponding to the RNDIS header of - * the first RNDIS message in the multiple RNDIS messages sent - * in one NVSP message. Information about the data portions as well - * as the subsequent RNDIS messages in the same NVSP message are - * embedded in the RNDIS header itself - */ - nvsp_xfer_page_range range; -} __packed nvsp_2_msg_send_vmq_rndis_pkt; - -/* - * This message is used by the VSC to complete - * a RNDIS VMQ message to the VSP. At this point, - * the initiator of this message can use any resources - * associated with the original RNDIS VMQ packet. 
- */ -typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_ -{ - uint32_t status; -} __packed nvsp_2_msg_send_vmq_rndis_pkt_complete; - -/* - * Version 5 messages - */ -enum nvsp_subchannel_operation { - NVSP_SUBCHANNEL_NONE = 0, - NVSP_SUBCHANNE_ALLOCATE, - NVSP_SUBCHANNE_MAX -}; - -typedef struct nvsp_5_subchannel_request_ -{ - uint32_t op; - uint32_t num_subchannels; -} __packed nvsp_5_subchannel_request; - -typedef struct nvsp_5_subchannel_complete_ -{ - uint32_t status; - /* Actual number of subchannels allocated */ - uint32_t num_subchannels; -} __packed nvsp_5_subchannel_complete; - -typedef struct nvsp_5_send_indirect_table_ -{ - /* The number of entries in the send indirection table */ - uint32_t count; - /* - * The offset of the send indireciton table from top of - * this struct. The send indirection table tells which channel - * to put the send traffic on. Each entry is a channel number. - */ - uint32_t offset; -} __packed nvsp_5_send_indirect_table; - -typedef union nvsp_1_msg_uber_ { - nvsp_1_msg_send_ndis_version send_ndis_vers; - - nvsp_1_msg_send_rx_buf send_rx_buf; - nvsp_1_msg_send_rx_buf_complete send_rx_buf_complete; - nvsp_1_msg_revoke_rx_buf revoke_rx_buf; - - nvsp_1_msg_send_send_buf send_send_buf; - nvsp_1_msg_send_send_buf_complete send_send_buf_complete; - nvsp_1_msg_revoke_send_buf revoke_send_buf; - - nvsp_1_msg_send_rndis_pkt send_rndis_pkt; - nvsp_1_msg_send_rndis_pkt_complete send_rndis_pkt_complete; -} __packed nvsp_1_msg_uber; - - -typedef union nvsp_2_msg_uber_ { - nvsp_2_msg_send_ndis_config send_ndis_config; - - nvsp_2_msg_send_chimney_buf send_chimney_buf; - nvsp_2_msg_send_chimney_buf_complete send_chimney_buf_complete; - nvsp_2_msg_revoke_chimney_buf revoke_chimney_buf; - - nvsp_2_msg_resume_chimney_rx_indication resume_chimney_rx_indication; - nvsp_2_msg_terminate_chimney terminate_chimney; - nvsp_2_msg_terminate_chimney_complete terminate_chimney_complete; - nvsp_2_msg_indicate_chimney_event indicate_chimney_event; - - 
nvsp_2_msg_send_chimney_pkt send_chimney_packet; - nvsp_2_msg_send_chimney_pkt_complete send_chimney_packet_complete; - nvsp_2_msg_post_chimney_rx_request post_chimney_rx_request; - nvsp_2_msg_post_chimney_rx_request_complete - post_chimney_rx_request_complete; - - nvsp_2_msg_alloc_rx_buf alloc_rx_buffer; - nvsp_2_msg_alloc_rx_buf_complete alloc_rx_buffer_complete; - nvsp_2_msg_free_rx_buf free_rx_buffer; - - nvsp_2_msg_send_vmq_rndis_pkt send_vmq_rndis_pkt; - nvsp_2_msg_send_vmq_rndis_pkt_complete send_vmq_rndis_pkt_complete; - nvsp_2_msg_alloc_chimney_handle alloc_chimney_handle; - nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete; -} __packed nvsp_2_msg_uber; - -typedef union nvsp_5_msg_uber_ -{ - nvsp_5_subchannel_request subchannel_request; - nvsp_5_subchannel_complete subchn_complete; - nvsp_5_send_indirect_table send_table; -} __packed nvsp_5_msg_uber; - -typedef union nvsp_all_msgs_ { - nvsp_msg_init_uber init_msgs; - nvsp_1_msg_uber vers_1_msgs; - nvsp_2_msg_uber vers_2_msgs; - nvsp_5_msg_uber vers_5_msgs; -} __packed nvsp_all_msgs; - -/* - * ALL Messages - */ -typedef struct nvsp_msg_ { - nvsp_msg_hdr hdr; - nvsp_all_msgs msgs; -} __packed nvsp_msg; - - /* * The following arguably belongs in a separate header file */ @@ -1014,18 +77,10 @@ typedef struct nvsp_msg_ { */ #define NETVSC_SEND_BUFFER_SIZE (1024*1024*15) /* 15M */ -#define NETVSC_SEND_BUFFER_ID 0xface #define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY (1024*1024*15) /* 15MB */ #define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024*16) /* 16MB */ -#define NETVSC_RECEIVE_BUFFER_ID 0xcafe - -#define NETVSC_RECEIVE_SG_COUNT 1 - -/* Preallocated receive packets */ -#define NETVSC_RECEIVE_PACKETLIST_COUNT 256 - /* * Maximum MTU we permit to be configured for a netvsc interface. 
* When the code was developed, a max MTU of 12232 was tested and @@ -1034,117 +89,20 @@ typedef struct nvsp_msg_ { #define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) #define NETVSC_PACKET_SIZE PAGE_SIZE -#define VRSS_SEND_TABLE_SIZE 16 /* * Data types */ -/* - * Per netvsc channel-specific - */ -typedef struct netvsc_dev_ { - struct hn_softc *sc; - - /* Send buffer allocated by us but manages by NetVSP */ - void *send_buf; - uint32_t send_buf_size; - uint32_t send_buf_gpadl_handle; - uint32_t send_section_size; - uint32_t send_section_count; - unsigned long bitsmap_words; - unsigned long *send_section_bitsmap; - - /* Receive buffer allocated by us but managed by NetVSP */ - void *rx_buf; - uint32_t rx_buf_size; - uint32_t rx_buf_gpadl_handle; - uint32_t rx_section_count; - nvsp_1_rx_buf_section *rx_sections; - - /* Used for NetVSP initialization protocol */ - struct sema channel_init_sema; - nvsp_msg channel_init_packet; - - nvsp_msg revoke_packet; - /*uint8_t hw_mac_addr[ETHER_ADDR_LEN];*/ - - /* Holds rndis device info */ - void *extension; - - uint8_t destroy; - /* Negotiated NVSP version */ - uint32_t nvsp_version; - - uint32_t num_channel; - - struct hyperv_dma rxbuf_dma; - struct hyperv_dma txbuf_dma; - uint32_t vrss_send_table[VRSS_SEND_TABLE_SIZE]; -} netvsc_dev; - struct vmbus_channel; -typedef void (*pfn_on_send_rx_completion)(struct vmbus_channel *, void *); - #define NETVSC_DEVICE_RING_BUFFER_SIZE (128 * PAGE_SIZE) #define NETVSC_PACKET_MAXPAGE 32 -#define NETVSC_VLAN_PRIO_MASK 0xe000 -#define NETVSC_VLAN_PRIO_SHIFT 13 -#define NETVSC_VLAN_VID_MASK 0x0fff - -#define TYPE_IPV4 2 -#define TYPE_IPV6 4 -#define TYPE_TCP 2 -#define TYPE_UDP 4 - -#define TRANSPORT_TYPE_NOT_IP 0 -#define TRANSPORT_TYPE_IPV4_TCP ((TYPE_IPV4 << 16) | TYPE_TCP) -#define TRANSPORT_TYPE_IPV4_UDP ((TYPE_IPV4 << 16) | TYPE_UDP) -#define TRANSPORT_TYPE_IPV6_TCP ((TYPE_IPV6 << 16) | TYPE_TCP) -#define TRANSPORT_TYPE_IPV6_UDP ((TYPE_IPV6 << 16) | TYPE_UDP) - -#ifdef __LP64__ -#define 
BITS_PER_LONG 64 -#else -#define BITS_PER_LONG 32 -#endif - -typedef struct netvsc_packet_ { - uint8_t is_data_pkt; /* One byte */ - uint16_t vlan_tci; - uint32_t status; - - /* Completion */ - union { - struct { - uint64_t rx_completion_tid; - void *rx_completion_context; - /* This is no longer used */ - pfn_on_send_rx_completion on_rx_completion; - } rx; - struct { - uint64_t send_completion_tid; - void *send_completion_context; - /* Still used in netvsc and filter code */ - pfn_on_send_rx_completion on_send_completion; - } send; - } compl; - uint32_t send_buf_section_idx; - uint32_t send_buf_section_size; - - void *rndis_mesg; - uint32_t tot_data_buf_len; - void *data; - uint32_t gpa_cnt; - struct vmbus_gpa gpa[NETVSC_PACKET_MAXPAGE]; -} netvsc_packet; - -typedef struct { - uint8_t mac_addr[6]; /* Assumption unsigned long */ - uint8_t link_state; -} netvsc_device_info; +#define HN_XACT_REQ_PGCNT 2 +#define HN_XACT_RESP_PGCNT 2 +#define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) +#define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) #ifndef HN_USE_TXDESC_BUFRING struct hn_txdesc; @@ -1159,6 +117,7 @@ struct hn_rx_ring { struct ifnet *hn_ifp; struct hn_tx_ring *hn_txr; void *hn_rdbuf; + uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ int hn_rx_idx; /* Trust csum verification on host side */ @@ -1177,6 +136,9 @@ struct hn_rx_ring { /* Rarely used stuffs */ struct sysctl_oid *hn_rx_sysctl_tree; int hn_rx_flags; + + void *hn_br; /* TX/RX bufring */ + struct hyperv_dma hn_br_dma; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 @@ -1206,16 +168,21 @@ struct hn_tx_ring { struct buf_ring *hn_mbuf_br; int hn_oactive; int hn_tx_idx; + int hn_tx_flags; struct mtx hn_tx_lock; struct hn_softc *hn_sc; struct vmbus_channel *hn_chan; int hn_direct_tx_size; - int hn_tx_chimney_size; + int hn_chim_size; bus_dma_tag_t hn_tx_data_dtag; uint64_t hn_csum_assist; + int hn_suspended; + int hn_gpa_cnt; + struct vmbus_gpa hn_gpa[NETVSC_PACKET_MAXPAGE]; + u_long 
hn_no_txdescs; u_long hn_send_failed; u_long hn_txdma_failed; @@ -1228,27 +195,22 @@ struct hn_tx_ring { struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; - int hn_tx_flags; } __aligned(CACHE_LINE_SIZE); #define HN_TX_FLAG_ATTACHED 0x1 +#define HN_TX_FLAG_HASHVAL 0x2 /* support HASHVAL pktinfo */ /* * Device-specific softc structure */ -typedef struct hn_softc { +struct hn_softc { struct ifnet *hn_ifp; struct arpcom arpcom; struct ifmedia hn_media; device_t hn_dev; - uint8_t hn_unit; int hn_carrier; int hn_if_flags; - struct mtx hn_lock; - int hn_initdone; - /* See hv_netvsc_drv_freebsd.c for rules on how to use */ - int temp_unusable; - netvsc_dev *net_dev; + struct sx hn_lock; struct vmbus_channel *hn_prichan; int hn_rx_ring_cnt; @@ -1259,27 +221,63 @@ typedef struct hn_softc { int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; + uint8_t *hn_chim; + u_long *hn_chim_bmap; + int hn_chim_bmap_cnt; + int hn_chim_cnt; + int hn_chim_szmax; + int hn_cpu; - int hn_tx_chimney_max; struct taskqueue *hn_tx_taskq; struct sysctl_oid *hn_tx_sysctl_tree; struct sysctl_oid *hn_rx_sysctl_tree; -} hn_softc_t; + struct vmbus_xact_ctx *hn_xact; + uint32_t hn_nvs_ver; + + struct taskqueue *hn_mgmt_taskq; + struct taskqueue *hn_mgmt_taskq0; + struct task hn_link_task; + + uint32_t hn_caps; /* HN_CAP_ */ + uint32_t hn_flags; /* HN_FLAG_ */ + void *hn_rxbuf; + uint32_t hn_rxbuf_gpadl; + struct hyperv_dma hn_rxbuf_dma; + + uint32_t hn_chim_gpadl; + struct hyperv_dma hn_chim_dma; + + uint32_t hn_rndis_rid; + uint32_t hn_ndis_ver; + int hn_ndis_tso_szmax; + int hn_ndis_tso_sgmin; + + struct ndis_rssprm_toeplitz hn_rss; +}; + +#define HN_FLAG_RXBUF_CONNECTED 0x0001 +#define HN_FLAG_CHIM_CONNECTED 0x0002 +#define HN_FLAG_HAS_RSSKEY 0x0004 +#define HN_FLAG_HAS_RSSIND 0x0008 +#define HN_FLAG_SYNTH_ATTACHED 0x0010 + +#define HN_CAP_VLAN 0x0001 +#define HN_CAP_MTU 0x0002 +#define HN_CAP_IPCS 0x0004 +#define HN_CAP_TCP4CS 0x0008 +#define 
HN_CAP_TCP6CS 0x0010 +#define HN_CAP_UDP4CS 0x0020 +#define HN_CAP_UDP6CS 0x0040 +#define HN_CAP_TSO4 0x0080 +#define HN_CAP_TSO6 0x0100 /* * Externs */ -extern int hv_promisc_mode; +struct hn_send_ctx; -void netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status); -netvsc_dev *hv_nv_on_device_add(struct hn_softc *sc, - void *additional_info, struct hn_rx_ring *rxr); -int hv_nv_on_device_remove(struct hn_softc *sc, - boolean_t destroy_channel); -int hv_nv_on_send(struct vmbus_channel *chan, netvsc_packet *pkt); -int hv_nv_get_next_send_section(netvsc_dev *net_dev); -void hv_nv_subchan_attach(struct vmbus_channel *chan, - struct hn_rx_ring *rxr); +int hv_nv_on_send(struct vmbus_channel *chan, uint32_t rndis_mtype, + struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt); #endif /* __HV_NET_VSC_H__ */ diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index 54301af..6137314 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -79,7 +79,7 @@ __FBSDID("$FreeBSD$"); #include <net/ethernet.h> #include <net/if_dl.h> #include <net/if_media.h> - +#include <net/rndis.h> #include <net/bpf.h> #include <net/if_types.h> @@ -117,10 +117,12 @@ __FBSDID("$FreeBSD$"); #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus_xact.h> + +#include <dev/hyperv/netvsc/hv_net_vsc.h> +#include <dev/hyperv/netvsc/hv_rndis_filter.h> +#include <dev/hyperv/netvsc/ndis.h> -#include "hv_net_vsc.h" -#include "hv_rndis.h" -#include "hv_rndis_filter.h" #include "vmbus_if.h" /* Short for Hyper-V network interface */ @@ -141,20 +143,20 @@ __FBSDID("$FreeBSD$"); #define HN_RING_CNT_DEF_MAX 8 -#define HN_RNDIS_MSG_LEN \ - (sizeof(rndis_msg) + \ - RNDIS_HASHVAL_PPI_SIZE + \ - RNDIS_VLAN_PPI_SIZE + \ - RNDIS_TSO_PPI_SIZE + \ - RNDIS_CSUM_PPI_SIZE) -#define HN_RNDIS_MSG_BOUNDARY PAGE_SIZE -#define 
HN_RNDIS_MSG_ALIGN CACHE_LINE_SIZE +#define HN_RNDIS_PKT_LEN \ + (sizeof(struct rndis_packet_msg) + \ + HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ + HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ + HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ + HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) +#define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE +#define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE -#define HN_TX_DATA_SEGCNT_MAX \ - (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS) +/* -1 for RNDIS packet message */ +#define HN_TX_DATA_SEGCNT_MAX (NETVSC_PACKET_MAXPAGE - 1) #define HN_DIRECT_TX_SIZE_DEF 128 @@ -168,26 +170,18 @@ struct hn_txdesc { struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ - netvsc_packet netvsc_pkt; /* XXX to be removed */ + struct hn_send_ctx send_ctx; bus_dmamap_t data_dmap; - bus_addr_t rndis_msg_paddr; - rndis_msg *rndis_msg; - bus_dmamap_t rndis_msg_dmap; + bus_addr_t rndis_pkt_paddr; + struct rndis_packet_msg *rndis_pkt; + bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x1 #define HN_TXD_FLAG_DMAMAP 0x2 -/* - * Only enable UDP checksum offloading when it is on 2012R2 or - * later. UDP checksum offloading doesn't work on earlier - * Windows releases. - */ -#define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP) -#define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) - #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ @@ -195,28 +189,24 @@ struct hn_txdesc { #define HN_LRO_ACKCNT_DEF 1 -/* - * Be aware that this sleepable mutex will exhibit WITNESS errors when - * certain TCP and ARP code paths are taken. This appears to be a - * well-known condition, as all other drivers checked use a sleeping - * mutex to protect their transmit paths. 
- * Also Be aware that mutexes do not play well with semaphores, and there - * is a conflicting semaphore in a certain channel code path. - */ -#define NV_LOCK_INIT(_sc, _name) \ - mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) -#define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) -#define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) -#define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) -#define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) +#define HN_LOCK_INIT(sc) \ + sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) +#define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) +#define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) +#define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock) +#define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) +#define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) +#define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) +#define HN_CSUM_IP_HWASSIST(sc) \ + ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) +#define HN_CSUM_IP6_HWASSIST(sc) \ + ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) /* * Globals */ -int hv_promisc_mode = 0; /* normal mode by default */ - SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); @@ -242,7 +232,7 @@ SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, "when csum info is missing (global setting)"); /* Limit TSO burst size */ -static int hn_tso_maxlen = 0; +static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); @@ -310,9 +300,9 @@ static u_int hn_cpu_index; /* * Forward declarations */ -static void hn_stop(hn_softc_t *sc); -static void hn_ifinit_locked(hn_softc_t *sc); -static void hn_ifinit(void *xsc); +static void hn_stop(struct hn_softc *sc); +static void hn_init_locked(struct hn_softc *sc); +static void hn_init(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct 
hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); @@ -324,7 +314,7 @@ static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); -static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else @@ -333,21 +323,54 @@ static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *, int); static void hn_destroy_tx_ring(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); +static void hn_fixup_tx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_start_taskfunc(void *, int); static void hn_start_txeof_taskfunc(void *, int); -static void hn_stop_tx_tasks(struct hn_softc *); +static void hn_link_taskfunc(void *, int); +static void hn_suspend_mgmt_taskfunc(void *, int); static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); -static void hn_create_rx_data(struct hn_softc *sc, int); +static int hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); -static void hn_set_tx_chimney_size(struct hn_softc *, int); -static void hn_channel_attach(struct hn_softc *, struct vmbus_channel *); -static void hn_subchan_attach(struct hn_softc *, struct vmbus_channel *); -static void 
hn_subchan_setup(struct hn_softc *); +static void hn_set_chim_size(struct hn_softc *, int); +static void hn_set_tso_maxsize(struct hn_softc *, int, int); +static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); +static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); +static int hn_attach_subchans(struct hn_softc *); +static void hn_detach_allchans(struct hn_softc *); +static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr); +static void hn_set_ring_inuse(struct hn_softc *, int); +static int hn_synth_attach(struct hn_softc *, int); +static void hn_synth_detach(struct hn_softc *); +static bool hn_tx_ring_pending(struct hn_tx_ring *); +static void hn_suspend(struct hn_softc *); +static void hn_suspend_data(struct hn_softc *); +static void hn_suspend_mgmt(struct hn_softc *); +static void hn_resume(struct hn_softc *); +static void hn_resume_data(struct hn_softc *); +static void hn_resume_mgmt(struct hn_softc *); +static void hn_rx_drain(struct vmbus_channel *); +static void hn_tx_resume(struct hn_softc *, int); +static void hn_tx_ring_qflush(struct hn_tx_ring *); +static int netvsc_detach(device_t dev); + +static void hn_nvs_handle_notify(struct hn_softc *sc, + const struct vmbus_chanpkt_hdr *pkt); +static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkt); +static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, + struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkthdr); +static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid); static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); @@ -356,6 +379,14 @@ static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof_taskfunc(void *, int); +static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 
0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) @@ -378,6 +409,67 @@ hn_get_txswq_depth(const struct hn_tx_ring *txr) } static int +hn_rss_reconfig(struct hn_softc *sc) +{ + int error; + + HN_LOCK_ASSERT(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) + return (ENXIO); + + /* + * Disable RSS first. + * + * NOTE: + * Direct reconfiguration by setting the UNCHG flags does + * _not_ work properly. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "disable RSS\n"); + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); + if (error) { + if_printf(sc->hn_ifp, "RSS disable failed\n"); + return (error); + } + + /* + * Reenable the RSS w/ the updated RSS key or indirect + * table. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "reconfig RSS\n"); + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); + if (error) { + if_printf(sc->hn_ifp, "RSS reconfig failed\n"); + return (error); + } + return (0); +} + +static void +hn_rss_ind_fixup(struct hn_softc *sc, int nchan) +{ + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + int i; + + KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); + + /* + * Check indirect table to make sure that all channels in it + * can be used. 
+ */ + for (i = 0; i < NDIS_HASH_INDCNT; ++i) { + if (rss->rss_ind[i] >= nchan) { + if_printf(sc->hn_ifp, + "RSS indirect table %d fixup: %u -> %d\n", + i, rss->rss_ind[i], nchan - 1); + rss->rss_ind[i] = nchan - 1; + } + } +} + +static int hn_ifmedia_upd(struct ifnet *ifp __unused) { @@ -443,19 +535,20 @@ hn_cpuset_setthread_task(void *xmask, int pending __unused) static int netvsc_attach(device_t dev) { - netvsc_device_info device_info; - hn_softc_t *sc; - int unit = device_get_unit(dev); + struct hn_softc *sc = device_get_softc(dev); + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; - int tso_maxlen; - - sc = device_get_softc(dev); - sc->hn_unit = unit; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); + HN_LOCK_INIT(sc); + /* + * Setup taskqueue for transmission. + */ if (hn_tx_taskq == NULL) { sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskq); @@ -477,13 +570,32 @@ netvsc_attach(device_t dev) } else { sc->hn_tx_taskq = hn_tx_taskq; } - NV_LOCK_INIT(sc, "NetVSCLock"); + /* + * Setup taskqueue for mangement tasks, e.g. link status. + */ + sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, + taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); + taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", + device_get_nameunit(dev)); + TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); + + /* + * Allocate ifnet and setup its name earlier, so that if_printf + * can be used by functions, which will be called after + * ether_ifattach(). + */ ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* + * Initialize ifmedia earlier so that it can be unconditionally + * destroyed, if error happened later on. 
+ */ + ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); + + /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * @@ -513,72 +625,35 @@ netvsc_attach(device_t dev) */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; + /* + * Create enough TX/RX rings, even if only limited number of + * channels can be allocated. + */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; - hn_create_rx_data(sc, ring_cnt); + error = hn_create_rx_data(sc, ring_cnt); + if (error) + goto failed; /* - * Associate the first TX/RX ring w/ the primary channel. + * Create transaction context for NVS and RNDIS transactions. */ - hn_channel_attach(sc, sc->hn_prichan); - - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_ioctl = hn_ioctl; - ifp->if_init = hn_ifinit; - /* needed by hv_rf_on_device_add() code */ - ifp->if_mtu = ETHERMTU; - if (hn_use_if_start) { - int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); - - ifp->if_start = hn_start; - IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); - ifp->if_snd.ifq_drv_maxlen = qdepth - 1; - IFQ_SET_READY(&ifp->if_snd); - } else { - ifp->if_transmit = hn_transmit; - ifp->if_qflush = hn_xmit_qflush; - } - - ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); - ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); - ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); - /* XXX ifmedia_set really should do this for us */ - sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; + sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), + HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); + if (sc->hn_xact == NULL) + goto failed; /* - * Tell upper layers that we support full VLAN capability. + * Attach the synthetic parts, i.e. NVS and RNDIS. 
*/ - ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); - ifp->if_capabilities |= - IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | - IFCAP_LRO; - ifp->if_capenable |= - IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | - IFCAP_LRO; - ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO; - - error = hv_rf_on_device_add(sc, &device_info, ring_cnt, - &sc->hn_rx_ring[0]); + error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; - KASSERT(sc->net_dev->num_channel > 0 && - sc->net_dev->num_channel <= sc->hn_rx_ring_inuse, - ("invalid channel count %u, should be less than %d", - sc->net_dev->num_channel, sc->hn_rx_ring_inuse)); - /* - * Set the # of TX/RX rings that could be used according to - * the # of channels that host offered. - */ - if (sc->hn_tx_ring_inuse > sc->net_dev->num_channel) - sc->hn_tx_ring_inuse = sc->net_dev->num_channel; - sc->hn_rx_ring_inuse = sc->net_dev->num_channel; - device_printf(dev, "%d TX ring, %d RX ring\n", - sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); - - if (sc->net_dev->num_channel > 1) - hn_subchan_setup(sc); + error = hn_rndis_get_eaddr(sc, eaddr); + if (error) + goto failed; #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { @@ -590,63 +665,137 @@ netvsc_attach(device_t dev) } #endif - if (device_info.link_state == 0) { - sc->hn_carrier = 1; + /* + * Fixup TX stuffs after synthetic parts are attached. 
+ */ + hn_fixup_tx_data(sc); + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, + &sc->hn_nvs_ver, 0, "NVS version"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_ndis_version_sysctl, "A", "NDIS version"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_caps_sysctl, "A", "capabilities"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_hwassist_sysctl, "A", "hwassist"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_rss_key_sysctl, "IU", "RSS key"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_rss_ind_sysctl, "IU", "RSS indirect table"); + + /* + * Setup the ifmedia, which has been initialized earlier. + */ + ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); + /* XXX ifmedia_set really should do this for us */ + sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; + + /* + * Setup the ifnet for this interface. + */ + + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_ioctl = hn_ioctl; + ifp->if_init = hn_init; + if (hn_use_if_start) { + int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); + + ifp->if_start = hn_start; + IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); + ifp->if_snd.ifq_drv_maxlen = qdepth - 1; + IFQ_SET_READY(&ifp->if_snd); + } else { + ifp->if_transmit = hn_transmit; + ifp->if_qflush = hn_xmit_qflush; } - tso_maxlen = hn_tso_maxlen; - if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) - tso_maxlen = IP_MAXPACKET; + ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO; +#ifdef foo + /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ + ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; +#endif + if (sc->hn_caps & HN_CAP_VLAN) { + /* XXX not sure about VLAN_MTU. */ + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; + } - ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; - ifp->if_hw_tsomaxsegsize = PAGE_SIZE; - ifp->if_hw_tsomax = tso_maxlen - - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; + if (ifp->if_hwassist & HN_CSUM_IP_MASK) + ifp->if_capabilities |= IFCAP_TXCSUM; + if (ifp->if_hwassist & HN_CSUM_IP6_MASK) + ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; + if (sc->hn_caps & HN_CAP_TSO4) { + ifp->if_capabilities |= IFCAP_TSO4; + ifp->if_hwassist |= CSUM_IP_TSO; + } + if (sc->hn_caps & HN_CAP_TSO6) { + ifp->if_capabilities |= IFCAP_TSO6; + ifp->if_hwassist |= CSUM_IP6_TSO; + } + + /* Enable all available capabilities by default. */ + ifp->if_capenable = ifp->if_capabilities; - ether_ifattach(ifp, device_info.mac_addr); + if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { + hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); + ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; + ifp->if_hw_tsomaxsegsize = PAGE_SIZE; + } - if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax, - ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); + ether_ifattach(ifp, eaddr); - sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); - if (hn_tx_chimney_size > 0 && - hn_tx_chimney_size < sc->hn_tx_chimney_max) - hn_set_tx_chimney_size(sc, hn_tx_chimney_size); + if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { + if_printf(ifp, "TSO segcnt %u segsz %u\n", + ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); + } + + /* Inform the upper layer about the long frame support. */ + ifp->if_hdrlen = sizeof(struct ether_vlan_header); + + /* + * Kick off link status check. 
+ */ + sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; + hn_link_status_update(sc); return (0); failed: - hn_destroy_tx_data(sc); - if (ifp != NULL) - if_free(ifp); + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) + hn_synth_detach(sc); + netvsc_detach(dev); return (error); } -/* - * Standard detach entry point - */ static int netvsc_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); + struct ifnet *ifp = sc->hn_ifp; - if (bootverbose) - printf("netvsc_detach\n"); - - /* - * XXXKYS: Need to clean up all our - * driver state; this is the driver - * unloading. - */ - - /* - * XXXKYS: Need to stop outgoing traffic and unregister - * the netdevice. - */ - - hv_rf_on_device_remove(sc, HV_RF_NV_DESTROY_CHANNEL); - - hn_stop_tx_tasks(sc); + if (device_is_attached(dev)) { + HN_LOCK(sc); + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_stop(sc); + /* + * NOTE: + * hn_stop() only suspends data, so managment + * stuffs have to be suspended manually here. + */ + hn_suspend_mgmt(sc); + hn_synth_detach(sc); + } + HN_UNLOCK(sc); + ether_ifdetach(ifp); + } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); @@ -654,7 +803,14 @@ netvsc_detach(device_t dev) if (sc->hn_tx_taskq != hn_tx_taskq) taskqueue_free(sc->hn_tx_taskq); + taskqueue_free(sc->hn_mgmt_taskq0); + if (sc->hn_xact != NULL) + vmbus_xact_ctx_destroy(sc->hn_xact); + + if_free(ifp); + + HN_LOCK_DESTROY(sc); return (0); } @@ -667,6 +823,36 @@ netvsc_shutdown(device_t dev) return (0); } +static void +hn_link_taskfunc(void *xsc, int pending __unused) +{ + struct hn_softc *sc = xsc; + struct ifnet *ifp = sc->hn_ifp; + uint32_t link_status; + int error; + + error = hn_rndis_get_linkstatus(sc, &link_status); + if (error) { + /* XXX what to do? */ + return; + } + + if (link_status == NDIS_MEDIA_STATE_CONNECTED) + sc->hn_carrier = 1; + else + sc->hn_carrier = 0; + if_link_state_change(ifp, + sc->hn_carrier ? 
LINK_STATE_UP : LINK_STATE_DOWN); +} + +void +hn_link_status_update(struct hn_softc *sc) +{ + + if (sc->hn_mgmt_taskq != NULL) + taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); +} + static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) @@ -785,6 +971,23 @@ hn_txdesc_hold(struct hn_txdesc *txd) atomic_add_int(&txd->refs, 1); } +static bool +hn_tx_ring_pending(struct hn_tx_ring *txr) +{ + bool pending = false; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) + pending = true; + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + if (!buf_ring_full(txr->hn_txdesc_br)) + pending = true; +#endif + return (pending); +} + static __inline void hn_txeof(struct hn_tx_ring *txr) { @@ -793,14 +996,14 @@ hn_txeof(struct hn_tx_ring *txr) } static void -hn_tx_done(struct vmbus_channel *chan, void *xpkt) +hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc, + struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { - netvsc_packet *packet = xpkt; - struct hn_txdesc *txd; + struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; - txd = (struct hn_txdesc *)(uintptr_t) - packet->compl.send.send_completion_tid; + if (sndc->hn_chim_idx != HN_NVS_CHIM_IDX_INVALID) + hn_chim_free(sc, sndc->hn_chim_idx); txr = txd->txr; KASSERT(txr->hn_chan == chan, @@ -819,7 +1022,7 @@ hn_tx_done(struct vmbus_channel *chan, void *xpkt) } void -netvsc_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) +hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; @@ -843,6 +1046,15 @@ netvsc_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) hn_txeof(txr); } +static __inline uint32_t +hn_rndis_pktmsg_offset(uint32_t ofs) +{ + + KASSERT(ofs >= sizeof(struct rndis_packet_msg), + ("invalid RNDIS packet msg 
offset %u", ofs)); + return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); +} + /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. @@ -853,16 +1065,10 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; - netvsc_packet *packet; - rndis_msg *rndis_mesg; - rndis_packet *rndis_pkt; - rndis_per_packet_info *rppi; - struct rndis_hash_value *hash_value; - uint32_t rndis_msg_size; - - packet = &txd->netvsc_pkt; - packet->is_data_pkt = TRUE; - packet->tot_data_buf_len = m_head->m_pkthdr.len; + struct rndis_packet_msg *pkt; + uint32_t send_buf_section_idx; + int send_buf_section_size, pktlen; + uint32_t *pi_data; /* * extension points to the area reserved for the @@ -870,45 +1076,36 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) * the netvsc_packet (and rppi struct, if present; * length is updated later). */ - rndis_mesg = txd->rndis_msg; - /* XXX not necessary */ - memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); - rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; - - rndis_pkt = &rndis_mesg->msg.packet; - rndis_pkt->data_offset = sizeof(rndis_packet); - rndis_pkt->data_length = packet->tot_data_buf_len; - rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); - - rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); - - /* - * Set the hash value for this packet, so that the host could - * dispatch the TX done event for this packet back to this TX - * ring's channel. 
- */ - rndis_msg_size += RNDIS_HASHVAL_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_HASHVAL_PPI_SIZE, - nbl_hash_value); - hash_value = (struct rndis_hash_value *)((uint8_t *)rppi + - rppi->per_packet_info_offset); - hash_value->hash_value = txr->hn_tx_idx; + pkt = txd->rndis_pkt; + pkt->rm_type = REMOTE_NDIS_PACKET_MSG; + pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; + pkt->rm_dataoffset = sizeof(*pkt); + pkt->rm_datalen = m_head->m_pkthdr.len; + pkt->rm_pktinfooffset = sizeof(*pkt); + pkt->rm_pktinfolen = 0; + + if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { + /* + * Set the hash value for this packet, so that the host could + * dispatch the TX done event for this packet back to this TX + * ring's channel. + */ + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); + *pi_data = txr->hn_tx_idx; + } if (m_head->m_flags & M_VLANTAG) { - ndis_8021q_info *rppi_vlan_info; - - rndis_msg_size += RNDIS_VLAN_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, - ieee_8021q_info); - - rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi + - rppi->per_packet_info_offset); - rppi_vlan_info->u1.s1.vlan_id = - m_head->m_pkthdr.ether_vtag & 0xfff; + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); + *pi_data = NDIS_VLAN_INFO_MAKE( + EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), + EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), + EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { - rndis_tcp_tso_info *tso_info; +#if defined(INET6) || defined(INET) struct ether_vlan_header *eh; int ether_len; @@ -921,15 +1118,8 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) else ether_len = ETHER_HDR_LEN; - rndis_msg_size += RNDIS_TSO_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE, - tcp_large_send_info); - - tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi + - 
rppi->per_packet_info_offset); - tso_info->lso_v2_xmit.type = - RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; - + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = @@ -938,13 +1128,12 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); - tso_info->lso_v2_xmit.ip_version = - RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip->ip_len = 0; ip->ip_sum = 0; - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, + m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) @@ -956,62 +1145,53 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); - tso_info->lso_v2_xmit.ip_version = - RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); + *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, + m_head->m_pkthdr.tso_segsz); } #endif - tso_info->lso_v2_xmit.tcp_header_offset = 0; - tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz; +#endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { - rndis_tcp_ip_csum_info *csum_info; - - rndis_msg_size += RNDIS_CSUM_PPI_SIZE; - rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, - tcpip_chksum_info); - csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi + - rppi->per_packet_info_offset); - - csum_info->xmit.is_ipv4 = 1; - if (m_head->m_pkthdr.csum_flags & CSUM_IP) - csum_info->xmit.ip_header_csum = 1; - - if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { - csum_info->xmit.tcp_csum = 1; - csum_info->xmit.tcp_header_offset = 0; - } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { - csum_info->xmit.udp_csum = 1; + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 
+ NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); + if (m_head->m_pkthdr.csum_flags & + (CSUM_IP6_TCP | CSUM_IP6_UDP)) { + *pi_data = NDIS_TXCSUM_INFO_IPV6; + } else { + *pi_data = NDIS_TXCSUM_INFO_IPV4; + if (m_head->m_pkthdr.csum_flags & CSUM_IP) + *pi_data |= NDIS_TXCSUM_INFO_IPCS; } + + if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) + *pi_data |= NDIS_TXCSUM_INFO_TCPCS; + else if (m_head->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP6_UDP)) + *pi_data |= NDIS_TXCSUM_INFO_UDPCS; } - rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; - packet->tot_data_buf_len = rndis_mesg->msg_len; + pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; + /* Convert RNDIS packet message offsets */ + pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); + pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Chimney send, if the packet could fit into one chimney buffer. */ - if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) { - netvsc_dev *net_dev = txr->hn_sc->net_dev; - uint32_t send_buf_section_idx; - + if (pkt->rm_len < txr->hn_chim_size) { txr->hn_tx_chimney_tried++; - send_buf_section_idx = - hv_nv_get_next_send_section(net_dev); - if (send_buf_section_idx != - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - uint8_t *dest = ((uint8_t *)net_dev->send_buf + - (send_buf_section_idx * - net_dev->send_section_size)); - - memcpy(dest, rndis_mesg, rndis_msg_size); - dest += rndis_msg_size; + send_buf_section_idx = hn_chim_alloc(txr->hn_sc); + if (send_buf_section_idx != HN_NVS_CHIM_IDX_INVALID) { + uint8_t *dest = txr->hn_sc->hn_chim + + (send_buf_section_idx * txr->hn_sc->hn_chim_szmax); + + memcpy(dest, pkt, pktlen); + dest += pktlen; m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); - packet->send_buf_section_idx = send_buf_section_idx; - packet->send_buf_section_size = - packet->tot_data_buf_len; - packet->gpa_cnt = 0; + send_buf_section_size = pkt->rm_len; + txr->hn_gpa_cnt = 0; txr->hn_tx_chimney++; goto 
done; } @@ -1037,36 +1217,34 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) } *m_head0 = m_head; - packet->gpa_cnt = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS; + /* +1 RNDIS packet message */ + txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ - packet->gpa[0].gpa_page = atop(txd->rndis_msg_paddr); - packet->gpa[0].gpa_ofs = txd->rndis_msg_paddr & PAGE_MASK; - packet->gpa[0].gpa_len = rndis_msg_size; + txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); + txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; + txr->hn_gpa[0].gpa_len = pktlen; /* - * Fill the page buffers with mbuf info starting at index - * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. + * Fill the page buffers with mbuf info after the page + * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { - struct vmbus_gpa *gpa = &packet->gpa[ - i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; + struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } - packet->send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - packet->send_buf_section_size = 0; + send_buf_section_idx = HN_NVS_CHIM_IDX_INVALID; + send_buf_section_size = 0; done: txd->m = m_head; /* Set the completion routine */ - packet->compl.send.on_send_completion = hn_tx_done; - packet->compl.send.send_completion_context = packet; - packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd; + hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd, + send_buf_section_idx, send_buf_section_size); return 0; } @@ -1086,7 +1264,8 @@ again: * Make sure that txd is not freed before ETHER_BPF_MTAP. 
*/ hn_txdesc_hold(txd); - error = hv_nv_on_send(txr->hn_chan, &txd->netvsc_pkt); + error = hv_nv_on_send(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, + &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt); if (!error) { ETHER_BPF_MTAP(ifp, txd->m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); @@ -1153,6 +1332,9 @@ hn_start_locked(struct hn_tx_ring *txr, int len) KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); + if (__predict_false(txr->hn_suspended)) + return 0; + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return 0; @@ -1202,19 +1384,6 @@ hn_start_locked(struct hn_tx_ring *txr, int len) } /* - * Link up/down notification - */ -void -netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status) -{ - if (status == 1) { - sc->hn_carrier = 1; - } else { - sc->hn_carrier = 0; - } -} - -/* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. @@ -1292,14 +1461,13 @@ hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) * Note: This is no longer used as a callback */ int -netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, - const rndis_tcp_ip_csum_info *csum_info, - const struct rndis_hash_info *hash_info, - const struct rndis_hash_value *hash_value) +hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, + const struct hn_recvinfo *info) { struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; + int hash_type = M_HASHTYPE_OPAQUE; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); @@ -1307,17 +1475,16 @@ netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, /* * Bail out if packet contains more data than configured MTU. 
*/ - if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { + if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); - } else if (packet->tot_data_buf_len <= MHLEN) { + } else if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } - memcpy(mtod(m_new, void *), packet->data, - packet->tot_data_buf_len); - m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; + memcpy(mtod(m_new, void *), data, dlen); + m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* @@ -1327,7 +1494,7 @@ netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; - if (packet->tot_data_buf_len > MCLBYTES) { + if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } @@ -1338,7 +1505,7 @@ netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, return (0); } - hv_m_append(m_new, packet->tot_data_buf_len, packet->data); + hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; @@ -1346,28 +1513,29 @@ netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, do_csum = 0; /* receive side checksum offload */ - if (csum_info != NULL) { + if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ - if (csum_info->receive.ip_csum_succeeded && do_csum) { + if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ - if ((csum_info->receive.tcp_csum_succeeded || - csum_info->receive.udp_csum_succeeded) && do_csum) { + if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | + NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; - if (csum_info->receive.tcp_csum_succeeded) + if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } - if 
(csum_info->receive.ip_csum_succeeded && - csum_info->receive.tcp_csum_succeeded) + if ((info->csum_info & + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { const struct ether_header *eh; @@ -1423,21 +1591,20 @@ netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, } } skip: - if ((packet->vlan_tci != 0) && - (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { - m_new->m_pkthdr.ether_vtag = packet->vlan_tci; + if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { + m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( + NDIS_VLAN_INFO_ID(info->vlan_info), + NDIS_VLAN_INFO_PRI(info->vlan_info), + NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } - if (hash_info != NULL && hash_value != NULL) { - int hash_type = M_HASHTYPE_OPAQUE; - + if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; - m_new->m_pkthdr.flowid = hash_value->hash_value; - if ((hash_info->hash_info & NDIS_HASH_FUNCTION_MASK) == + m_new->m_pkthdr.flowid = info->hash_value; + if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { - uint32_t type = - (hash_info->hash_info & NDIS_HASH_TYPE_MASK); + uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); switch (type) { case NDIS_HASH_IPV4: @@ -1465,14 +1632,10 @@ skip: break; } } - M_HASHTYPE_SET(m_new, hash_type); } else { - if (hash_value != NULL) - m_new->m_pkthdr.flowid = hash_value->hash_value; - else - m_new->m_pkthdr.flowid = rxr->hn_rx_idx; - M_HASHTYPE_SET(m_new, M_HASHTYPE_OPAQUE); + m_new->m_pkthdr.flowid = rxr->hn_rx_idx; } + M_HASHTYPE_SET(m_new, hash_type); /* * Note: Moved RX completion back to hv_nv_on_receive() so all @@ -1502,56 +1665,39 @@ skip: return (0); } -/* - * Rules for using sc->temp_unusable: - * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() - * 2. 
code reading sc->temp_unusable under NV_LOCK(), and finding - * sc->temp_unusable set, must release NV_LOCK() and exit - * 3. to retain exclusive control of the interface, - * sc->temp_unusable must be set by code before releasing NV_LOCK() - * 4. only code setting sc->temp_unusable can clear sc->temp_unusable - * 5. code setting sc->temp_unusable must eventually clear sc->temp_unusable - */ - -/* - * Standard ioctl entry point. Called when the user wants to configure - * the interface. - */ static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - hn_softc_t *sc = ifp->if_softc; + struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; -#ifdef INET - struct ifaddr *ifa = (struct ifaddr *)data; -#endif - netvsc_device_info device_info; int mask, error = 0; - int retry_cnt = 500; - - switch(cmd) { - case SIOCSIFADDR: -#ifdef INET - if (ifa->ifa_addr->sa_family == AF_INET) { - ifp->if_flags |= IFF_UP; - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) - hn_ifinit(sc); - arp_ifinit(ifp, ifa); - } else -#endif - error = ether_ioctl(ifp, cmd, data); - break; + switch (cmd) { case SIOCSIFMTU: - /* Check MTU value change */ - if (ifp->if_mtu == ifr->ifr_mtu) - break; - if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { error = EINVAL; break; } + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { + HN_UNLOCK(sc); + break; + } + + if ((sc->hn_caps & HN_CAP_MTU) == 0) { + /* Can't change MTU */ + HN_UNLOCK(sc); + error = EOPNOTSUPP; + break; + } + + if (ifp->if_mtu == ifr->ifr_mtu) { + HN_UNLOCK(sc); + break; + } + /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; @@ -1560,104 +1706,48 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * Make sure that LRO aggregation length limit is still * valid, after the MTU change. 
*/ - NV_LOCK(sc); if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); - NV_UNLOCK(sc); #endif - do { - NV_LOCK(sc); - if (!sc->temp_unusable) { - sc->temp_unusable = TRUE; - retry_cnt = -1; - } - NV_UNLOCK(sc); - if (retry_cnt > 0) { - retry_cnt--; - DELAY(5 * 1000); - } - } while (retry_cnt > 0); - - if (retry_cnt == 0) { - error = EINVAL; - break; - } - - /* We must remove and add back the device to cause the new - * MTU to take effect. This includes tearing down, but not - * deleting the channel, then bringing it back up. + /* + * Suspend this interface before the synthetic parts + * are ripped. */ - error = hv_rf_on_device_remove(sc, HV_RF_NV_RETAIN_CHANNEL); - if (error) { - NV_LOCK(sc); - sc->temp_unusable = FALSE; - NV_UNLOCK(sc); - break; - } - - /* Wait for subchannels to be destroyed */ - vmbus_subchan_drain(sc->hn_prichan); + hn_suspend(sc); - error = hv_rf_on_device_add(sc, &device_info, - sc->hn_rx_ring_inuse, &sc->hn_rx_ring[0]); - if (error) { - NV_LOCK(sc); - sc->temp_unusable = FALSE; - NV_UNLOCK(sc); - break; - } - KASSERT(sc->hn_rx_ring_cnt == sc->net_dev->num_channel, - ("RX ring count %d and channel count %u mismatch", - sc->hn_rx_ring_cnt, sc->net_dev->num_channel)); - if (sc->net_dev->num_channel > 1) { - int r; + /* + * Detach the synthetics parts, i.e. NVS and RNDIS. + */ + hn_synth_detach(sc); - /* - * Skip the rings on primary channel; they are - * handled by the hv_rf_on_device_add() above. - */ - for (r = 1; r < sc->hn_rx_ring_cnt; ++r) { - sc->hn_rx_ring[r].hn_rx_flags &= - ~HN_RX_FLAG_ATTACHED; - } - for (r = 1; r < sc->hn_tx_ring_cnt; ++r) { - sc->hn_tx_ring[r].hn_tx_flags &= - ~HN_TX_FLAG_ATTACHED; - } - hn_subchan_setup(sc); - } + /* + * Reattach the synthetic parts, i.e. NVS and RNDIS, + * with the new MTU setting. + * XXX check error. 
+ */ + hn_synth_attach(sc, ifr->ifr_mtu); - sc->hn_tx_chimney_max = sc->net_dev->send_section_size; - if (sc->hn_tx_ring[0].hn_tx_chimney_size > - sc->hn_tx_chimney_max) - hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); + if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) + hn_set_chim_size(sc, sc->hn_chim_szmax); + hn_set_tso_maxsize(sc, hn_tso_maxlen, ifr->ifr_mtu); - hn_ifinit_locked(sc); + /* + * All done! Resume the interface now. + */ + hn_resume(sc); - NV_LOCK(sc); - sc->temp_unusable = FALSE; - NV_UNLOCK(sc); + HN_UNLOCK(sc); break; + case SIOCSIFFLAGS: - do { - NV_LOCK(sc); - if (!sc->temp_unusable) { - sc->temp_unusable = TRUE; - retry_cnt = -1; - } - NV_UNLOCK(sc); - if (retry_cnt > 0) { - retry_cnt--; - DELAY(5 * 1000); - } - } while (retry_cnt > 0); - - if (retry_cnt == 0) { - error = EINVAL; - break; - } + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { + HN_UNLOCK(sc); + break; + } if (ifp->if_flags & IFF_UP) { /* @@ -1680,35 +1770,44 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) /* do something here for Hyper-V */ } else #endif - hn_ifinit_locked(sc); + hn_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { hn_stop(sc); } } - NV_LOCK(sc); - sc->temp_unusable = FALSE; - NV_UNLOCK(sc); sc->hn_if_flags = ifp->if_flags; - error = 0; + + HN_UNLOCK(sc); break; - case SIOCSIFCAP: - NV_LOCK(sc); + case SIOCSIFCAP: + HN_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; + if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; - if (ifp->if_capenable & IFCAP_TXCSUM) { - ifp->if_hwassist |= - sc->hn_tx_ring[0].hn_csum_assist; - } else { - ifp->if_hwassist &= - ~sc->hn_tx_ring[0].hn_csum_assist; - } + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); + else + ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); + } + if (mask & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + ifp->if_hwassist |= 
HN_CSUM_IP6_HWASSIST(sc); + else + ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } + /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; +#ifdef foo + /* We can't diff IPv6 packets from IPv4 packets on RX path. */ + if (mask & IFCAP_RXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; +#endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; @@ -1720,7 +1819,6 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) else ifp->if_hwassist &= ~CSUM_IP_TSO; } - if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) @@ -1729,57 +1827,50 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifp->if_hwassist &= ~CSUM_IP6_TSO; } - NV_UNLOCK(sc); - error = 0; + HN_UNLOCK(sc); break; + case SIOCADDMULTI: case SIOCDELMULTI: -#ifdef notyet - /* Fixme: Multicast mode? */ - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - NV_LOCK(sc); - netvsc_setmulti(sc); - NV_UNLOCK(sc); - error = 0; - } -#endif - error = EINVAL; + /* Always all-multi */ + /* + * TODO: + * Enable/disable all-multi according to the emptiness of + * the mcast address list. + */ break; + case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; + default: error = ether_ioctl(ifp, cmd, data); break; } - return (error); } -/* - * - */ static void -hn_stop(hn_softc_t *sc) +hn_stop(struct hn_softc *sc) { - struct ifnet *ifp; - int ret, i; + struct ifnet *ifp = sc->hn_ifp; + int i; - ifp = sc->hn_ifp; + HN_LOCK_ASSERT(sc); - if (bootverbose) - printf(" Closing Device ...\n"); + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("synthetic parts were not attached")); - atomic_clear_int(&ifp->if_drv_flags, - (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); + /* Clear RUNNING bit _before_ hn_suspend_data() */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); + hn_suspend_data(sc); + + /* Clear OACTIVE bit. 
*/ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; - - if_link_state_change(ifp, LINK_STATE_DOWN); - sc->hn_initdone = 0; - - ret = hv_rf_on_close(sc); } /* @@ -1840,59 +1931,43 @@ do_sched: } } -/* - * - */ static void -hn_ifinit_locked(hn_softc_t *sc) +hn_init_locked(struct hn_softc *sc) { - struct ifnet *ifp; - int ret, i; + struct ifnet *ifp = sc->hn_ifp; + int i; - ifp = sc->hn_ifp; + HN_LOCK_ASSERT(sc); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; - } - hv_promisc_mode = 1; - - ret = hv_rf_on_open(sc); - if (ret != 0) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; - } else { - sc->hn_initdone = 1; - } + /* TODO: add hn_rx_filter */ + hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS); + + /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; + /* Clear TX 'suspended' bit. */ + hn_tx_resume(sc, sc->hn_tx_ring_inuse); + + /* Everything is ready; unleash! 
*/ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); - if_link_state_change(ifp, LINK_STATE_UP); } -/* - * - */ static void -hn_ifinit(void *xsc) +hn_init(void *xsc) { - hn_softc_t *sc = xsc; - - NV_LOCK(sc); - if (sc->temp_unusable) { - NV_UNLOCK(sc); - return; - } - sc->temp_unusable = TRUE; - NV_UNLOCK(sc); + struct hn_softc *sc = xsc; - hn_ifinit_locked(sc); - - NV_LOCK(sc); - sc->temp_unusable = FALSE; - NV_UNLOCK(sc); + HN_LOCK(sc); + hn_init_locked(sc); + HN_UNLOCK(sc); } #ifdef LATER @@ -1902,11 +1977,9 @@ hn_ifinit(void *xsc) static void hn_watchdog(struct ifnet *ifp) { - hn_softc_t *sc; - sc = ifp->if_softc; - printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit); - hn_ifinit(sc); /*???*/ + if_printf(ifp, "watchdog timeout -- resetting\n"); + hn_init(ifp->if_softc); /* XXX */ ifp->if_oerrors++; } #endif @@ -1925,13 +1998,15 @@ hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) if (error || req->newptr == NULL) return error; + HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || - lenlim > TCP_LRO_LENGTH_MAX) + lenlim > TCP_LRO_LENGTH_MAX) { + HN_UNLOCK(sc); return EINVAL; - - NV_LOCK(sc); + } hn_set_lro_lenlim(sc, lenlim); - NV_UNLOCK(sc); + HN_UNLOCK(sc); + return 0; } @@ -1958,10 +2033,10 @@ hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) * count limit. 
*/ --ackcnt; - NV_LOCK(sc); + HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; - NV_UNLOCK(sc); + HN_UNLOCK(sc); return 0; } @@ -1982,7 +2057,7 @@ hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) if (error || req->newptr == NULL) return error; - NV_LOCK(sc); + HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; @@ -1991,25 +2066,27 @@ hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) else rxr->hn_trust_hcsum &= ~hcsum; } - NV_UNLOCK(sc); + HN_UNLOCK(sc); return 0; } static int -hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) +hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; - int chimney_size, error; + int chim_size, error; - chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size; - error = sysctl_handle_int(oidp, &chimney_size, 0, req); + chim_size = sc->hn_tx_ring[0].hn_chim_size; + error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; - if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0) + if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; - hn_set_tx_chimney_size(sc, chimney_size); + HN_LOCK(sc); + hn_set_chim_size(sc, chim_size); + HN_UNLOCK(sc); return 0; } @@ -2134,17 +2211,128 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) if (error || req->newptr == NULL) return error; - NV_LOCK(sc); + HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } - NV_UNLOCK(sc); + HN_UNLOCK(sc); return 0; } static int +hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char verstr[16]; + + snprintf(verstr, sizeof(verstr), "%u.%u", + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); +} + +static int +hn_caps_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char caps_str[128]; + uint32_t 
caps; + + HN_LOCK(sc); + caps = sc->hn_caps; + HN_UNLOCK(sc); + snprintf(caps_str, sizeof(caps_str), "%b", caps, + "\020" + "\001VLAN" + "\002MTU" + "\003IPCS" + "\004TCP4CS" + "\005TCP6CS" + "\006UDP4CS" + "\007UDP6CS" + "\010TSO4" + "\011TSO6"); + return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); +} + +static int +hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char assist_str[128]; + uint32_t hwassist; + + HN_LOCK(sc); + hwassist = sc->hn_ifp->if_hwassist; + HN_UNLOCK(sc); + snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); + return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); +} + +static int +hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error; + + HN_LOCK(sc); + + error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); + if (error || req->newptr == NULL) + goto back; + + error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); + if (error) + goto back; + sc->hn_flags |= HN_FLAG_HAS_RSSKEY; + + if (sc->hn_rx_ring_inuse > 1) { + error = hn_rss_reconfig(sc); + } else { + /* Not RSS capable, at least for now; just save the RSS key. */ + error = 0; + } +back: + HN_UNLOCK(sc); + return (error); +} + +static int +hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error; + + HN_LOCK(sc); + + error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); + if (error || req->newptr == NULL) + goto back; + + /* + * Don't allow RSS indirect table change, if this interface is not + * RSS capable currently. 
+ */ + if (sc->hn_rx_ring_inuse == 1) { + error = EOPNOTSUPP; + goto back; + } + + error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); + if (error) + goto back; + sc->hn_flags |= HN_FLAG_HAS_RSSIND; + + hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse); + error = hn_rss_reconfig(sc); +back: + HN_UNLOCK(sc); + return (error); +} + +static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; @@ -2219,7 +2407,7 @@ hn_check_iplen(const struct mbuf *m, int hoff) return ip->ip_p; } -static void +static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; @@ -2232,6 +2420,22 @@ hn_create_rx_data(struct hn_softc *sc, int ring_cnt) #endif int i; + /* + * Create RXBUF for reception. + * + * NOTE: + * - It is shared by all channels. + * - A large enough buffer is allocated, certain version of NVSes + * may further limit the usable space. + */ + sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), + PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->hn_rxbuf == NULL) { + device_printf(sc->hn_dev, "allocate rxbuf failed\n"); + return (ENOMEM); + } + sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; @@ -2243,7 +2447,8 @@ hn_create_rx_data(struct hn_softc *sc, int ring_cnt) lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; - device_printf(dev, "LRO: entry count %d\n", lroent_cnt); + if (bootverbose) + device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ @@ -2257,6 +2462,16 @@ hn_create_rx_data(struct hn_softc *sc, int ring_cnt) for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), + PAGE_SIZE, 0, + NETVSC_DEVICE_RING_BUFFER_SIZE + + NETVSC_DEVICE_RING_BUFFER_SIZE, + &rxr->hn_br_dma, BUS_DMA_WAITOK); + if (rxr->hn_br == NULL) { + device_printf(dev, "allocate 
bufring failed\n"); + return (ENOMEM); + } + if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) @@ -2268,6 +2483,7 @@ hn_create_rx_data(struct hn_softc *sc, int ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); rxr->hn_rx_idx = i; + rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. @@ -2384,6 +2600,8 @@ hn_create_rx_data(struct hn_softc *sc, int ring_cnt) CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); + + return (0); } static void @@ -2391,12 +2609,22 @@ hn_destroy_rx_data(struct hn_softc *sc) { int i; + if (sc->hn_rxbuf != NULL) { + hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); + sc->hn_rxbuf = NULL; + } + if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + if (rxr->hn_br == NULL) + continue; + hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); + rxr->hn_br = NULL; + #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif @@ -2416,7 +2644,6 @@ hn_create_tx_ring(struct hn_softc *sc, int id) device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; - uint32_t version; txr->hn_sc = sc; txr->hn_tx_idx = id; @@ -2455,18 +2682,6 @@ hn_create_tx_ring(struct hn_softc *sc, int id) } txr->hn_direct_tx_size = hn_direct_tx_size; - version = VMBUS_GET_VERSION(device_get_parent(dev), dev); - if (version >= VMBUS_VERSION_WIN8_1) { - txr->hn_csum_assist = HN_CSUM_ASSIST; - } else { - txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8; - if (id == 0) { - device_printf(dev, "bus version %u.%u, " - "no UDP checksum offloading\n", - VMBUS_VERSION_MAJOR(version), - VMBUS_VERSION_MINOR(version)); - } - } /* * Always schedule transmission instead of trying to do direct @@ -2476,16 +2691,16 @@ hn_create_tx_ring(struct hn_softc *sc, int id) parent_dtag = bus_get_dma_tag(dev); - 
/* DMA tag for RNDIS messages. */ + /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ - HN_RNDIS_MSG_ALIGN, /* alignment */ - HN_RNDIS_MSG_BOUNDARY, /* boundary */ + HN_RNDIS_PKT_ALIGN, /* alignment */ + HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ - HN_RNDIS_MSG_LEN, /* maxsize */ + HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ - HN_RNDIS_MSG_LEN, /* maxsegsize */ + HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ @@ -2520,28 +2735,28 @@ hn_create_tx_ring(struct hn_softc *sc, int id) txd->txr = txr; /* - * Allocate and load RNDIS messages. + * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, - (void **)&txd->rndis_msg, - BUS_DMA_WAITOK | BUS_DMA_COHERENT, - &txd->rndis_msg_dmap); + (void **)&txd->rndis_pkt, + BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, + &txd->rndis_pkt_dmap); if (error) { device_printf(dev, - "failed to allocate rndis_msg, %d\n", i); + "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, - txd->rndis_msg_dmap, - txd->rndis_msg, HN_RNDIS_MSG_LEN, - hyperv_dma_map_paddr, &txd->rndis_msg_paddr, + txd->rndis_pkt_dmap, + txd->rndis_pkt, HN_RNDIS_PKT_LEN, + hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, - "failed to load rndis_msg, %d\n", i); + "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, - txd->rndis_msg, txd->rndis_msg_dmap); + txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } @@ -2552,9 +2767,9 @@ hn_create_tx_ring(struct hn_softc *sc, int id) device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, - txd->rndis_msg_dmap); + txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, - txd->rndis_msg, txd->rndis_msg_dmap); + 
txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } @@ -2612,9 +2827,9 @@ hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); - bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); - bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, - txd->rndis_msg_dmap); + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); + bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, + txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } @@ -2666,6 +2881,19 @@ hn_create_tx_data(struct hn_softc *sc, int ring_cnt) struct sysctl_ctx_list *ctx; int i; + /* + * Create TXBUF for chimney sending. + * + * NOTE: It is shared by all channels. + */ + sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), + PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->hn_chim == NULL) { + device_printf(sc->hn_dev, "allocate txbuf failed\n"); + return (ENOMEM); + } + sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; @@ -2715,12 +2943,11 @@ hn_create_tx_data(struct hn_softc *sc, int ring_cnt) CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", - CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, + CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, - hn_tx_chimney_size_sysctl, - "I", "Chimney send packet size limit"); + hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), @@ -2741,14 +2968,75 @@ hn_create_tx_data(struct hn_softc *sc, int ring_cnt) } static void -hn_set_tx_chimney_size(struct hn_softc 
*sc, int chimney_size) +hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; - NV_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) - sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size; - NV_UNLOCK(sc); + sc->hn_tx_ring[i].hn_chim_size = chim_size; +} + +static void +hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) +{ + struct ifnet *ifp = sc->hn_ifp; + int tso_minlen; + + if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) + return; + + KASSERT(sc->hn_ndis_tso_sgmin >= 2, + ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); + tso_minlen = sc->hn_ndis_tso_sgmin * mtu; + + KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && + sc->hn_ndis_tso_szmax <= IP_MAXPACKET, + ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); + + if (tso_maxlen < tso_minlen) + tso_maxlen = tso_minlen; + else if (tso_maxlen > IP_MAXPACKET) + tso_maxlen = IP_MAXPACKET; + if (tso_maxlen > sc->hn_ndis_tso_szmax) + tso_maxlen = sc->hn_ndis_tso_szmax; + ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + if (bootverbose) + if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); +} + +static void +hn_fixup_tx_data(struct hn_softc *sc) +{ + uint64_t csum_assist; + int i; + + hn_set_chim_size(sc, sc->hn_chim_szmax); + if (hn_tx_chimney_size > 0 && + hn_tx_chimney_size < sc->hn_chim_szmax) + hn_set_chim_size(sc, hn_tx_chimney_size); + + csum_assist = 0; + if (sc->hn_caps & HN_CAP_IPCS) + csum_assist |= CSUM_IP; + if (sc->hn_caps & HN_CAP_TCP4CS) + csum_assist |= CSUM_IP_TCP; + if (sc->hn_caps & HN_CAP_UDP4CS) + csum_assist |= CSUM_IP_UDP; +#ifdef notyet + if (sc->hn_caps & HN_CAP_TCP6CS) + csum_assist |= CSUM_IP6_TCP; + if (sc->hn_caps & HN_CAP_UDP6CS) + csum_assist |= CSUM_IP6_UDP; +#endif + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_csum_assist = csum_assist; + + if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { + /* Support HASHVAL pktinfo on TX path. 
*/ + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; + } } static void @@ -2756,6 +3044,11 @@ hn_destroy_tx_data(struct hn_softc *sc) { int i; + if (sc->hn_chim != NULL) { + hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); + sc->hn_chim = NULL; + } + if (sc->hn_tx_ring_cnt == 0) return; @@ -2790,19 +3083,6 @@ hn_start_txeof_taskfunc(void *xtxr, int pending __unused) mtx_unlock(&txr->hn_tx_lock); } -static void -hn_stop_tx_tasks(struct hn_softc *sc) -{ - int i; - - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { - struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; - - taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); - taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); - } -} - static int hn_xmit(struct hn_tx_ring *txr, int len) { @@ -2814,6 +3094,9 @@ hn_xmit(struct hn_tx_ring *txr, int len) KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); + if (__predict_false(txr->hn_suspended)) + return 0; + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return 0; @@ -2900,20 +3183,24 @@ do_sched: } static void +hn_tx_ring_qflush(struct hn_tx_ring *txr) +{ + struct mbuf *m; + + mtx_lock(&txr->hn_tx_lock); + while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) + m_freem(m); + mtx_unlock(&txr->hn_tx_lock); +} + +static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; int i; - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { - struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; - struct mbuf *m; - - mtx_lock(&txr->hn_tx_lock); - while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) - m_freem(m); - mtx_unlock(&txr->hn_tx_lock); - } + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); } @@ -2968,14 +3255,19 @@ hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) mtx_unlock(&txr->hn_tx_lock); } -static void -hn_channel_attach(struct hn_softc *sc, struct vmbus_channel *chan) +static int 
+hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { + struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; - int idx; + struct hn_tx_ring *txr = NULL; + int idx, error; idx = vmbus_chan_subidx(chan); + /* + * Link this channel to RX/TX ring. + */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); @@ -2985,60 +3277,747 @@ hn_channel_attach(struct hn_softc *sc, struct vmbus_channel *chan) rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; if (bootverbose) { - if_printf(sc->hn_ifp, "link RX ring %d to channel%u\n", + if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { - struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; - + txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { - if_printf(sc->hn_ifp, "link TX ring %d to channel%u\n", + if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } - /* Bind channel to a proper CPU */ + /* Bind this channel to a proper CPU. 
*/ vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); + + /* + * Open this channel + */ + cbr.cbr = rxr->hn_br; + cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; + cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE; + cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE; + error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); + if (error) { + if_printf(sc->hn_ifp, "open chan%u failed: %d\n", + vmbus_chan_id(chan), error); + rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; + if (txr != NULL) + txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; + } + return (error); } static void -hn_subchan_attach(struct hn_softc *sc, struct vmbus_channel *chan) +hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { + struct hn_rx_ring *rxr; + int idx; + + idx = vmbus_chan_subidx(chan); + + /* + * Link this channel to RX/TX ring. + */ + KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, + ("invalid channel index %d, should > 0 && < %d", + idx, sc->hn_rx_ring_inuse)); + rxr = &sc->hn_rx_ring[idx]; + KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), + ("RX ring %d is not attached", idx)); + rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; + + if (idx < sc->hn_tx_ring_inuse) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; + + KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), + ("TX ring %d is not attached attached", idx)); + txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; + } - KASSERT(!vmbus_chan_is_primary(chan), - ("subchannel callback on primary channel")); - hn_channel_attach(sc, chan); + /* + * Close this channel. + * + * NOTE: + * Channel closing does _not_ destroy the target channel. + */ + vmbus_chan_close(chan); } -static void -hn_subchan_setup(struct hn_softc *sc) +static int +hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; - int subchan_cnt = sc->net_dev->num_channel - 1; - int i; + int subchan_cnt = sc->hn_rx_ring_inuse - 1; + int i, error = 0; - /* Wait for sub-channels setup to complete. 
*/ - subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); + if (subchan_cnt == 0) + return (0); /* Attach the sub-channels. */ + subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { - struct vmbus_channel *subchan = subchans[i]; + error = hn_chan_attach(sc, subchans[i]); + if (error) + break; + } + vmbus_subchan_rel(subchans, subchan_cnt); - /* NOTE: Calling order is critical. */ - hn_subchan_attach(sc, subchan); - hv_nv_subchan_attach(subchan, - &sc->hn_rx_ring[vmbus_chan_subidx(subchan)]); + if (error) { + if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); + } else { + if (bootverbose) { + if_printf(sc->hn_ifp, "%d sub-channels attached\n", + subchan_cnt); + } } + return (error); +} + +static void +hn_detach_allchans(struct hn_softc *sc) +{ + struct vmbus_channel **subchans; + int subchan_cnt = sc->hn_rx_ring_inuse - 1; + int i; + + if (subchan_cnt == 0) + goto back; - /* Release the sub-channels */ + /* Detach the sub-channels. */ + subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); + for (i = 0; i < subchan_cnt; ++i) + hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); - if_printf(sc->hn_ifp, "%d sub-channels setup done\n", subchan_cnt); + +back: + /* + * Detach the primary channel, _after_ all sub-channels + * are detached. + */ + hn_chan_detach(sc, sc->hn_prichan); + + /* Wait for sub-channels to be destroyed, if any. 
*/ + vmbus_subchan_drain(sc->hn_prichan); + +#ifdef INVARIANTS + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + KASSERT((sc->hn_rx_ring[i].hn_rx_flags & + HN_RX_FLAG_ATTACHED) == 0, + ("%dth RX ring is still attached", i)); + } + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + KASSERT((sc->hn_tx_ring[i].hn_tx_flags & + HN_TX_FLAG_ATTACHED) == 0, + ("%dth TX ring is still attached", i)); + } +#endif +} + +static int +hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) +{ + struct vmbus_channel **subchans; + int nchan, rxr_cnt, error; + + nchan = *nsubch + 1; + if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30 || nchan == 1) { + /* + * Either RSS is not supported, or multiple RX/TX rings + * are not requested. + */ + *nsubch = 0; + return (0); + } + + /* + * Get RSS capabilities, e.g. # of RX rings, and # of indirect + * table entries. + */ + error = hn_rndis_get_rsscaps(sc, &rxr_cnt); + if (error) { + /* No RSS; this is benign. */ + *nsubch = 0; + return (0); + } + if (bootverbose) { + if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", + rxr_cnt, nchan); + } + + if (nchan > rxr_cnt) + nchan = rxr_cnt; + if (nchan == 1) { + if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); + *nsubch = 0; + return (0); + } + + /* + * Allocate sub-channels from NVS. + */ + *nsubch = nchan - 1; + error = hn_nvs_alloc_subchans(sc, nsubch); + if (error || *nsubch == 0) { + /* Failed to allocate sub-channels. */ + *nsubch = 0; + return (0); + } + + /* + * Wait for all sub-channels to become ready before moving on. + */ + subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); + vmbus_subchan_rel(subchans, *nsubch); + return (0); +} + +static int +hn_synth_attach(struct hn_softc *sc, int mtu) +{ + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + int error, nsubch, nchan, i; + uint32_t old_caps; + + KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, + ("synthetic parts were attached")); + + /* Save capabilities for later verification. 
*/ + old_caps = sc->hn_caps; + sc->hn_caps = 0; + + /* + * Attach the primary channel _before_ attaching NVS and RNDIS. + */ + error = hn_chan_attach(sc, sc->hn_prichan); + if (error) + return (error); + + /* + * Attach NVS. + */ + error = hn_nvs_attach(sc, mtu); + if (error) + return (error); + + /* + * Attach RNDIS _after_ NVS is attached. + */ + error = hn_rndis_attach(sc, mtu); + if (error) + return (error); + + /* + * Make sure capabilities are not changed. + */ + if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { + if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", + old_caps, sc->hn_caps); + /* Restore old capabilities and abort. */ + sc->hn_caps = old_caps; + return ENXIO; + } + + /* + * Allocate sub-channels for multi-TX/RX rings. + * + * NOTE: + * The # of RX rings that can be used is equivalent to the # of + * channels to be requested. + */ + nsubch = sc->hn_rx_ring_cnt - 1; + error = hn_synth_alloc_subchans(sc, &nsubch); + if (error) + return (error); + + nchan = nsubch + 1; + if (nchan == 1) { + /* Only the primary channel can be used; done */ + goto back; + } + + /* + * Configure RSS key and indirect table _after_ all sub-channels + * are allocated. + */ + + if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { + /* + * RSS key is not set yet; set it to the default RSS key. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "setup default RSS key\n"); + memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); + sc->hn_flags |= HN_FLAG_HAS_RSSKEY; + } + + if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { + /* + * RSS indirect table is not set yet; set it up in round- + * robin fashion. + */ + if (bootverbose) { + if_printf(sc->hn_ifp, "setup default RSS indirect " + "table\n"); + } + /* TODO: Take ndis_rss_caps.ndis_nind into account. 
*/ + for (i = 0; i < NDIS_HASH_INDCNT; ++i) + rss->rss_ind[i] = i % nchan; + sc->hn_flags |= HN_FLAG_HAS_RSSIND; + } else { + /* + * # of usable channels may be changed, so we have to + * make sure that all entries in RSS indirect table + * are valid. + */ + hn_rss_ind_fixup(sc, nchan); + } + + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); + if (error) { + /* + * Failed to configure RSS key or indirect table; only + * the primary channel can be used. + */ + nchan = 1; + } +back: + /* + * Set the # of TX/RX rings that could be used according to + * the # of channels that NVS offered. + */ + hn_set_ring_inuse(sc, nchan); + + /* + * Attach the sub-channels, if any. + */ + error = hn_attach_subchans(sc); + if (error) + return (error); + + sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; + return (0); +} + +/* + * NOTE: + * The interface must have been suspended though hn_suspend(), before + * this function get called. + */ +static void +hn_synth_detach(struct hn_softc *sc) +{ + HN_LOCK_ASSERT(sc); + + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("synthetic parts were not attached")); + + /* Detach the RNDIS first. */ + hn_rndis_detach(sc); + + /* Detach NVS. */ + hn_nvs_detach(sc); + + /* Detach all of the channels. 
*/ + hn_detach_allchans(sc); + + sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; +} + +static void +hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) +{ + KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, + ("invalid ring count %d", ring_cnt)); + + if (sc->hn_tx_ring_cnt > ring_cnt) + sc->hn_tx_ring_inuse = ring_cnt; + else + sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; + sc->hn_rx_ring_inuse = ring_cnt; + + if (bootverbose) { + if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", + sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); + } +} + +static void +hn_rx_drain(struct vmbus_channel *chan) +{ + + while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan)) + pause("waitch", 1); + vmbus_chan_intr_drain(chan); +} + +static void +hn_suspend_data(struct hn_softc *sc) +{ + struct vmbus_channel **subch = NULL; + int i, nsubch; + + HN_LOCK_ASSERT(sc); + + /* + * Suspend TX. + */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_suspended = 1; + mtx_unlock(&txr->hn_tx_lock); + /* No one is able send more packets now. */ + + /* Wait for all pending sends to finish. */ + while (hn_tx_ring_pending(txr)) + pause("hnwtx", 1 /* 1 tick */); + + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); + } + + /* + * Disable RX by clearing RX filter. + */ + hn_rndis_set_rxfilter(sc, 0); + + /* + * Give RNDIS enough time to flush all pending data packets. + */ + pause("waitrx", (200 * hz) / 1000); + + /* + * Drain RX/TX bufrings and interrupts. 
+ */ + nsubch = sc->hn_rx_ring_inuse - 1; + if (nsubch > 0) + subch = vmbus_subchan_get(sc->hn_prichan, nsubch); + + if (subch != NULL) { + for (i = 0; i < nsubch; ++i) + hn_rx_drain(subch[i]); + } + hn_rx_drain(sc->hn_prichan); + + if (subch != NULL) + vmbus_subchan_rel(subch, nsubch); +} + +static void +hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) +{ + + ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; +} + +static void +hn_suspend_mgmt(struct hn_softc *sc) +{ + struct task task; + + HN_LOCK_ASSERT(sc); + + /* + * Make sure that hn_mgmt_taskq0 can nolonger be accessed + * through hn_mgmt_taskq. + */ + TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); + vmbus_chan_run_task(sc->hn_prichan, &task); + + /* + * Make sure that all pending management tasks are completed. + */ + taskqueue_drain_all(sc->hn_mgmt_taskq0); +} + +static void +hn_suspend(struct hn_softc *sc) +{ + + if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_suspend_data(sc); + hn_suspend_mgmt(sc); +} + +static void +hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt) +{ + int i; + + KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, + ("invalid TX ring count %d", tx_ring_cnt)); + + for (i = 0; i < tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_suspended = 0; + mtx_unlock(&txr->hn_tx_lock); + } +} + +static void +hn_resume_data(struct hn_softc *sc) +{ + int i; + + HN_LOCK_ASSERT(sc); + + /* + * Re-enable RX. + * TODO: add hn_rx_filter. + */ + hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS); + + /* + * Make sure to clear suspend status on "all" TX rings, + * since hn_tx_ring_inuse can be changed after + * hn_suspend_data(). + */ + hn_tx_resume(sc, sc->hn_tx_ring_cnt); + + if (!hn_use_if_start) { + /* + * Flush unused drbrs, since hn_tx_ring_inuse may be + * reduced. + */ + for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) + hn_tx_ring_qflush(&sc->hn_tx_ring[i]); + } + + /* + * Kick start TX. 
+ */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + /* + * Use txeof task, so that any pending oactive can be + * cleared properly. + */ + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +static void +hn_resume_mgmt(struct hn_softc *sc) +{ + + /* + * Kick off link status check. + */ + sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; + hn_link_status_update(sc); +} + +static void +hn_resume(struct hn_softc *sc) +{ + + if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_resume_data(sc); + hn_resume_mgmt(sc); +} + +static void +hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) +{ + const struct hn_nvs_hdr *hdr; + + if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { + if_printf(sc->hn_ifp, "invalid nvs notify\n"); + return; + } + hdr = VMBUS_CHANPKT_CONST_DATA(pkt); + + if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { + /* Useless; ignore */ + return; + } + if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); +} + +static void +hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkt) +{ + struct hn_send_ctx *sndc; + + sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid; + sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), + VMBUS_CHANPKT_DATALEN(pkt)); + /* + * NOTE: + * 'sndc' CAN NOT be accessed anymore, since it can be freed by + * its callback. + */ +} + +static void +hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, + struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) +{ + const struct vmbus_chanpkt_rxbuf *pkt; + const struct hn_nvs_hdr *nvs_hdr; + int count, i, hlen; + + if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { + if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); + return; + } + nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); + + /* Make sure that this is a RNDIS message. 
*/ + if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { + if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", + nvs_hdr->nvs_type); + return; + } + + hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); + if (__predict_false(hlen < sizeof(*pkt))) { + if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); + return; + } + pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; + + if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { + if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", + pkt->cp_rxbuf_id); + return; + } + + count = pkt->cp_rxbuf_cnt; + if (__predict_false(hlen < + __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { + if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); + return; + } + + /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ + for (i = 0; i < count; ++i) { + int ofs, len; + + ofs = pkt->cp_rxbuf[i].rb_ofs; + len = pkt->cp_rxbuf[i].rb_len; + if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) { + if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " + "ofs %d, len %d\n", i, ofs, len); + continue; + } + hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len); + } + + /* + * Moved completion call back here so that all received + * messages (not just data messages) will trigger a response + * message back to the host. + */ + hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid); +} + +/* + * Net VSC on receive completion + * + * Send a receive completion packet to RNDIS device (ie NetVsp) + */ +static void +hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid) +{ + struct hn_nvs_rndis_ack ack; + int retries = 0; + int ret = 0; + + ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; + ack.nvs_status = HN_NVS_STATUS_OK; + +retry_send_cmplt: + /* Send the completion */ + ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, + VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); + if (ret == 0) { + /* success */ + /* no-op */ + } else if (ret == EAGAIN) { + /* no more room... 
wait a bit and attempt to retry 3 times */ + retries++; + + if (retries < 4) { + DELAY(100); + goto retry_send_cmplt; + } + } +} + +static void +hn_chan_callback(struct vmbus_channel *chan, void *xrxr) +{ + struct hn_rx_ring *rxr = xrxr; + struct hn_softc *sc = rxr->hn_ifp->if_softc; + void *buffer; + int bufferlen = NETVSC_PACKET_SIZE; + + buffer = rxr->hn_rdbuf; + do { + struct vmbus_chanpkt_hdr *pkt = buffer; + uint32_t bytes_rxed; + int ret; + + bytes_rxed = bufferlen; + ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); + if (ret == 0) { + switch (pkt->cph_type) { + case VMBUS_CHANPKT_TYPE_COMP: + hn_nvs_handle_comp(sc, chan, pkt); + break; + case VMBUS_CHANPKT_TYPE_RXBUF: + hn_nvs_handle_rxbuf(sc, rxr, chan, pkt); + break; + case VMBUS_CHANPKT_TYPE_INBAND: + hn_nvs_handle_notify(sc, pkt); + break; + default: + if_printf(rxr->hn_ifp, + "unknown chan pkt %u\n", + pkt->cph_type); + break; + } + } else if (ret == ENOBUFS) { + /* Handle large packet */ + if (bufferlen > NETVSC_PACKET_SIZE) { + free(buffer, M_NETVSC); + buffer = NULL; + } + + /* alloc new buffer */ + buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT); + if (buffer == NULL) { + if_printf(rxr->hn_ifp, + "hv_cb malloc buffer failed, len=%u\n", + bytes_rxed); + bufferlen = 0; + break; + } + bufferlen = bytes_rxed; + } else { + /* No more packets */ + break; + } + } while (1); + + if (bufferlen > NETVSC_PACKET_SIZE) + free(buffer, M_NETVSC); + + hv_rf_channel_rollup(rxr, rxr->hn_txr); } static void @@ -3088,7 +4067,7 @@ static device_method_t netvsc_methods[] = { static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, - sizeof(hn_softc_t) + sizeof(struct hn_softc) }; static devclass_t netvsc_devclass; diff --git a/sys/dev/hyperv/netvsc/hv_rndis.h b/sys/dev/hyperv/netvsc/hv_rndis.h deleted file mode 100644 index da2b408..0000000 --- a/sys/dev/hyperv/netvsc/hv_rndis.h +++ /dev/null @@ -1,1101 +0,0 @@ -/*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. - * Copyright (c) 2010-2012 Citrix Inc. 
- * Copyright (c) 2012 NetApp Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef __HV_RNDIS_H__ -#define __HV_RNDIS_H__ - - -/* - * NDIS protocol version numbers - */ -#define NDIS_VERSION_5_0 0x00050000 -#define NDIS_VERSION_5_1 0x00050001 -#define NDIS_VERSION_6_0 0x00060000 -#define NDIS_VERSION_6_1 0x00060001 -#define NDIS_VERSION_6_30 0x0006001e - -#define NDIS_VERSION (NDIS_VERSION_5_1) - -/* - * Status codes - */ - -#define STATUS_SUCCESS (0x00000000L) -#define STATUS_UNSUCCESSFUL (0xC0000001L) -#define STATUS_PENDING (0x00000103L) -#define STATUS_INSUFFICIENT_RESOURCES (0xC000009AL) -#define STATUS_BUFFER_OVERFLOW (0x80000005L) -#define STATUS_NOT_SUPPORTED (0xC00000BBL) - -#define RNDIS_STATUS_SUCCESS (STATUS_SUCCESS) -#define RNDIS_STATUS_PENDING (STATUS_PENDING) -#define RNDIS_STATUS_NOT_RECOGNIZED (0x00010001L) -#define RNDIS_STATUS_NOT_COPIED (0x00010002L) -#define RNDIS_STATUS_NOT_ACCEPTED (0x00010003L) -#define RNDIS_STATUS_CALL_ACTIVE (0x00010007L) - -#define RNDIS_STATUS_ONLINE (0x40010003L) -#define RNDIS_STATUS_RESET_START (0x40010004L) -#define RNDIS_STATUS_RESET_END (0x40010005L) -#define RNDIS_STATUS_RING_STATUS (0x40010006L) -#define RNDIS_STATUS_CLOSED (0x40010007L) -#define RNDIS_STATUS_WAN_LINE_UP (0x40010008L) -#define RNDIS_STATUS_WAN_LINE_DOWN (0x40010009L) -#define RNDIS_STATUS_WAN_FRAGMENT (0x4001000AL) -#define RNDIS_STATUS_MEDIA_CONNECT (0x4001000BL) -#define RNDIS_STATUS_MEDIA_DISCONNECT (0x4001000CL) -#define RNDIS_STATUS_HARDWARE_LINE_UP (0x4001000DL) -#define RNDIS_STATUS_HARDWARE_LINE_DOWN (0x4001000EL) -#define RNDIS_STATUS_INTERFACE_UP (0x4001000FL) -#define RNDIS_STATUS_INTERFACE_DOWN (0x40010010L) -#define RNDIS_STATUS_MEDIA_BUSY (0x40010011L) -#define RNDIS_STATUS_MEDIA_SPECIFIC_INDICATION (0x40010012L) -#define RNDIS_STATUS_WW_INDICATION RNDIS_STATUS_MEDIA_SPECIFIC_INDICATION -#define RNDIS_STATUS_LINK_SPEED_CHANGE (0x40010013L) - -#define RNDIS_STATUS_NOT_RESETTABLE (0x80010001L) -#define RNDIS_STATUS_SOFT_ERRORS (0x80010003L) -#define RNDIS_STATUS_HARD_ERRORS 
(0x80010004L) -#define RNDIS_STATUS_BUFFER_OVERFLOW (STATUS_BUFFER_OVERFLOW) - -#define RNDIS_STATUS_FAILURE (STATUS_UNSUCCESSFUL) -#define RNDIS_STATUS_RESOURCES (STATUS_INSUFFICIENT_RESOURCES) -#define RNDIS_STATUS_CLOSING (0xC0010002L) -#define RNDIS_STATUS_BAD_VERSION (0xC0010004L) -#define RNDIS_STATUS_BAD_CHARACTERISTICS (0xC0010005L) -#define RNDIS_STATUS_ADAPTER_NOT_FOUND (0xC0010006L) -#define RNDIS_STATUS_OPEN_FAILED (0xC0010007L) -#define RNDIS_STATUS_DEVICE_FAILED (0xC0010008L) -#define RNDIS_STATUS_MULTICAST_FULL (0xC0010009L) -#define RNDIS_STATUS_MULTICAST_EXISTS (0xC001000AL) -#define RNDIS_STATUS_MULTICAST_NOT_FOUND (0xC001000BL) -#define RNDIS_STATUS_REQUEST_ABORTED (0xC001000CL) -#define RNDIS_STATUS_RESET_IN_PROGRESS (0xC001000DL) -#define RNDIS_STATUS_CLOSING_INDICATING (0xC001000EL) -#define RNDIS_STATUS_NOT_SUPPORTED (STATUS_NOT_SUPPORTED) -#define RNDIS_STATUS_INVALID_PACKET (0xC001000FL) -#define RNDIS_STATUS_OPEN_LIST_FULL (0xC0010010L) -#define RNDIS_STATUS_ADAPTER_NOT_READY (0xC0010011L) -#define RNDIS_STATUS_ADAPTER_NOT_OPEN (0xC0010012L) -#define RNDIS_STATUS_NOT_INDICATING (0xC0010013L) -#define RNDIS_STATUS_INVALID_LENGTH (0xC0010014L) -#define RNDIS_STATUS_INVALID_DATA (0xC0010015L) -#define RNDIS_STATUS_BUFFER_TOO_SHORT (0xC0010016L) -#define RNDIS_STATUS_INVALID_OID (0xC0010017L) -#define RNDIS_STATUS_ADAPTER_REMOVED (0xC0010018L) -#define RNDIS_STATUS_UNSUPPORTED_MEDIA (0xC0010019L) -#define RNDIS_STATUS_GROUP_ADDRESS_IN_USE (0xC001001AL) -#define RNDIS_STATUS_FILE_NOT_FOUND (0xC001001BL) -#define RNDIS_STATUS_ERROR_READING_FILE (0xC001001CL) -#define RNDIS_STATUS_ALREADY_MAPPED (0xC001001DL) -#define RNDIS_STATUS_RESOURCE_CONFLICT (0xC001001EL) -#define RNDIS_STATUS_NO_CABLE (0xC001001FL) - -#define RNDIS_STATUS_INVALID_SAP (0xC0010020L) -#define RNDIS_STATUS_SAP_IN_USE (0xC0010021L) -#define RNDIS_STATUS_INVALID_ADDRESS (0xC0010022L) -#define RNDIS_STATUS_VC_NOT_ACTIVATED (0xC0010023L) -#define RNDIS_STATUS_DEST_OUT_OF_ORDER 
(0xC0010024L) -#define RNDIS_STATUS_VC_NOT_AVAILABLE (0xC0010025L) -#define RNDIS_STATUS_CELLRATE_NOT_AVAILABLE (0xC0010026L) -#define RNDIS_STATUS_INCOMPATABLE_QOS (0xC0010027L) -#define RNDIS_STATUS_AAL_PARAMS_UNSUPPORTED (0xC0010028L) -#define RNDIS_STATUS_NO_ROUTE_TO_DESTINATION (0xC0010029L) - -#define RNDIS_STATUS_TOKEN_RING_OPEN_ERROR (0xC0011000L) - - -/* - * Object Identifiers used by NdisRequest Query/Set Information - */ - -/* - * General Objects - */ - -#define RNDIS_OID_GEN_SUPPORTED_LIST 0x00010101 -#define RNDIS_OID_GEN_HARDWARE_STATUS 0x00010102 -#define RNDIS_OID_GEN_MEDIA_SUPPORTED 0x00010103 -#define RNDIS_OID_GEN_MEDIA_IN_USE 0x00010104 -#define RNDIS_OID_GEN_MAXIMUM_LOOKAHEAD 0x00010105 -#define RNDIS_OID_GEN_MAXIMUM_FRAME_SIZE 0x00010106 -#define RNDIS_OID_GEN_LINK_SPEED 0x00010107 -#define RNDIS_OID_GEN_TRANSMIT_BUFFER_SPACE 0x00010108 -#define RNDIS_OID_GEN_RECEIVE_BUFFER_SPACE 0x00010109 -#define RNDIS_OID_GEN_TRANSMIT_BLOCK_SIZE 0x0001010A -#define RNDIS_OID_GEN_RECEIVE_BLOCK_SIZE 0x0001010B -#define RNDIS_OID_GEN_VENDOR_ID 0x0001010C -#define RNDIS_OID_GEN_VENDOR_DESCRIPTION 0x0001010D -#define RNDIS_OID_GEN_CURRENT_PACKET_FILTER 0x0001010E -#define RNDIS_OID_GEN_CURRENT_LOOKAHEAD 0x0001010F -#define RNDIS_OID_GEN_DRIVER_VERSION 0x00010110 -#define RNDIS_OID_GEN_MAXIMUM_TOTAL_SIZE 0x00010111 -#define RNDIS_OID_GEN_PROTOCOL_OPTIONS 0x00010112 -#define RNDIS_OID_GEN_MAC_OPTIONS 0x00010113 -#define RNDIS_OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 -#define RNDIS_OID_GEN_MAXIMUM_SEND_PACKETS 0x00010115 -#define RNDIS_OID_GEN_VENDOR_DRIVER_VERSION 0x00010116 -#define RNDIS_OID_GEN_NETWORK_LAYER_ADDRESSES 0x00010118 -#define RNDIS_OID_GEN_TRANSPORT_HEADER_OFFSET 0x00010119 -#define RNDIS_OID_GEN_MACHINE_NAME 0x0001021A -#define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B - -/* - * For receive side scale - */ -/* Query only */ -#define RNDIS_OID_GEN_RSS_CAPABILITIES 0x00010203 -/* Query and set */ -#define RNDIS_OID_GEN_RSS_PARAMETERS 0x00010204 
- -#define RNDIS_OID_GEN_XMIT_OK 0x00020101 -#define RNDIS_OID_GEN_RCV_OK 0x00020102 -#define RNDIS_OID_GEN_XMIT_ERROR 0x00020103 -#define RNDIS_OID_GEN_RCV_ERROR 0x00020104 -#define RNDIS_OID_GEN_RCV_NO_BUFFER 0x00020105 - -#define RNDIS_OID_GEN_DIRECTED_BYTES_XMIT 0x00020201 -#define RNDIS_OID_GEN_DIRECTED_FRAMES_XMIT 0x00020202 -#define RNDIS_OID_GEN_MULTICAST_BYTES_XMIT 0x00020203 -#define RNDIS_OID_GEN_MULTICAST_FRAMES_XMIT 0x00020204 -#define RNDIS_OID_GEN_BROADCAST_BYTES_XMIT 0x00020205 -#define RNDIS_OID_GEN_BROADCAST_FRAMES_XMIT 0x00020206 -#define RNDIS_OID_GEN_DIRECTED_BYTES_RCV 0x00020207 -#define RNDIS_OID_GEN_DIRECTED_FRAMES_RCV 0x00020208 -#define RNDIS_OID_GEN_MULTICAST_BYTES_RCV 0x00020209 -#define RNDIS_OID_GEN_MULTICAST_FRAMES_RCV 0x0002020A -#define RNDIS_OID_GEN_BROADCAST_BYTES_RCV 0x0002020B -#define RNDIS_OID_GEN_BROADCAST_FRAMES_RCV 0x0002020C - -#define RNDIS_OID_GEN_RCV_CRC_ERROR 0x0002020D -#define RNDIS_OID_GEN_TRANSMIT_QUEUE_LENGTH 0x0002020E - -#define RNDIS_OID_GEN_GET_TIME_CAPS 0x0002020F -#define RNDIS_OID_GEN_GET_NETCARD_TIME 0x00020210 - -/* - * These are connection-oriented general OIDs. - * These replace the above OIDs for connection-oriented media. 
- */ -#define RNDIS_OID_GEN_CO_SUPPORTED_LIST 0x00010101 -#define RNDIS_OID_GEN_CO_HARDWARE_STATUS 0x00010102 -#define RNDIS_OID_GEN_CO_MEDIA_SUPPORTED 0x00010103 -#define RNDIS_OID_GEN_CO_MEDIA_IN_USE 0x00010104 -#define RNDIS_OID_GEN_CO_LINK_SPEED 0x00010105 -#define RNDIS_OID_GEN_CO_VENDOR_ID 0x00010106 -#define RNDIS_OID_GEN_CO_VENDOR_DESCRIPTION 0x00010107 -#define RNDIS_OID_GEN_CO_DRIVER_VERSION 0x00010108 -#define RNDIS_OID_GEN_CO_PROTOCOL_OPTIONS 0x00010109 -#define RNDIS_OID_GEN_CO_MAC_OPTIONS 0x0001010A -#define RNDIS_OID_GEN_CO_MEDIA_CONNECT_STATUS 0x0001010B -#define RNDIS_OID_GEN_CO_VENDOR_DRIVER_VERSION 0x0001010C -#define RNDIS_OID_GEN_CO_MINIMUM_LINK_SPEED 0x0001010D - -#define RNDIS_OID_GEN_CO_GET_TIME_CAPS 0x00010201 -#define RNDIS_OID_GEN_CO_GET_NETCARD_TIME 0x00010202 - -/* - * These are connection-oriented statistics OIDs. - */ -#define RNDIS_OID_GEN_CO_XMIT_PDUS_OK 0x00020101 -#define RNDIS_OID_GEN_CO_RCV_PDUS_OK 0x00020102 -#define RNDIS_OID_GEN_CO_XMIT_PDUS_ERROR 0x00020103 -#define RNDIS_OID_GEN_CO_RCV_PDUS_ERROR 0x00020104 -#define RNDIS_OID_GEN_CO_RCV_PDUS_NO_BUFFER 0x00020105 - - -#define RNDIS_OID_GEN_CO_RCV_CRC_ERROR 0x00020201 -#define RNDIS_OID_GEN_CO_TRANSMIT_QUEUE_LENGTH 0x00020202 -#define RNDIS_OID_GEN_CO_BYTES_XMIT 0x00020203 -#define RNDIS_OID_GEN_CO_BYTES_RCV 0x00020204 -#define RNDIS_OID_GEN_CO_BYTES_XMIT_OUTSTANDING 0x00020205 -#define RNDIS_OID_GEN_CO_NETCARD_LOAD 0x00020206 - -/* - * These are objects for Connection-oriented media call-managers. 
- */ -#define RNDIS_OID_CO_ADD_PVC 0xFF000001 -#define RNDIS_OID_CO_DELETE_PVC 0xFF000002 -#define RNDIS_OID_CO_GET_CALL_INFORMATION 0xFF000003 -#define RNDIS_OID_CO_ADD_ADDRESS 0xFF000004 -#define RNDIS_OID_CO_DELETE_ADDRESS 0xFF000005 -#define RNDIS_OID_CO_GET_ADDRESSES 0xFF000006 -#define RNDIS_OID_CO_ADDRESS_CHANGE 0xFF000007 -#define RNDIS_OID_CO_SIGNALING_ENABLED 0xFF000008 -#define RNDIS_OID_CO_SIGNALING_DISABLED 0xFF000009 - - -/* - * 802.3 Objects (Ethernet) - */ - -#define RNDIS_OID_802_3_PERMANENT_ADDRESS 0x01010101 -#define RNDIS_OID_802_3_CURRENT_ADDRESS 0x01010102 -#define RNDIS_OID_802_3_MULTICAST_LIST 0x01010103 -#define RNDIS_OID_802_3_MAXIMUM_LIST_SIZE 0x01010104 -#define RNDIS_OID_802_3_MAC_OPTIONS 0x01010105 - -/* - * - */ -#define NDIS_802_3_MAC_OPTION_PRIORITY 0x00000001 - -#define RNDIS_OID_802_3_RCV_ERROR_ALIGNMENT 0x01020101 -#define RNDIS_OID_802_3_XMIT_ONE_COLLISION 0x01020102 -#define RNDIS_OID_802_3_XMIT_MORE_COLLISIONS 0x01020103 - -#define RNDIS_OID_802_3_XMIT_DEFERRED 0x01020201 -#define RNDIS_OID_802_3_XMIT_MAX_COLLISIONS 0x01020202 -#define RNDIS_OID_802_3_RCV_OVERRUN 0x01020203 -#define RNDIS_OID_802_3_XMIT_UNDERRUN 0x01020204 -#define RNDIS_OID_802_3_XMIT_HEARTBEAT_FAILURE 0x01020205 -#define RNDIS_OID_802_3_XMIT_TIMES_CRS_LOST 0x01020206 -#define RNDIS_OID_802_3_XMIT_LATE_COLLISIONS 0x01020207 - - -/* - * RNDIS MP custom OID for test - */ -#define OID_RNDISMP_GET_RECEIVE_BUFFERS 0xFFA0C90D // Query only - - -/* - * Remote NDIS message types - */ -#define REMOTE_NDIS_PACKET_MSG 0x00000001 -#define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 -#define REMOTE_NDIS_HALT_MSG 0x00000003 -#define REMOTE_NDIS_QUERY_MSG 0x00000004 -#define REMOTE_NDIS_SET_MSG 0x00000005 -#define REMOTE_NDIS_RESET_MSG 0x00000006 -#define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 -#define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 - -#define REMOTE_CONDIS_MP_CREATE_VC_MSG 0x00008001 -#define REMOTE_CONDIS_MP_DELETE_VC_MSG 0x00008002 -#define 
REMOTE_CONDIS_MP_ACTIVATE_VC_MSG 0x00008005 -#define REMOTE_CONDIS_MP_DEACTIVATE_VC_MSG 0x00008006 -#define REMOTE_CONDIS_INDICATE_STATUS_MSG 0x00008007 - -/* - * Remote NDIS message completion types - */ -#define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 -#define REMOTE_NDIS_QUERY_CMPLT 0x80000004 -#define REMOTE_NDIS_SET_CMPLT 0x80000005 -#define REMOTE_NDIS_RESET_CMPLT 0x80000006 -#define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 - -#define REMOTE_CONDIS_MP_CREATE_VC_CMPLT 0x80008001 -#define REMOTE_CONDIS_MP_DELETE_VC_CMPLT 0x80008002 -#define REMOTE_CONDIS_MP_ACTIVATE_VC_CMPLT 0x80008005 -#define REMOTE_CONDIS_MP_DEACTIVATE_VC_CMPLT 0x80008006 - -/* - * Reserved message type for private communication between lower-layer - * host driver and remote device, if necessary. - */ -#define REMOTE_NDIS_BUS_MSG 0xff000001 - -/* - * Defines for DeviceFlags in rndis_initialize_complete - */ -#define RNDIS_DF_CONNECTIONLESS 0x00000001 -#define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 -#define RNDIS_DF_RAW_DATA 0x00000004 - -/* - * Remote NDIS medium types. - */ -#define RNDIS_MEDIUM_802_3 0x00000000 -#define RNDIS_MEDIUM_802_5 0x00000001 -#define RNDIS_MEDIUM_FDDI 0x00000002 -#define RNDIS_MEDIUM_WAN 0x00000003 -#define RNDIS_MEDIUM_LOCAL_TALK 0x00000004 -#define RNDIS_MEDIUM_ARCNET_RAW 0x00000006 -#define RNDIS_MEDIUM_ARCNET_878_2 0x00000007 -#define RNDIS_MEDIUM_ATM 0x00000008 -#define RNDIS_MEDIUM_WIRELESS_WAN 0x00000009 -#define RNDIS_MEDIUM_IRDA 0x0000000a -#define RNDIS_MEDIUM_CO_WAN 0x0000000b -/* Not a real medium, defined as an upper bound */ -#define RNDIS_MEDIUM_MAX 0x0000000d - -/* - * Remote NDIS medium connection states. 
- */ -#define RNDIS_MEDIA_STATE_CONNECTED 0x00000000 -#define RNDIS_MEDIA_STATE_DISCONNECTED 0x00000001 - -/* - * Remote NDIS version numbers - */ -#define RNDIS_MAJOR_VERSION 0x00000001 -#define RNDIS_MINOR_VERSION 0x00000000 - - -/* - * Remote NDIS offload parameters - */ -#define RNDIS_OBJECT_TYPE_DEFAULT 0x80 - -#define RNDIS_OFFLOAD_PARAMETERS_REVISION_3 3 -#define RNDIS_OFFLOAD_PARAMETERS_NO_CHANGE 0 -#define RNDIS_OFFLOAD_PARAMETERS_LSOV2_DISABLED 1 -#define RNDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED 2 -#define RNDIS_OFFLOAD_PARAMETERS_LSOV1_ENABLED 2 -#define RNDIS_OFFLOAD_PARAMETERS_RSC_DISABLED 1 -#define RNDIS_OFFLOAD_PARAMETERS_RSC_ENABLED 2 -#define RNDIS_OFFLOAD_PARAMETERS_TX_RX_DISABLED 1 -#define RNDIS_OFFLOAD_PARAMETERS_TX_ENABLED_RX_DISABLED 2 -#define RNDIS_OFFLOAD_PARAMETERS_RX_ENABLED_TX_DISABLED 3 -#define RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED 4 - -#define RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE 1 -#define RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4 0 -#define RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6 1 - - -#define RNDIS_OID_TCP_OFFLOAD_CURRENT_CONFIG 0xFC01020B /* query only */ -#define RNDIS_OID_TCP_OFFLOAD_PARAMETERS 0xFC01020C /* set only */ -#define RNDIS_OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES 0xFC01020D/* query only */ -#define RNDIS_OID_TCP_CONNECTION_OFFLOAD_CURRENT_CONFIG 0xFC01020E /* query only */ -#define RNDIS_OID_TCP_CONNECTION_OFFLOAD_HARDWARE_CAPABILITIES 0xFC01020F /* query */ -#define RNDIS_OID_OFFLOAD_ENCAPSULATION 0x0101010A /* set/query */ - -/* - * NdisInitialize message - */ -typedef struct rndis_initialize_request_ { - /* RNDIS request ID */ - uint32_t request_id; - uint32_t major_version; - uint32_t minor_version; - uint32_t max_xfer_size; -} rndis_initialize_request; - -/* - * Response to NdisInitialize - */ -typedef struct rndis_initialize_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; - uint32_t major_version; - uint32_t minor_version; - uint32_t device_flags; - /* RNDIS medium */ - 
uint32_t medium; - uint32_t max_pkts_per_msg; - uint32_t max_xfer_size; - uint32_t pkt_align_factor; - uint32_t af_list_offset; - uint32_t af_list_size; -} rndis_initialize_complete; - -/* - * Call manager devices only: Information about an address family - * supported by the device is appended to the response to NdisInitialize. - */ -typedef struct rndis_co_address_family_ { - /* RNDIS AF */ - uint32_t address_family; - uint32_t major_version; - uint32_t minor_version; -} rndis_co_address_family; - -/* - * NdisHalt message - */ -typedef struct rndis_halt_request_ { - /* RNDIS request ID */ - uint32_t request_id; -} rndis_halt_request; - -/* - * NdisQueryRequest message - */ -typedef struct rndis_query_request_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS OID */ - uint32_t oid; - uint32_t info_buffer_length; - uint32_t info_buffer_offset; - /* RNDIS handle */ - uint32_t device_vc_handle; -} rndis_query_request; - -/* - * Response to NdisQueryRequest - */ -typedef struct rndis_query_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; - uint32_t info_buffer_length; - uint32_t info_buffer_offset; -} rndis_query_complete; - -/* - * NdisSetRequest message - */ -typedef struct rndis_set_request_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS OID */ - uint32_t oid; - uint32_t info_buffer_length; - uint32_t info_buffer_offset; - /* RNDIS handle */ - uint32_t device_vc_handle; -} rndis_set_request; - -/* - * Response to NdisSetRequest - */ -typedef struct rndis_set_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; -} rndis_set_complete; - -/* - * NdisReset message - */ -typedef struct rndis_reset_request_ { - uint32_t reserved; -} rndis_reset_request; - -/* - * Response to NdisReset - */ -typedef struct rndis_reset_complete_ { - /* RNDIS status */ - uint32_t status; - uint32_t addressing_reset; -} rndis_reset_complete; - -/* - * NdisMIndicateStatus 
message - */ -typedef struct rndis_indicate_status_ { - /* RNDIS status */ - uint32_t status; - uint32_t status_buf_length; - uint32_t status_buf_offset; -} rndis_indicate_status; - -/* - * Diagnostic information passed as the status buffer in - * rndis_indicate_status messages signifying error conditions. - */ -typedef struct rndis_diagnostic_info_ { - /* RNDIS status */ - uint32_t diag_status; - uint32_t error_offset; -} rndis_diagnostic_info; - -/* - * NdisKeepAlive message - */ -typedef struct rndis_keepalive_request_ { - /* RNDIS request ID */ - uint32_t request_id; -} rndis_keepalive_request; - -/* - * Response to NdisKeepAlive - */ -typedef struct rndis_keepalive_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; -} rndis_keepalive_complete; - -/* - * Data message. All offset fields contain byte offsets from the beginning - * of the rndis_packet structure. All length fields are in bytes. - * VcHandle is set to 0 for connectionless data, otherwise it - * contains the VC handle. - */ -typedef struct rndis_packet_ { - uint32_t data_offset; - uint32_t data_length; - uint32_t oob_data_offset; - uint32_t oob_data_length; - uint32_t num_oob_data_elements; - uint32_t per_pkt_info_offset; - uint32_t per_pkt_info_length; - /* RNDIS handle */ - uint32_t vc_handle; - uint32_t reserved; -} rndis_packet; - -typedef struct rndis_packet_ex_ { - uint32_t data_offset; - uint32_t data_length; - uint32_t oob_data_offset; - uint32_t oob_data_length; - uint32_t num_oob_data_elements; - uint32_t per_pkt_info_offset; - uint32_t per_pkt_info_length; - /* RNDIS handle */ - uint32_t vc_handle; - uint32_t reserved; - uint64_t data_buf_id; - uint32_t data_buf_offset; - uint64_t next_header_buf_id; - uint32_t next_header_byte_offset; - uint32_t next_header_byte_count; -} rndis_packet_ex; - -/* - * Optional Out of Band data associated with a Data message. 
- */ -typedef struct rndis_oobd_ { - uint32_t size; - /* RNDIS class ID */ - uint32_t type; - uint32_t class_info_offset; -} rndis_oobd; - -/* - * Packet extension field contents associated with a Data message. - */ -typedef struct rndis_per_packet_info_ { - uint32_t size; - uint32_t type; - uint32_t per_packet_info_offset; -} rndis_per_packet_info; - -typedef enum ndis_per_pkt_infotype_ { - tcpip_chksum_info, - ipsec_info, - tcp_large_send_info, - classification_handle_info, - ndis_reserved, - sgl_info, - ieee_8021q_info, - original_pkt_info, - pkt_cancel_id, - original_netbuf_list, - cached_netbuf_list, - short_pkt_padding_info, - max_perpkt_info -} ndis_per_pkt_infotype; - -#define nbl_hash_value pkt_cancel_id -#define nbl_hash_info original_netbuf_list - -typedef struct ndis_8021q_info_ { - union { - struct { - uint32_t user_pri : 3; /* User Priority */ - uint32_t cfi : 1; /* Canonical Format ID */ - uint32_t vlan_id : 12; - uint32_t reserved : 16; - } s1; - uint32_t value; - } u1; -} ndis_8021q_info; - -struct rndis_object_header { - uint8_t type; - uint8_t revision; - uint16_t size; -}; - -typedef struct rndis_offload_params_ { - struct rndis_object_header header; - uint8_t ipv4_csum; - uint8_t tcp_ipv4_csum; - uint8_t udp_ipv4_csum; - uint8_t tcp_ipv6_csum; - uint8_t udp_ipv6_csum; - uint8_t lso_v1; - uint8_t ip_sec_v1; - uint8_t lso_v2_ipv4; - uint8_t lso_v2_ipv6; - uint8_t tcp_connection_ipv4; - uint8_t tcp_connection_ipv6; - uint32_t flags; - uint8_t ip_sec_v2; - uint8_t ip_sec_v2_ipv4; - struct { - uint8_t rsc_ipv4; - uint8_t rsc_ipv6; - }; - struct { - uint8_t encapsulated_packet_task_offload; - uint8_t encapsulation_types; - }; - -} rndis_offload_params; - - -typedef struct rndis_tcp_ip_csum_info_ { - union { - struct { - uint32_t is_ipv4:1; - uint32_t is_ipv6:1; - uint32_t tcp_csum:1; - uint32_t udp_csum:1; - uint32_t ip_header_csum:1; - uint32_t reserved:11; - uint32_t tcp_header_offset:10; - } xmit; - struct { - uint32_t tcp_csum_failed:1; - 
uint32_t udp_csum_failed:1; - uint32_t ip_csum_failed:1; - uint32_t tcp_csum_succeeded:1; - uint32_t udp_csum_succeeded:1; - uint32_t ip_csum_succeeded:1; - uint32_t loopback:1; - uint32_t tcp_csum_value_invalid:1; - uint32_t ip_csum_value_invalid:1; - } receive; - uint32_t value; - }; -} rndis_tcp_ip_csum_info; - -struct rndis_hash_value { - uint32_t hash_value; -} __packed; - -struct rndis_hash_info { - uint32_t hash_info; -} __packed; - -#define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */ -#define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */ - -/* hash function */ -#define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001 - -/* hash type */ -#define NDIS_HASH_IPV4 0x00000100 -#define NDIS_HASH_TCP_IPV4 0x00000200 -#define NDIS_HASH_IPV6 0x00000400 -#define NDIS_HASH_IPV6_EX 0x00000800 -#define NDIS_HASH_TCP_IPV6 0x00001000 -#define NDIS_HASH_TCP_IPV6_EX 0x00002000 - -typedef struct rndis_tcp_tso_info_ { - union { - struct { - uint32_t unused:30; - uint32_t type:1; - uint32_t reserved2:1; - } xmit; - struct { - uint32_t mss:20; - uint32_t tcp_header_offset:10; - uint32_t type:1; - uint32_t reserved2:1; - } lso_v1_xmit; - struct { - uint32_t tcp_payload:30; - uint32_t type:1; - uint32_t reserved2:1; - } lso_v1_xmit_complete; - struct { - uint32_t mss:20; - uint32_t tcp_header_offset:10; - uint32_t type:1; - uint32_t ip_version:1; - } lso_v2_xmit; - struct { - uint32_t reserved:30; - uint32_t type:1; - uint32_t reserved2:1; - } lso_v2_xmit_complete; - uint32_t value; - }; -} rndis_tcp_tso_info; - -#define RNDIS_HASHVAL_PPI_SIZE (sizeof(rndis_per_packet_info) + \ - sizeof(struct rndis_hash_value)) - -#define RNDIS_VLAN_PPI_SIZE (sizeof(rndis_per_packet_info) + \ - sizeof(ndis_8021q_info)) - -#define RNDIS_CSUM_PPI_SIZE (sizeof(rndis_per_packet_info) + \ - sizeof(rndis_tcp_ip_csum_info)) - -#define RNDIS_TSO_PPI_SIZE (sizeof(rndis_per_packet_info) + \ - sizeof(rndis_tcp_tso_info)) - -/* - * Format of Information buffer passed in a SetRequest for the OID - * 
OID_GEN_RNDIS_CONFIG_PARAMETER. - */ -typedef struct rndis_config_parameter_info_ { - uint32_t parameter_name_offset; - uint32_t parameter_name_length; - uint32_t parameter_type; - uint32_t parameter_value_offset; - uint32_t parameter_value_length; -} rndis_config_parameter_info; - -/* - * Values for ParameterType in rndis_config_parameter_info - */ -#define RNDIS_CONFIG_PARAM_TYPE_INTEGER 0 -#define RNDIS_CONFIG_PARAM_TYPE_STRING 2 - - -/* - * CONDIS Miniport messages for connection oriented devices - * that do not implement a call manager. - */ - -/* - * CoNdisMiniportCreateVc message - */ -typedef struct rcondis_mp_create_vc_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS handle */ - uint32_t ndis_vc_handle; -} rcondis_mp_create_vc; - -/* - * Response to CoNdisMiniportCreateVc - */ -typedef struct rcondis_mp_create_vc_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS handle */ - uint32_t device_vc_handle; - /* RNDIS status */ - uint32_t status; -} rcondis_mp_create_vc_complete; - -/* - * CoNdisMiniportDeleteVc message - */ -typedef struct rcondis_mp_delete_vc_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS handle */ - uint32_t device_vc_handle; -} rcondis_mp_delete_vc; - -/* - * Response to CoNdisMiniportDeleteVc - */ -typedef struct rcondis_mp_delete_vc_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; -} rcondis_mp_delete_vc_complete; - -/* - * CoNdisMiniportQueryRequest message - */ -typedef struct rcondis_mp_query_request_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS request type */ - uint32_t request_type; - /* RNDIS OID */ - uint32_t oid; - /* RNDIS handle */ - uint32_t device_vc_handle; - uint32_t info_buf_length; - uint32_t info_buf_offset; -} rcondis_mp_query_request; - -/* - * CoNdisMiniportSetRequest message - */ -typedef struct rcondis_mp_set_request_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS request type */ - uint32_t 
request_type; - /* RNDIS OID */ - uint32_t oid; - /* RNDIS handle */ - uint32_t device_vc_handle; - uint32_t info_buf_length; - uint32_t info_buf_offset; -} rcondis_mp_set_request; - -/* - * CoNdisIndicateStatus message - */ -typedef struct rcondis_indicate_status_ { - /* RNDIS handle */ - uint32_t ndis_vc_handle; - /* RNDIS status */ - uint32_t status; - uint32_t status_buf_length; - uint32_t status_buf_offset; -} rcondis_indicate_status; - -/* - * CONDIS Call/VC parameters - */ - -typedef struct rcondis_specific_parameters_ { - uint32_t parameter_type; - uint32_t parameter_length; - uint32_t parameter_offset; -} rcondis_specific_parameters; - -typedef struct rcondis_media_parameters_ { - uint32_t flags; - uint32_t reserved1; - uint32_t reserved2; - rcondis_specific_parameters media_specific; -} rcondis_media_parameters; - -typedef struct rndis_flowspec_ { - uint32_t token_rate; - uint32_t token_bucket_size; - uint32_t peak_bandwidth; - uint32_t latency; - uint32_t delay_variation; - uint32_t service_type; - uint32_t max_sdu_size; - uint32_t minimum_policed_size; -} rndis_flowspec; - -typedef struct rcondis_call_manager_parameters_ { - rndis_flowspec transmit; - rndis_flowspec receive; - rcondis_specific_parameters call_mgr_specific; -} rcondis_call_manager_parameters; - -/* - * CoNdisMiniportActivateVc message - */ -typedef struct rcondis_mp_activate_vc_request_ { - /* RNDIS request ID */ - uint32_t request_id; - uint32_t flags; - /* RNDIS handle */ - uint32_t device_vc_handle; - uint32_t media_params_offset; - uint32_t media_params_length; - uint32_t call_mgr_params_offset; - uint32_t call_mgr_params_length; -} rcondis_mp_activate_vc_request; - -/* - * Response to CoNdisMiniportActivateVc - */ -typedef struct rcondis_mp_activate_vc_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; -} rcondis_mp_activate_vc_complete; - -/* - * CoNdisMiniportDeactivateVc message - */ -typedef struct 
rcondis_mp_deactivate_vc_request_ { - /* RNDIS request ID */ - uint32_t request_id; - uint32_t flags; - /* RNDIS handle */ - uint32_t device_vc_handle; -} rcondis_mp_deactivate_vc_request; - -/* - * Response to CoNdisMiniportDeactivateVc - */ -typedef struct rcondis_mp_deactivate_vc_complete_ { - /* RNDIS request ID */ - uint32_t request_id; - /* RNDIS status */ - uint32_t status; -} rcondis_mp_deactivate_vc_complete; - -/* - * union with all of the RNDIS messages - */ -typedef union rndis_msg_container_ { - rndis_packet packet; - rndis_initialize_request init_request; - rndis_halt_request halt_request; - rndis_query_request query_request; - rndis_set_request set_request; - rndis_reset_request reset_request; - rndis_keepalive_request keepalive_request; - rndis_indicate_status indicate_status; - rndis_initialize_complete init_complete; - rndis_query_complete query_complete; - rndis_set_complete set_complete; - rndis_reset_complete reset_complete; - rndis_keepalive_complete keepalive_complete; - rcondis_mp_create_vc co_miniport_create_vc; - rcondis_mp_delete_vc co_miniport_delete_vc; - rcondis_indicate_status co_miniport_status; - rcondis_mp_activate_vc_request co_miniport_activate_vc; - rcondis_mp_deactivate_vc_request co_miniport_deactivate_vc; - rcondis_mp_create_vc_complete co_miniport_create_vc_complete; - rcondis_mp_delete_vc_complete co_miniport_delete_vc_complete; - rcondis_mp_activate_vc_complete co_miniport_activate_vc_complete; - rcondis_mp_deactivate_vc_complete co_miniport_deactivate_vc_complete; - rndis_packet_ex packet_ex; -} rndis_msg_container; - -/* - * Remote NDIS message format - */ -typedef struct rndis_msg_ { - uint32_t ndis_msg_type; - - /* - * Total length of this message, from the beginning - * of the rndis_msg struct, in bytes. - */ - uint32_t msg_len; - - /* Actual message */ - rndis_msg_container msg; -} rndis_msg; - - -/* - * Handy macros - */ - -/* - * get the size of an RNDIS message. 
Pass in the message type, - * rndis_set_request, rndis_packet for example - */ -#define RNDIS_MESSAGE_SIZE(message) \ - (sizeof(message) + (sizeof(rndis_msg) - sizeof(rndis_msg_container))) - -/* - * get pointer to info buffer with message pointer - */ -#define MESSAGE_TO_INFO_BUFFER(message) \ - (((PUCHAR)(message)) + message->InformationBufferOffset) - -/* - * get pointer to status buffer with message pointer - */ -#define MESSAGE_TO_STATUS_BUFFER(message) \ - (((PUCHAR)(message)) + message->StatusBufferOffset) - -/* - * get pointer to OOBD buffer with message pointer - */ -#define MESSAGE_TO_OOBD_BUFFER(message) \ - (((PUCHAR)(message)) + message->OOBDataOffset) - -/* - * get pointer to data buffer with message pointer - */ -#define MESSAGE_TO_DATA_BUFFER(message) \ - (((PUCHAR)(message)) + message->PerPacketInfoOffset) - -/* - * get pointer to contained message from NDIS_MESSAGE pointer - */ -#define RNDIS_MESSAGE_PTR_TO_MESSAGE_PTR(rndis_message) \ - ((void *) &rndis_message->Message) - -/* - * get pointer to contained message from NDIS_MESSAGE pointer - */ -#define RNDIS_MESSAGE_RAW_PTR_TO_MESSAGE_PTR(rndis_message) \ - ((void *) rndis_message) - - - -/* - * Structures used in OID_RNDISMP_GET_RECEIVE_BUFFERS - */ - -#define RNDISMP_RECEIVE_BUFFER_ELEM_FLAG_VMQ_RECEIVE_BUFFER 0x00000001 - -typedef struct rndismp_rx_buf_elem_ { - uint32_t flags; - uint32_t length; - uint64_t rx_buf_id; - uint32_t gpadl_handle; - void *rx_buf; -} rndismp_rx_buf_elem; - -typedef struct rndismp_rx_bufs_info_ { - uint32_t num_rx_bufs; - rndismp_rx_buf_elem rx_buf_elems[1]; -} rndismp_rx_bufs_info; - - - -#define RNDIS_HEADER_SIZE (sizeof(rndis_msg) - sizeof(rndis_msg_container)) - -#define NDIS_PACKET_TYPE_DIRECTED 0x00000001 -#define NDIS_PACKET_TYPE_MULTICAST 0x00000002 -#define NDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 -#define NDIS_PACKET_TYPE_BROADCAST 0x00000008 -#define NDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 -#define NDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 -#define 
NDIS_PACKET_TYPE_SMT 0x00000040 -#define NDIS_PACKET_TYPE_ALL_LOCAL 0x00000080 -#define NDIS_PACKET_TYPE_GROUP 0x00000100 -#define NDIS_PACKET_TYPE_ALL_FUNCTIONAL 0x00000200 -#define NDIS_PACKET_TYPE_FUNCTIONAL 0x00000400 -#define NDIS_PACKET_TYPE_MAC_FRAME 0x00000800 - -/* - * Externs - */ -struct hn_rx_ring; -struct hn_tx_ring; - -int netvsc_recv(struct hn_rx_ring *rxr, - netvsc_packet *packet, const rndis_tcp_ip_csum_info *csum_info, - const struct rndis_hash_info *hash_info, - const struct rndis_hash_value *hash_value); -void netvsc_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr); - -void* hv_set_rppi_data(rndis_msg *rndis_mesg, - uint32_t rppi_size, - int pkt_type); - -void* hv_get_ppi_data(rndis_packet *rpkt, uint32_t type); - -#endif /* __HV_RNDIS_H__ */ - diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 120ad1a..e22060e 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -39,6 +39,9 @@ __FBSDID("$FreeBSD$"); #include <net/if_arp.h> #include <net/if_var.h> #include <net/ethernet.h> +#include <net/rndis.h> +#include <netinet/in.h> +#include <netinet/ip.h> #include <sys/types.h> #include <machine/atomic.h> #include <sys/sema.h> @@ -47,16 +50,11 @@ __FBSDID("$FreeBSD$"); #include <vm/pmap.h> #include <dev/hyperv/include/hyperv.h> -#include "hv_net_vsc.h" -#include "hv_rndis.h" -#include "hv_rndis_filter.h" - -struct hv_rf_recvinfo { - const ndis_8021q_info *vlan_info; - const rndis_tcp_ip_csum_info *csum_info; - const struct rndis_hash_info *hash_info; - const struct rndis_hash_value *hash_value; -}; +#include <dev/hyperv/include/vmbus_xact.h> +#include <dev/hyperv/netvsc/hv_net_vsc.h> +#include <dev/hyperv/netvsc/hv_rndis_filter.h> +#include <dev/hyperv/netvsc/if_hnreg.h> +#include <dev/hyperv/netvsc/ndis.h> #define HV_RF_RECVINFO_VLAN 0x1 #define HV_RF_RECVINFO_CSUM 0x2 @@ -68,1206 +66,1218 @@ struct hv_rf_recvinfo { HV_RF_RECVINFO_HASHINF | \ 
HV_RF_RECVINFO_HASHVAL) -/* - * Forward declarations - */ -static int hv_rf_send_request(rndis_device *device, rndis_request *request, - uint32_t message_type); -static void hv_rf_receive_response(rndis_device *device, rndis_msg *response); -static void hv_rf_receive_indicate_status(rndis_device *device, - rndis_msg *response); -static void hv_rf_receive_data(struct hn_rx_ring *rxr, rndis_msg *message, - netvsc_packet *pkt); -static int hv_rf_query_device(rndis_device *device, uint32_t oid, - void *result, uint32_t *result_size); -static inline int hv_rf_query_device_mac(rndis_device *device); -static inline int hv_rf_query_device_link_status(rndis_device *device); -static int hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter); -static int hv_rf_init_device(rndis_device *device); -static int hv_rf_open_device(rndis_device *device); -static int hv_rf_close_device(rndis_device *device); -static void hv_rf_on_send_request_completion(struct vmbus_channel *, void *context); -static void hv_rf_on_send_request_halt_completion(struct vmbus_channel *, void *context); -int -hv_rf_send_offload_request(struct hn_softc *sc, - rndis_offload_params *offloads); -/* - * Set the Per-Packet-Info with the specified type - */ -void * -hv_set_rppi_data(rndis_msg *rndis_mesg, uint32_t rppi_size, - int pkt_type) -{ - rndis_packet *rndis_pkt; - rndis_per_packet_info *rppi; - - rndis_pkt = &rndis_mesg->msg.packet; - rndis_pkt->data_offset += rppi_size; +#define HN_RNDIS_RID_COMPAT_MASK 0xffff +#define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK - rppi = (rndis_per_packet_info *)((char *)rndis_pkt + - rndis_pkt->per_pkt_info_offset + rndis_pkt->per_pkt_info_length); +#define HN_RNDIS_XFER_SIZE 2048 - rppi->size = rppi_size; - rppi->type = pkt_type; - rppi->per_packet_info_offset = sizeof(rndis_per_packet_info); - - rndis_pkt->per_pkt_info_length += rppi_size; - - return (rppi); -} +#define HN_NDIS_TXCSUM_CAP_IP4 \ + (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT) +#define 
HN_NDIS_TXCSUM_CAP_TCP4 \ + (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT) +#define HN_NDIS_TXCSUM_CAP_TCP6 \ + (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \ + NDIS_TXCSUM_CAP_IP6EXT) +#define HN_NDIS_TXCSUM_CAP_UDP6 \ + (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT) +#define HN_NDIS_LSOV2_CAP_IP6 \ + (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT) /* - * Get the Per-Packet-Info with the specified type - * return NULL if not found. - */ -void * -hv_get_ppi_data(rndis_packet *rpkt, uint32_t type) -{ - rndis_per_packet_info *ppi; - int len; - - if (rpkt->per_pkt_info_offset == 0) - return (NULL); - - ppi = (rndis_per_packet_info *)((unsigned long)rpkt + - rpkt->per_pkt_info_offset); - len = rpkt->per_pkt_info_length; - - while (len > 0) { - if (ppi->type == type) - return (void *)((unsigned long)ppi + - ppi->per_packet_info_offset); - - len -= ppi->size; - ppi = (rndis_per_packet_info *)((unsigned long)ppi + ppi->size); - } - - return (NULL); -} - - -/* - * Allow module_param to work and override to switch to promiscuous mode. 
+ * Forward declarations */ -static inline rndis_device * -hv_get_rndis_device(void) +static void hv_rf_receive_indicate_status(struct hn_softc *sc, + const void *data, int dlen); +static void hv_rf_receive_data(struct hn_rx_ring *rxr, + const void *data, int dlen); + +static int hn_rndis_query(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0); +static int hn_rndis_query2(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0, + size_t min_odlen); +static int hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, + size_t dlen); +static int hn_rndis_conf_offload(struct hn_softc *sc, int mtu); +static int hn_rndis_query_hwcaps(struct hn_softc *sc, + struct ndis_offload *caps); + +static __inline uint32_t +hn_rndis_rid(struct hn_softc *sc) { - rndis_device *device; - - device = malloc(sizeof(rndis_device), M_NETVSC, M_WAITOK | M_ZERO); - - mtx_init(&device->req_lock, "HV-FRL", NULL, MTX_DEF); - - /* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */ - STAILQ_INIT(&device->myrequest_list); + uint32_t rid; - device->state = RNDIS_DEV_UNINITIALIZED; +again: + rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1); + if (rid == 0) + goto again; - return (device); -} - -/* - * - */ -static inline void -hv_put_rndis_device(rndis_device *device) -{ - mtx_destroy(&device->req_lock); - free(device, M_NETVSC); + /* Use upper 16 bits for non-compat RNDIS messages. 
*/ + return ((rid & 0xffff) << 16); } -/* - * - */ -static inline rndis_request * -hv_rndis_request(rndis_device *device, uint32_t message_type, - uint32_t message_length) +void * +hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, + size_t pi_dlen, uint32_t pi_type) { - rndis_request *request; - rndis_msg *rndis_mesg; - rndis_set_request *set; - - request = malloc(sizeof(rndis_request), M_NETVSC, M_WAITOK | M_ZERO); + const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); + struct rndis_pktinfo *pi; - sema_init(&request->wait_sema, 0, "rndis sema"); - - rndis_mesg = &request->request_msg; - rndis_mesg->ndis_msg_type = message_type; - rndis_mesg->msg_len = message_length; + KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, + ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* - * Set the request id. This field is always after the rndis header - * for request/response packet types so we just use the set_request - * as a template. + * Per-packet-info does not move; it only grows. + * + * NOTE: + * rm_pktinfooffset in this phase counts from the beginning + * of rndis_packet_msg. */ - set = &rndis_mesg->msg.set_request; - set->request_id = atomic_fetchadd_int(&device->new_request_id, 1); - /* Increment to get the new value (call above returns old value) */ - set->request_id += 1; + KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, + ("%u pktinfo overflows RNDIS packet msg", pi_type)); + pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + + pkt->rm_pktinfolen); + pkt->rm_pktinfolen += pi_size; - /* Add to the request list */ - mtx_lock(&device->req_lock); - STAILQ_INSERT_TAIL(&device->myrequest_list, request, mylist_entry); - mtx_unlock(&device->req_lock); + pi->rm_size = pi_size; + pi->rm_type = pi_type; + pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; - return (request); -} + /* Data immediately follow per-packet-info. 
*/ + pkt->rm_dataoffset += pi_size; -/* - * - */ -static inline void -hv_put_rndis_request(rndis_device *device, rndis_request *request) -{ - mtx_lock(&device->req_lock); - /* Fixme: Has O(n) performance */ - /* - * XXXKYS: Use Doubly linked lists. - */ - STAILQ_REMOVE(&device->myrequest_list, request, rndis_request_, - mylist_entry); - mtx_unlock(&device->req_lock); + /* Update RNDIS packet msg length */ + pkt->rm_len += pi_size; - sema_destroy(&request->wait_sema); - free(request, M_NETVSC); + return (pi->rm_data); } /* - * - */ -static int -hv_rf_send_request(rndis_device *device, rndis_request *request, - uint32_t message_type) -{ - int ret; - netvsc_packet *packet; - netvsc_dev *net_dev = device->net_dev; - int send_buf_section_idx; - - /* Set up the packet to send it */ - packet = &request->pkt; - - packet->is_data_pkt = FALSE; - packet->tot_data_buf_len = request->request_msg.msg_len; - packet->gpa_cnt = 1; - - packet->gpa[0].gpa_page = - hv_get_phys_addr(&request->request_msg) >> PAGE_SHIFT; - packet->gpa[0].gpa_len = request->request_msg.msg_len; - packet->gpa[0].gpa_ofs = - (unsigned long)&request->request_msg & (PAGE_SIZE - 1); - - if (packet->gpa[0].gpa_ofs + packet->gpa[0].gpa_len > PAGE_SIZE) { - packet->gpa_cnt = 2; - packet->gpa[0].gpa_len = PAGE_SIZE - packet->gpa[0].gpa_ofs; - packet->gpa[1].gpa_page = - hv_get_phys_addr((char*)&request->request_msg + - packet->gpa[0].gpa_len) >> PAGE_SHIFT; - packet->gpa[1].gpa_ofs = 0; - packet->gpa[1].gpa_len = request->request_msg.msg_len - - packet->gpa[0].gpa_len; - } - - packet->compl.send.send_completion_context = request; /* packet */ - if (message_type != REMOTE_NDIS_HALT_MSG) { - packet->compl.send.on_send_completion = - hv_rf_on_send_request_completion; - } else { - packet->compl.send.on_send_completion = - hv_rf_on_send_request_halt_completion; - } - packet->compl.send.send_completion_tid = (unsigned long)device; - if (packet->tot_data_buf_len < net_dev->send_section_size) { - send_buf_section_idx = 
hv_nv_get_next_send_section(net_dev); - if (send_buf_section_idx != - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { - char *dest = ((char *)net_dev->send_buf + - send_buf_section_idx * net_dev->send_section_size); - - memcpy(dest, &request->request_msg, request->request_msg.msg_len); - packet->send_buf_section_idx = send_buf_section_idx; - packet->send_buf_section_size = packet->tot_data_buf_len; - packet->gpa_cnt = 0; - goto sendit; - } - /* Failed to allocate chimney send buffer; move on */ - } - packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; - packet->send_buf_section_size = 0; - -sendit: - ret = hv_nv_on_send(device->net_dev->sc->hn_prichan, packet); - - return (ret); -} - -/* - * RNDIS filter receive response + * RNDIS filter receive indicate status */ static void -hv_rf_receive_response(rndis_device *device, rndis_msg *response) -{ - rndis_request *request = NULL; - rndis_request *next_request; - boolean_t found = FALSE; - - mtx_lock(&device->req_lock); - request = STAILQ_FIRST(&device->myrequest_list); - while (request != NULL) { - /* - * All request/response message contains request_id as the - * first field - */ - if (request->request_msg.msg.init_request.request_id == - response->msg.init_complete.request_id) { - found = TRUE; - break; - } - next_request = STAILQ_NEXT(request, mylist_entry); - request = next_request; - } - mtx_unlock(&device->req_lock); - - if (found) { - if (response->msg_len <= sizeof(rndis_msg)) { - memcpy(&request->response_msg, response, - response->msg_len); - } else { - if (response->ndis_msg_type == REMOTE_NDIS_RESET_CMPLT) { - /* Does not have a request id field */ - request->response_msg.msg.reset_complete.status = - STATUS_BUFFER_OVERFLOW; - } else { - request->response_msg.msg.init_complete.status = - STATUS_BUFFER_OVERFLOW; - } - } - - sema_post(&request->wait_sema); - } -} - -int -hv_rf_send_offload_request(struct hn_softc *sc, - rndis_offload_params *offloads) +hv_rf_receive_indicate_status(struct 
hn_softc *sc, const void *data, int dlen) { - rndis_request *request; - rndis_set_request *set; - rndis_offload_params *offload_req; - rndis_set_complete *set_complete; - rndis_device *rndis_dev; - device_t dev = sc->hn_dev; - netvsc_dev *net_dev = sc->net_dev; - uint32_t vsp_version = net_dev->nvsp_version; - uint32_t extlen = sizeof(rndis_offload_params); - int ret; - - if (vsp_version <= NVSP_PROTOCOL_VERSION_4) { - extlen = VERSION_4_OFFLOAD_SIZE; - /* On NVSP_PROTOCOL_VERSION_4 and below, we do not support - * UDP checksum offload. - */ - offloads->udp_ipv4_csum = 0; - offloads->udp_ipv6_csum = 0; - } - - rndis_dev = net_dev->extension; - - request = hv_rndis_request(rndis_dev, REMOTE_NDIS_SET_MSG, - RNDIS_MESSAGE_SIZE(rndis_set_request) + extlen); - if (!request) - return (ENOMEM); - - set = &request->request_msg.msg.set_request; - set->oid = RNDIS_OID_TCP_OFFLOAD_PARAMETERS; - set->info_buffer_length = extlen; - set->info_buffer_offset = sizeof(rndis_set_request); - set->device_vc_handle = 0; - - offload_req = (rndis_offload_params *)((unsigned long)set + - set->info_buffer_offset); - *offload_req = *offloads; - offload_req->header.type = RNDIS_OBJECT_TYPE_DEFAULT; - offload_req->header.revision = RNDIS_OFFLOAD_PARAMETERS_REVISION_3; - offload_req->header.size = extlen; - - ret = hv_rf_send_request(rndis_dev, request, REMOTE_NDIS_SET_MSG); - if (ret != 0) { - device_printf(dev, "hv send offload request failed, ret=%d!\n", - ret); - goto cleanup; - } - - ret = sema_timedwait(&request->wait_sema, 5 * hz); - if (ret != 0) { - device_printf(dev, "hv send offload request timeout\n"); - goto cleanup; - } + const struct rndis_status_msg *msg; - set_complete = &request->response_msg.msg.set_complete; - if (set_complete->status == RNDIS_STATUS_SUCCESS) { - device_printf(dev, "hv send offload request succeeded\n"); - ret = 0; - } else { - if (set_complete->status == STATUS_NOT_SUPPORTED) { - device_printf(dev, "HV Not support offload\n"); - ret = 0; - } else { - ret = 
set_complete->status; - } + if (dlen < sizeof(*msg)) { + if_printf(sc->hn_ifp, "invalid RNDIS status\n"); + return; } + msg = data; -cleanup: - hv_put_rndis_request(rndis_dev, request); - - return (ret); -} - -/* - * RNDIS filter receive indicate status - */ -static void -hv_rf_receive_indicate_status(rndis_device *device, rndis_msg *response) -{ - rndis_indicate_status *indicate = &response->msg.indicate_status; - - switch(indicate->status) { + switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: - netvsc_linkstatus_callback(device->net_dev->sc, 1); - break; case RNDIS_STATUS_MEDIA_DISCONNECT: - netvsc_linkstatus_callback(device->net_dev->sc, 0); + hn_link_status_update(sc); break; + + case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: + /* Not really useful; ignore. */ + break; + + case RNDIS_STATUS_NETWORK_CHANGE: + /* TODO */ + if_printf(sc->hn_ifp, "network changed\n"); + break; + default: /* TODO: */ - device_printf(device->net_dev->sc->hn_dev, - "unknown status %d received\n", indicate->status); + if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", + msg->rm_status); break; } } static int -hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hv_rf_recvinfo *info) +hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_recvinfo *info) { - const rndis_per_packet_info *ppi; - uint32_t mask, len; - - info->vlan_info = NULL; - info->csum_info = NULL; - info->hash_info = NULL; - info->hash_value = NULL; - - if (rpkt->per_pkt_info_offset == 0) - return 0; - - ppi = (const rndis_per_packet_info *) - ((const uint8_t *)rpkt + rpkt->per_pkt_info_offset); - len = rpkt->per_pkt_info_length; - mask = 0; - - while (len != 0) { - const void *ppi_dptr; - uint32_t ppi_dlen; - - if (__predict_false(ppi->size < ppi->per_packet_info_offset)) - return EINVAL; - ppi_dlen = ppi->size - ppi->per_packet_info_offset; - ppi_dptr = (const uint8_t *)ppi + ppi->per_packet_info_offset; - - switch (ppi->type) { - case ieee_8021q_info: - if (__predict_false(ppi_dlen < 
sizeof(ndis_8021q_info))) - return EINVAL; - info->vlan_info = ppi_dptr; + const struct rndis_pktinfo *pi = info_data; + uint32_t mask = 0; + + while (info_dlen != 0) { + const void *data; + uint32_t dlen; + + if (__predict_false(info_dlen < sizeof(*pi))) + return (EINVAL); + if (__predict_false(info_dlen < pi->rm_size)) + return (EINVAL); + info_dlen -= pi->rm_size; + + if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) + return (EINVAL); + if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) + return (EINVAL); + dlen = pi->rm_size - pi->rm_pktinfooffset; + data = pi->rm_data; + + switch (pi->rm_type) { + case NDIS_PKTINFO_TYPE_VLAN: + if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) + return (EINVAL); + info->vlan_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_VLAN; break; - case tcpip_chksum_info: - if (__predict_false(ppi_dlen < - sizeof(rndis_tcp_ip_csum_info))) - return EINVAL; - info->csum_info = ppi_dptr; + case NDIS_PKTINFO_TYPE_CSUM: + if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) + return (EINVAL); + info->csum_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_CSUM; break; - case nbl_hash_value: - if (__predict_false(ppi_dlen < - sizeof(struct rndis_hash_value))) - return EINVAL; - info->hash_value = ppi_dptr; + case HN_NDIS_PKTINFO_TYPE_HASHVAL: + if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) + return (EINVAL); + info->hash_value = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_HASHVAL; break; - case nbl_hash_info: - if (__predict_false(ppi_dlen < - sizeof(struct rndis_hash_info))) - return EINVAL; - info->hash_info = ppi_dptr; + case HN_NDIS_PKTINFO_TYPE_HASHINF: + if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) + return (EINVAL); + info->hash_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_HASHINF; break; default: - goto skip; + goto next; } if (mask == HV_RF_RECVINFO_ALL) { /* All found; done */ break; } -skip: - if (__predict_false(len < ppi->size)) - return EINVAL; - len -= ppi->size; - 
ppi = (const rndis_per_packet_info *) - ((const uint8_t *)ppi + ppi->size); - } - return 0; +next: + pi = (const struct rndis_pktinfo *) + ((const uint8_t *)pi + pi->rm_size); + } + + /* + * Final fixup. + * - If there is no hash value, invalidate the hash info. + */ + if ((mask & HV_RF_RECVINFO_HASHVAL) == 0) + info->hash_info = HN_NDIS_HASH_INFO_INVALID; + return (0); +} + +static __inline bool +hn_rndis_check_overlap(int off, int len, int check_off, int check_len) +{ + + if (off < check_off) { + if (__predict_true(off + len <= check_off)) + return (false); + } else if (off > check_off) { + if (__predict_true(check_off + check_len <= off)) + return (false); + } + return (true); } /* * RNDIS filter receive data */ static void -hv_rf_receive_data(struct hn_rx_ring *rxr, rndis_msg *message, - netvsc_packet *pkt) +hv_rf_receive_data(struct hn_rx_ring *rxr, const void *data, int dlen) { - rndis_packet *rndis_pkt; - uint32_t data_offset; - struct hv_rf_recvinfo info; - - rndis_pkt = &message->msg.packet; + const struct rndis_packet_msg *pkt; + struct hn_recvinfo info; + int data_off, pktinfo_off, data_len, pktinfo_len; /* - * Fixme: Handle multiple rndis pkt msgs that may be enclosed in this - * netvsc packet (ie tot_data_buf_len != message_length) + * Check length. 
*/ + if (__predict_false(dlen < sizeof(*pkt))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); + return; + } + pkt = data; - /* Remove rndis header, then pass data packet up the stack */ - data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; + if (__predict_false(dlen < pkt->rm_len)) { + if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " + "dlen %d, msglen %u\n", dlen, pkt->rm_len); + return; + } + if (__predict_false(pkt->rm_len < + pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " + "msglen %u, data %u, oob %u, pktinfo %u\n", + pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, + pkt->rm_pktinfolen); + return; + } + if (__predict_false(pkt->rm_datalen == 0)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); + return; + } - pkt->tot_data_buf_len -= data_offset; - if (pkt->tot_data_buf_len < rndis_pkt->data_length) { - pkt->status = nvsp_status_failure; - if_printf(rxr->hn_ifp, - "total length %u is less than data length %u\n", - pkt->tot_data_buf_len, rndis_pkt->data_length); + /* + * Check offests. 
+ */ +#define IS_OFFSET_INVALID(ofs) \ + ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ + ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) + + /* XXX Hyper-V does not meet data offset alignment requirement */ + if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data offset %u\n", pkt->rm_dataoffset); + return; + } + if (__predict_false(pkt->rm_oobdataoffset > 0 && + IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob offset %u\n", pkt->rm_oobdataoffset); + return; + } + if (__predict_true(pkt->rm_pktinfooffset > 0) && + __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } - pkt->tot_data_buf_len = rndis_pkt->data_length; - pkt->data = (void *)((unsigned long)pkt->data + data_offset); +#undef IS_OFFSET_INVALID - if (hv_rf_find_recvinfo(rndis_pkt, &info)) { - pkt->status = nvsp_status_failure; - if_printf(rxr->hn_ifp, "recvinfo parsing failed\n"); - return; + data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); + data_len = pkt->rm_datalen; + pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); + pktinfo_len = pkt->rm_pktinfolen; + + /* + * Check OOB coverage. + */ + if (__predict_false(pkt->rm_oobdatalen != 0)) { + int oob_off, oob_len; + + if_printf(rxr->hn_ifp, "got oobdata\n"); + oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); + oob_len = pkt->rm_oobdatalen; + + if (__predict_false(oob_off + oob_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overflow, msglen %u, oob abs %d len %d\n", + pkt->rm_len, oob_off, oob_len); + return; + } + + /* + * Check against data. 
+ */ + if (hn_rndis_check_overlap(oob_off, oob_len, + data_off, data_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps data, oob abs %d len %d, " + "data abs %d len %d\n", + oob_off, oob_len, data_off, data_len); + return; + } + + /* + * Check against pktinfo. + */ + if (pktinfo_len != 0 && + hn_rndis_check_overlap(oob_off, oob_len, + pktinfo_off, pktinfo_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps pktinfo, oob abs %d len %d, " + "pktinfo abs %d len %d\n", + oob_off, oob_len, pktinfo_off, pktinfo_len); + return; + } } - if (info.vlan_info != NULL) - pkt->vlan_tci = info.vlan_info->u1.s1.vlan_id; - else - pkt->vlan_tci = 0; + /* + * Check per-packet-info coverage and find useful per-packet-info. + */ + info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; + info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; + info.hash_info = HN_NDIS_HASH_INFO_INVALID; + if (__predict_true(pktinfo_len != 0)) { + bool overlap; + int error; + + if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overflow, msglen %u, " + "pktinfo abs %d len %d\n", + pkt->rm_len, pktinfo_off, pktinfo_len); + return; + } + + /* + * Check packet info coverage. + */ + overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, + data_off, data_len); + if (__predict_false(overlap)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overlap data, pktinfo abs %d len %d, " + "data abs %d len %d\n", + pktinfo_off, pktinfo_len, data_off, data_len); + return; + } + + /* + * Find useful per-packet-info. 
+ */ + error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, + pktinfo_len, &info); + if (__predict_false(error)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " + "pktinfo\n"); + return; + } + } - netvsc_recv(rxr, pkt, info.csum_info, info.hash_info, info.hash_value); + if (__predict_false(data_off + data_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data overflow, msglen %u, data abs %d len %d\n", + pkt->rm_len, data_off, data_len); + return; + } + hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); } /* * RNDIS filter on receive */ -int -hv_rf_on_receive(netvsc_dev *net_dev, - struct hn_rx_ring *rxr, netvsc_packet *pkt) +void +hv_rf_on_receive(struct hn_softc *sc, struct hn_rx_ring *rxr, + const void *data, int dlen) { - rndis_device *rndis_dev; - rndis_msg *rndis_hdr; - - /* Make sure the rndis device state is initialized */ - if (net_dev->extension == NULL) { - pkt->status = nvsp_status_failure; - return (ENODEV); - } + const struct rndis_comp_hdr *comp; + const struct rndis_msghdr *hdr; - rndis_dev = (rndis_device *)net_dev->extension; - if (rndis_dev->state == RNDIS_DEV_UNINITIALIZED) { - pkt->status = nvsp_status_failure; - return (EINVAL); + if (__predict_false(dlen < sizeof(*hdr))) { + if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); + return; } + hdr = data; - rndis_hdr = pkt->data; - - switch (rndis_hdr->ndis_msg_type) { - - /* data message */ + switch (hdr->rm_type) { case REMOTE_NDIS_PACKET_MSG: - hv_rf_receive_data(rxr, rndis_hdr, pkt); + hv_rf_receive_data(rxr, data, dlen); break; - /* completion messages */ + case REMOTE_NDIS_INITIALIZE_CMPLT: case REMOTE_NDIS_QUERY_CMPLT: case REMOTE_NDIS_SET_CMPLT: - case REMOTE_NDIS_RESET_CMPLT: - case REMOTE_NDIS_KEEPALIVE_CMPLT: - hv_rf_receive_response(rndis_dev, rndis_hdr); + case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */ + if (dlen < sizeof(*comp)) { + if_printf(rxr->hn_ifp, "invalid RNDIS cmplt\n"); + return; + } + comp = data; + + 
KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX, + ("invalid RNDIS rid 0x%08x\n", comp->rm_rid)); + vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen); break; - /* notification message */ + case REMOTE_NDIS_INDICATE_STATUS_MSG: - hv_rf_receive_indicate_status(rndis_dev, rndis_hdr); + hv_rf_receive_indicate_status(sc, data, dlen); + break; + + case REMOTE_NDIS_RESET_CMPLT: + /* + * Reset completed, no rid. + * + * NOTE: + * RESET is not issued by hn(4), so this message should + * _not_ be observed. + */ + if_printf(rxr->hn_ifp, "RESET cmplt received\n"); break; + default: - printf("hv_rf_on_receive(): Unknown msg_type 0x%x\n", - rndis_hdr->ndis_msg_type); + if_printf(rxr->hn_ifp, "unknown RNDIS msg 0x%x\n", + hdr->rm_type); break; } +} +int +hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr) +{ + size_t eaddr_len; + int error; + + eaddr_len = ETHER_ADDR_LEN; + error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0, + eaddr, &eaddr_len); + if (error) + return (error); + if (eaddr_len != ETHER_ADDR_LEN) { + if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len); + return (EINVAL); + } return (0); } -/* - * RNDIS filter query device - */ -static int -hv_rf_query_device(rndis_device *device, uint32_t oid, void *result, - uint32_t *result_size) +int +hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status) { - rndis_request *request; - uint32_t in_result_size = *result_size; - rndis_query_request *query; - rndis_query_complete *query_complete; - int ret = 0; - - *result_size = 0; - request = hv_rndis_request(device, REMOTE_NDIS_QUERY_MSG, - RNDIS_MESSAGE_SIZE(rndis_query_request)); - if (request == NULL) { - ret = -1; - goto cleanup; + size_t size; + int error; + + size = sizeof(*link_status); + error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0, + link_status, &size); + if (error) + return (error); + if (size != sizeof(uint32_t)) { + if_printf(sc->hn_ifp, "invalid link status len %zu\n", size); + return (EINVAL); } + return (0); +} 
- /* Set up the rndis query */ - query = &request->request_msg.msg.query_request; - query->oid = oid; - query->info_buffer_offset = sizeof(rndis_query_request); - query->info_buffer_length = 0; - query->device_vc_handle = 0; +static const void * +hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen, + struct hn_send_ctx *sndc, size_t *comp_len) +{ + struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT]; + int gpa_cnt, error; + bus_addr_t paddr; - if (oid == RNDIS_OID_GEN_RSS_CAPABILITIES) { - struct rndis_recv_scale_cap *cap; + KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0, + ("invalid request length %zu", reqlen)); - request->request_msg.msg_len += - sizeof(struct rndis_recv_scale_cap); - query->info_buffer_length = sizeof(struct rndis_recv_scale_cap); - cap = (struct rndis_recv_scale_cap *)((unsigned long)query + - query->info_buffer_offset); - cap->hdr.type = RNDIS_OBJECT_TYPE_RSS_CAPABILITIES; - cap->hdr.rev = RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2; - cap->hdr.size = sizeof(struct rndis_recv_scale_cap); - } + /* + * Setup the SG list. 
+ */ + paddr = vmbus_xact_req_paddr(xact); + KASSERT((paddr & PAGE_MASK) == 0, + ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr)); + for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) { + int len = PAGE_SIZE; - ret = hv_rf_send_request(device, request, REMOTE_NDIS_QUERY_MSG); - if (ret != 0) { - /* Fixme: printf added */ - printf("RNDISFILTER request failed to Send!\n"); - goto cleanup; - } + if (reqlen == 0) + break; + if (reqlen < len) + len = reqlen; - sema_wait(&request->wait_sema); + gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt; + gpa[gpa_cnt].gpa_len = len; + gpa[gpa_cnt].gpa_ofs = 0; - /* Copy the response back */ - query_complete = &request->response_msg.msg.query_complete; - - if (query_complete->info_buffer_length > in_result_size) { - ret = EINVAL; - goto cleanup; + reqlen -= len; } + KASSERT(reqlen == 0, ("still have %zu request data left", reqlen)); - memcpy(result, (void *)((unsigned long)query_complete + - query_complete->info_buffer_offset), - query_complete->info_buffer_length); - - *result_size = query_complete->info_buffer_length; - -cleanup: - if (request != NULL) - hv_put_rndis_request(device, request); - - return (ret); -} - -/* - * RNDIS filter query device MAC address - */ -static inline int -hv_rf_query_device_mac(rndis_device *device) -{ - uint32_t size = ETHER_ADDR_LEN; - - return (hv_rf_query_device(device, - RNDIS_OID_802_3_PERMANENT_ADDRESS, device->hw_mac_addr, &size)); + /* + * Send this RNDIS control message and wait for its completion + * message. 
+ */ + vmbus_xact_activate(xact); + error = hv_nv_on_send(sc->hn_prichan, HN_NVS_RNDIS_MTYPE_CTRL, sndc, + gpa, gpa_cnt); + if (error) { + vmbus_xact_deactivate(xact); + if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error); + return (NULL); + } + return (vmbus_xact_wait(xact, comp_len)); } -/* - * RNDIS filter query device link status - */ -static inline int -hv_rf_query_device_link_status(rndis_device *device) +static const void * +hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid, + size_t reqlen, size_t *comp_len0, uint32_t comp_type) { - uint32_t size = sizeof(uint32_t); - - return (hv_rf_query_device(device, - RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, &device->link_status, &size)); -} + const struct rndis_comp_hdr *comp; + size_t comp_len, min_complen = *comp_len0; -static uint8_t netvsc_hash_key[HASH_KEYLEN] = { - 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, - 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, - 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, - 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, - 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa -}; + KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid)); + KASSERT(min_complen >= sizeof(*comp), + ("invalid minimum complete len %zu", min_complen)); -/* - * RNDIS set vRSS parameters - */ -static int -hv_rf_set_rss_param(rndis_device *device, int num_queue) -{ - rndis_request *request; - rndis_set_request *set; - rndis_set_complete *set_complete; - rndis_recv_scale_param *rssp; - uint32_t extlen = sizeof(rndis_recv_scale_param) + - (4 * ITAB_NUM) + HASH_KEYLEN; - uint32_t *itab, status; - uint8_t *keyp; - int i, ret; - - - request = hv_rndis_request(device, REMOTE_NDIS_SET_MSG, - RNDIS_MESSAGE_SIZE(rndis_set_request) + extlen); - if (request == NULL) { - if (bootverbose) - printf("Netvsc: No memory to set vRSS parameters.\n"); - ret = -1; - goto cleanup; - } - - set = &request->request_msg.msg.set_request; - set->oid = RNDIS_OID_GEN_RSS_PARAMETERS; - 
set->info_buffer_length = extlen; - set->info_buffer_offset = sizeof(rndis_set_request); - set->device_vc_handle = 0; - - /* Fill out the rssp parameter structure */ - rssp = (rndis_recv_scale_param *)(set + 1); - rssp->hdr.type = RNDIS_OBJECT_TYPE_RSS_PARAMETERS; - rssp->hdr.rev = RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2; - rssp->hdr.size = sizeof(rndis_recv_scale_param); - rssp->flag = 0; - rssp->hashinfo = RNDIS_HASH_FUNC_TOEPLITZ | RNDIS_HASH_IPV4 | - RNDIS_HASH_TCP_IPV4 | RNDIS_HASH_IPV6 | RNDIS_HASH_TCP_IPV6; - rssp->indirect_tabsize = 4 * ITAB_NUM; - rssp->indirect_taboffset = sizeof(rndis_recv_scale_param); - rssp->hashkey_size = HASH_KEYLEN; - rssp->hashkey_offset = rssp->indirect_taboffset + - rssp->indirect_tabsize; - - /* Set indirection table entries */ - itab = (uint32_t *)(rssp + 1); - for (i = 0; i < ITAB_NUM; i++) - itab[i] = i % num_queue; - - /* Set hash key values */ - keyp = (uint8_t *)((unsigned long)rssp + rssp->hashkey_offset); - for (i = 0; i < HASH_KEYLEN; i++) - keyp[i] = netvsc_hash_key[i]; - - ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG); - if (ret != 0) { - goto cleanup; - } + /* + * Execute the xact setup by the caller. + */ + comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_send_ctx_none, + &comp_len); + if (comp == NULL) + return (NULL); /* - * Wait for the response from the host. Another thread will signal - * us when the response has arrived. In the failure case, - * sema_timedwait() returns a non-zero status after waiting 5 seconds. + * Check this RNDIS complete message. 
*/ - ret = sema_timedwait(&request->wait_sema, 5 * hz); - if (ret == 0) { - /* Response received, check status */ - set_complete = &request->response_msg.msg.set_complete; - status = set_complete->status; - if (status != RNDIS_STATUS_SUCCESS) { - /* Bad response status, return error */ - if (bootverbose) - printf("Netvsc: Failed to set vRSS " - "parameters.\n"); - ret = -2; + if (comp_len < min_complen) { + if (comp_len >= sizeof(*comp)) { + /* rm_status field is valid */ + if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, " + "status 0x%08x\n", comp_len, comp->rm_status); } else { - if (bootverbose) - printf("Netvsc: Successfully set vRSS " - "parameters.\n"); + if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n", + comp_len); } - } else { - /* - * We cannot deallocate the request since we may still - * receive a send completion for it. - */ - printf("Netvsc: vRSS set timeout, id = %u, ret = %d\n", - request->request_msg.msg.init_request.request_id, ret); - goto exit; + return (NULL); } - -cleanup: - if (request != NULL) { - hv_put_rndis_request(device, request); + if (comp->rm_len < min_complen) { + if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n", + comp->rm_len); + return (NULL); + } + if (comp->rm_type != comp_type) { + if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, " + "expect 0x%08x\n", comp->rm_type, comp_type); + return (NULL); + } + if (comp->rm_rid != rid) { + if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, " + "expect %u\n", comp->rm_rid, rid); + return (NULL); } -exit: - return (ret); + /* All pass! */ + *comp_len0 = comp_len; + return (comp); } -/* - * RNDIS filter set packet filter - * Sends an rndis request with the new filter, then waits for a response - * from the host. - * Returns zero on success, non-zero on failure. 
- */ static int -hv_rf_set_packet_filter(rndis_device *device, uint32_t new_filter) +hn_rndis_query(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0) { - rndis_request *request; - rndis_set_request *set; - rndis_set_complete *set_complete; - uint32_t status; - int ret; - - request = hv_rndis_request(device, REMOTE_NDIS_SET_MSG, - RNDIS_MESSAGE_SIZE(rndis_set_request) + sizeof(uint32_t)); - if (request == NULL) { - ret = -1; - goto cleanup; - } - - /* Set up the rndis set */ - set = &request->request_msg.msg.set_request; - set->oid = RNDIS_OID_GEN_CURRENT_PACKET_FILTER; - set->info_buffer_length = sizeof(uint32_t); - set->info_buffer_offset = sizeof(rndis_set_request); - memcpy((void *)((unsigned long)set + sizeof(rndis_set_request)), - &new_filter, sizeof(uint32_t)); - - ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG); - if (ret != 0) { - goto cleanup; - } - - /* - * Wait for the response from the host. Another thread will signal - * us when the response has arrived. In the failure case, - * sema_timedwait() returns a non-zero status after waiting 5 seconds. - */ - ret = sema_timedwait(&request->wait_sema, 5 * hz); - if (ret == 0) { - /* Response received, check status */ - set_complete = &request->response_msg.msg.set_complete; - status = set_complete->status; - if (status != RNDIS_STATUS_SUCCESS) { - /* Bad response status, return error */ - ret = -2; - } - } else { - /* - * We cannot deallocate the request since we may still - * receive a send completion for it. 
- */ - goto exit; - } - -cleanup: - if (request != NULL) { - hv_put_rndis_request(device, request); - } -exit: - return (ret); + return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0)); } -/* - * RNDIS filter init device - */ static int -hv_rf_init_device(rndis_device *device) +hn_rndis_query2(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0, + size_t min_odlen) { - rndis_request *request; - rndis_initialize_request *init; - rndis_initialize_complete *init_complete; - uint32_t status; - int ret; - - request = hv_rndis_request(device, REMOTE_NDIS_INITIALIZE_MSG, - RNDIS_MESSAGE_SIZE(rndis_initialize_request)); - if (!request) { - ret = -1; - goto cleanup; - } - - /* Set up the rndis set */ - init = &request->request_msg.msg.init_request; - init->major_version = RNDIS_MAJOR_VERSION; - init->minor_version = RNDIS_MINOR_VERSION; + struct rndis_query_req *req; + const struct rndis_query_comp *comp; + struct vmbus_xact *xact; + size_t reqlen, odlen = *odlen0, comp_len; + int error, ofs; + uint32_t rid; + + reqlen = sizeof(*req) + idlen; + xact = vmbus_xact_get(sc->hn_xact, reqlen); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid); + return (ENXIO); + } + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = REMOTE_NDIS_QUERY_MSG; + req->rm_len = reqlen; + req->rm_rid = rid; + req->rm_oid = oid; /* - * Per the RNDIS document, this should be set to the max MTU - * plus the header size. However, 2048 works fine, so leaving - * it as is. + * XXX + * This is _not_ RNDIS Spec conforming: + * "This MUST be set to 0 when there is no input data + * associated with the OID." + * + * If this field was set to 0 according to the RNDIS Spec, + * Hyper-V would set non-SUCCESS status in the query + * completion. 
*/ - init->max_xfer_size = 2048; - - device->state = RNDIS_DEV_INITIALIZING; + req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET; - ret = hv_rf_send_request(device, request, REMOTE_NDIS_INITIALIZE_MSG); - if (ret != 0) { - device->state = RNDIS_DEV_UNINITIALIZED; - goto cleanup; + if (idlen > 0) { + req->rm_infobuflen = idlen; + /* Input data immediately follows RNDIS query. */ + memcpy(req + 1, idata, idlen); } - sema_wait(&request->wait_sema); + comp_len = sizeof(*comp) + min_odlen; + comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, + REMOTE_NDIS_QUERY_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid); + error = EIO; + goto done; + } - init_complete = &request->response_msg.msg.init_complete; - status = init_complete->status; - if (status == RNDIS_STATUS_SUCCESS) { - device->state = RNDIS_DEV_INITIALIZED; - ret = 0; - } else { - device->state = RNDIS_DEV_UNINITIALIZED; - ret = -1; + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: " + "status 0x%08x\n", oid, comp->rm_status); + error = EIO; + goto done; + } + if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) { + /* No output data! */ + if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid); + *odlen0 = 0; + error = 0; + goto done; } -cleanup: - if (request) { - hv_put_rndis_request(device, request); + /* + * Check output data length and offset. + */ + /* ofs is the offset from the beginning of comp. */ + ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset); + if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) { + if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, " + "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen); + error = EINVAL; + goto done; } - return (ret); + /* + * Save output data. 
+ */ + if (comp->rm_infobuflen < odlen) + odlen = comp->rm_infobuflen; + memcpy(odata, ((const uint8_t *)comp) + ofs, odlen); + *odlen0 = odlen; + + error = 0; +done: + vmbus_xact_put(xact); + return (error); } -#define HALT_COMPLETION_WAIT_COUNT 25 - -/* - * RNDIS filter halt device - */ -static int -hv_rf_halt_device(rndis_device *device) +int +hn_rndis_get_rsscaps(struct hn_softc *sc, int *rxr_cnt) { - rndis_request *request; - rndis_halt_request *halt; - int i, ret; + struct ndis_rss_caps in, caps; + size_t caps_len; + int error; - /* Attempt to do a rndis device halt */ - request = hv_rndis_request(device, REMOTE_NDIS_HALT_MSG, - RNDIS_MESSAGE_SIZE(rndis_halt_request)); - if (request == NULL) { - return (-1); - } - - /* initialize "poor man's semaphore" */ - request->halt_complete_flag = 0; + *rxr_cnt = 0; - /* Set up the rndis set */ - halt = &request->request_msg.msg.halt_request; - halt->request_id = atomic_fetchadd_int(&device->new_request_id, 1); - /* Increment to get the new value (call above returns old value) */ - halt->request_id += 1; - - ret = hv_rf_send_request(device, request, REMOTE_NDIS_HALT_MSG); - if (ret != 0) { - return (-1); + memset(&in, 0, sizeof(in)); + in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS; + if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { + in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_1; + in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE_6_0; + } else { + in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2; + in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE; } + caps_len = NDIS_RSS_CAPS_SIZE; + error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES, + &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0); + if (error) + return (error); + /* - * Wait for halt response from halt callback. We must wait for - * the transaction response before freeing the request and other - * resources. + * Preliminary verification. 
*/ - for (i=HALT_COMPLETION_WAIT_COUNT; i > 0; i--) { - if (request->halt_complete_flag != 0) { - break; - } - DELAY(400); + if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) { + if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", + caps.ndis_hdr.ndis_type); + return (EINVAL); } - if (i == 0) { - return (-1); + if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) { + if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", + caps.ndis_hdr.ndis_rev); + return (EINVAL); + } + if (caps.ndis_hdr.ndis_size > caps_len) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " + "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len); + return (EINVAL); + } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", + caps.ndis_hdr.ndis_size); + return (EINVAL); } - device->state = RNDIS_DEV_UNINITIALIZED; - - hv_put_rndis_request(device, request); + if (caps.ndis_nrxr == 0) { + if_printf(sc->hn_ifp, "0 RX rings!?\n"); + return (EINVAL); + } + *rxr_cnt = caps.ndis_nrxr; + if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE) { + if (bootverbose) { + if_printf(sc->hn_ifp, "RSS indirect table size %u\n", + caps.ndis_nind); + } + } return (0); } -/* - * RNDIS filter open device - */ static int -hv_rf_open_device(rndis_device *device) +hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen) { - int ret; - - if (device->state != RNDIS_DEV_INITIALIZED) { - return (0); + struct rndis_set_req *req; + const struct rndis_set_comp *comp; + struct vmbus_xact *xact; + size_t reqlen, comp_len; + uint32_t rid; + int error; + + KASSERT(dlen > 0, ("invalid dlen %zu", dlen)); + + reqlen = sizeof(*req) + dlen; + xact = vmbus_xact_get(sc->hn_xact, reqlen); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid); + return (ENXIO); } - - if (hv_promisc_mode != 1) { - ret = hv_rf_set_packet_filter(device, - NDIS_PACKET_TYPE_BROADCAST | - NDIS_PACKET_TYPE_ALL_MULTICAST | - 
NDIS_PACKET_TYPE_DIRECTED); - } else { - ret = hv_rf_set_packet_filter(device, - NDIS_PACKET_TYPE_PROMISCUOUS); + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = REMOTE_NDIS_SET_MSG; + req->rm_len = reqlen; + req->rm_rid = rid; + req->rm_oid = oid; + req->rm_infobuflen = dlen; + req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET; + /* Data immediately follows RNDIS set. */ + memcpy(req + 1, data, dlen); + + comp_len = sizeof(*comp); + comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, + REMOTE_NDIS_SET_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid); + error = EIO; + goto done; } - if (ret == 0) { - device->state = RNDIS_DEV_DATAINITIALIZED; + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: " + "status 0x%08x\n", oid, comp->rm_status); + error = EIO; + goto done; } - - return (ret); + error = 0; +done: + vmbus_xact_put(xact); + return (error); } -/* - * RNDIS filter close device - */ static int -hv_rf_close_device(rndis_device *device) +hn_rndis_conf_offload(struct hn_softc *sc, int mtu) { - int ret; - - if (device->state != RNDIS_DEV_DATAINITIALIZED) { - return (0); - } - - ret = hv_rf_set_packet_filter(device, 0); - if (ret == 0) { - device->state = RNDIS_DEV_INITIALIZED; + struct ndis_offload hwcaps; + struct ndis_offload_params params; + uint32_t caps = 0; + size_t paramsz; + int error, tso_maxsz, tso_minsg; + + error = hn_rndis_query_hwcaps(sc, &hwcaps); + if (error) { + if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error); + return (error); } - return (ret); -} + /* NOTE: 0 means "no change" */ + memset(¶ms, 0, sizeof(params)); -/* - * RNDIS filter on device add - */ -int -hv_rf_on_device_add(struct hn_softc *sc, void *additl_info, - int nchan, struct hn_rx_ring *rxr) -{ - int ret; - netvsc_dev *net_dev; - rndis_device *rndis_dev; - nvsp_msg *init_pkt; - rndis_offload_params offloads; - struct rndis_recv_scale_cap 
rsscaps; - uint32_t rsscaps_size = sizeof(struct rndis_recv_scale_cap); - netvsc_device_info *dev_info = (netvsc_device_info *)additl_info; - device_t dev = sc->hn_dev; - - rndis_dev = hv_get_rndis_device(); - if (rndis_dev == NULL) { - return (ENOMEM); + params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT; + if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { + params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2; + paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1; + } else { + params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3; + paramsz = NDIS_OFFLOAD_PARAMS_SIZE; } + params.ndis_hdr.ndis_size = paramsz; /* - * Let the inner driver handle this first to create the netvsc channel - * NOTE! Once the channel is created, we may get a receive callback - * (hv_rf_on_receive()) before this call is completed. - * Note: Earlier code used a function pointer here. + * TSO4/TSO6 setup. */ - net_dev = hv_nv_on_device_add(sc, additl_info, rxr); - if (!net_dev) { - hv_put_rndis_device(rndis_dev); - - return (ENOMEM); + tso_maxsz = IP_MAXPACKET; + tso_minsg = 2; + if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) { + caps |= HN_CAP_TSO4; + params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON; + + if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz) + tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz; + if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg) + tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg; } - - /* - * Initialize the rndis device - */ - - net_dev->extension = rndis_dev; - rndis_dev->net_dev = net_dev; - - /* Send the rndis initialization message */ - ret = hv_rf_init_device(rndis_dev); - if (ret != 0) { - /* - * TODO: If rndis init failed, we will need to shut down - * the channel - */ + if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) && + (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) == + HN_NDIS_LSOV2_CAP_IP6) { +#ifdef notyet + caps |= HN_CAP_TSO6; + params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON; + + if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz) + 
tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz; + if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg) + tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg; +#endif } - - /* Get the mac address */ - ret = hv_rf_query_device_mac(rndis_dev); - if (ret != 0) { - /* TODO: shut down rndis device and the channel */ + sc->hn_ndis_tso_szmax = 0; + sc->hn_ndis_tso_sgmin = 0; + if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) { + KASSERT(tso_maxsz <= IP_MAXPACKET, + ("invalid NDIS TSO maxsz %d", tso_maxsz)); + KASSERT(tso_minsg >= 2, + ("invalid NDIS TSO minsg %d", tso_minsg)); + if (tso_maxsz < tso_minsg * mtu) { + if_printf(sc->hn_ifp, "invalid NDIS TSO config: " + "maxsz %d, minsg %d, mtu %d; " + "disable TSO4 and TSO6\n", + tso_maxsz, tso_minsg, mtu); + caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6); + params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF; + params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF; + } else { + sc->hn_ndis_tso_szmax = tso_maxsz; + sc->hn_ndis_tso_sgmin = tso_minsg; + if (bootverbose) { + if_printf(sc->hn_ifp, "NDIS TSO " + "szmax %d sgmin %d\n", + sc->hn_ndis_tso_szmax, + sc->hn_ndis_tso_sgmin); + } + } } - /* config csum offload and send request to host */ - memset(&offloads, 0, sizeof(offloads)); - offloads.ipv4_csum = RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; - offloads.tcp_ipv4_csum = RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; - offloads.udp_ipv4_csum = RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; - offloads.tcp_ipv6_csum = RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; - offloads.udp_ipv6_csum = RNDIS_OFFLOAD_PARAMETERS_TX_RX_ENABLED; - offloads.lso_v2_ipv4 = RNDIS_OFFLOAD_PARAMETERS_LSOV2_ENABLED; - - ret = hv_rf_send_offload_request(sc, &offloads); - if (ret != 0) { - /* TODO: shut down rndis device and the channel */ - device_printf(dev, - "hv_rf_send_offload_request failed, ret=%d\n", ret); + /* IPv4 checksum */ + if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) == + HN_NDIS_TXCSUM_CAP_IP4) { + caps |= HN_CAP_IPCS; + params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX; } - - 
memcpy(dev_info->mac_addr, rndis_dev->hw_mac_addr, ETHER_ADDR_LEN); - - hv_rf_query_device_link_status(rndis_dev); - - dev_info->link_state = rndis_dev->link_status; - - net_dev->num_channel = 1; - if (net_dev->nvsp_version < NVSP_PROTOCOL_VERSION_5 || nchan == 1) - return (0); - - memset(&rsscaps, 0, rsscaps_size); - ret = hv_rf_query_device(rndis_dev, - RNDIS_OID_GEN_RSS_CAPABILITIES, - &rsscaps, &rsscaps_size); - if ((ret != 0) || (rsscaps.num_recv_que < 2)) { - device_printf(dev, "hv_rf_query_device failed or " - "rsscaps.num_recv_que < 2 \n"); - goto out; + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) { + if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX; } - device_printf(dev, "channel, offered %u, requested %d\n", - rsscaps.num_recv_que, nchan); - if (nchan > rsscaps.num_recv_que) - nchan = rsscaps.num_recv_que; - net_dev->num_channel = nchan; - if (net_dev->num_channel == 1) { - device_printf(dev, "net_dev->num_channel == 1 under VRSS\n"); - goto out; + /* TCP4 checksum */ + if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) == + HN_NDIS_TXCSUM_CAP_TCP4) { + caps |= HN_CAP_TCP4CS; + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX; } - - /* request host to create sub channels */ - init_pkt = &net_dev->channel_init_packet; - memset(init_pkt, 0, sizeof(nvsp_msg)); - - init_pkt->hdr.msg_type = nvsp_msg5_type_subchannel; - init_pkt->msgs.vers_5_msgs.subchannel_request.op = - NVSP_SUBCHANNE_ALLOCATE; - init_pkt->msgs.vers_5_msgs.subchannel_request.num_subchannels = - net_dev->num_channel - 1; - - ret = vmbus_chan_send(sc->hn_prichan, - VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, - init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); - if (ret != 0) { - device_printf(dev, "Fail to allocate subchannel\n"); - goto out; + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) { + if (params.ndis_tcp4csum == 
NDIS_OFFLOAD_PARAM_TX) + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX; } - sema_wait(&net_dev->channel_init_sema); + /* UDP4 checksum */ + if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { + caps |= HN_CAP_UDP4CS; + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) { + if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX; + } - if (init_pkt->msgs.vers_5_msgs.subchn_complete.status != - nvsp_status_success) { - ret = ENODEV; - device_printf(dev, "sub channel complete error\n"); - goto out; + /* TCP6 checksum */ + if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) == + HN_NDIS_TXCSUM_CAP_TCP6) { + caps |= HN_CAP_TCP6CS; + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) { + if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX; } - net_dev->num_channel = 1 + - init_pkt->msgs.vers_5_msgs.subchn_complete.num_subchannels; + /* UDP6 checksum */ + if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) == + HN_NDIS_TXCSUM_CAP_UDP6) { + caps |= HN_CAP_UDP6CS; + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) { + if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX; + } - ret = hv_rf_set_rss_param(rndis_dev, net_dev->num_channel); + if (bootverbose) { + if_printf(sc->hn_ifp, "offload csum: " + "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n", + params.ndis_ip4csum, + params.ndis_tcp4csum, + params.ndis_udp4csum, + params.ndis_tcp6csum, + params.ndis_udp6csum); + if_printf(sc->hn_ifp, "offload 
lsov2: ip4 %u, ip6 %u\n", + params.ndis_lsov2_ip4, + params.ndis_lsov2_ip6); + } -out: - if (ret) - net_dev->num_channel = 1; + error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, ¶ms, paramsz); + if (error) { + if_printf(sc->hn_ifp, "offload config failed: %d\n", error); + return (error); + } - return (ret); + if (bootverbose) + if_printf(sc->hn_ifp, "offload config done\n"); + sc->hn_caps |= caps; + return (0); } -/* - * RNDIS filter on device remove - */ int -hv_rf_on_device_remove(struct hn_softc *sc, boolean_t destroy_channel) +hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags) { - netvsc_dev *net_dev = sc->net_dev; - rndis_device *rndis_dev = (rndis_device *)net_dev->extension; - int ret; - - /* Halt and release the rndis device */ - ret = hv_rf_halt_device(rndis_dev); - - hv_put_rndis_device(rndis_dev); - net_dev->extension = NULL; + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + struct ndis_rss_params *prm = &rss->rss_params; + int error; - /* Pass control to inner driver to remove the device */ - ret |= hv_nv_on_device_remove(sc, destroy_channel); + /* + * Only NDIS 6.30+ is supported. + */ + KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30, + ("NDIS 6.30+ is required, NDIS version 0x%08x", sc->hn_ndis_ver)); - return (ret); + /* + * NOTE: + * DO NOT whack rss_key and rss_ind, which are setup by the caller. 
+ */ + memset(prm, 0, sizeof(*prm)); + + prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS; + prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2; + prm->ndis_hdr.ndis_size = sizeof(*rss); + prm->ndis_flags = flags; + prm->ndis_hash = NDIS_HASH_FUNCTION_TOEPLITZ | + NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4 | + NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; + /* TODO: Take ndis_rss_caps.ndis_nind into account */ + prm->ndis_indsize = sizeof(rss->rss_ind); + prm->ndis_indoffset = + __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]); + prm->ndis_keysize = sizeof(rss->rss_key); + prm->ndis_keyoffset = + __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]); + + error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, + rss, sizeof(*rss)); + if (error) { + if_printf(sc->hn_ifp, "RSS config failed: %d\n", error); + } else { + if (bootverbose) + if_printf(sc->hn_ifp, "RSS config done\n"); + } + return (error); } -/* - * RNDIS filter on open - */ int -hv_rf_on_open(struct hn_softc *sc) +hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter) { - netvsc_dev *net_dev = sc->net_dev; + int error; - return (hv_rf_open_device((rndis_device *)net_dev->extension)); + error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER, + &filter, sizeof(filter)); + if (error) { + if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n", + filter, error); + } else { + if (bootverbose) { + if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n", + filter); + } + } + return (error); } -/* - * RNDIS filter on close - */ -int -hv_rf_on_close(struct hn_softc *sc) +static int +hn_rndis_init(struct hn_softc *sc) { - netvsc_dev *net_dev = sc->net_dev; + struct rndis_init_req *req; + const struct rndis_init_comp *comp; + struct vmbus_xact *xact; + size_t comp_len; + uint32_t rid; + int error; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS init\n"); + return (ENXIO); + } + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = 
REMOTE_NDIS_INITIALIZE_MSG; + req->rm_len = sizeof(*req); + req->rm_rid = rid; + req->rm_ver_major = RNDIS_VERSION_MAJOR; + req->rm_ver_minor = RNDIS_VERSION_MINOR; + req->rm_max_xfersz = HN_RNDIS_XFER_SIZE; + + comp_len = RNDIS_INIT_COMP_SIZE_MIN; + comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len, + REMOTE_NDIS_INITIALIZE_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS init failed\n"); + error = EIO; + goto done; + } - return (hv_rf_close_device((rndis_device *)net_dev->extension)); + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n", + comp->rm_status); + error = EIO; + goto done; + } + if (bootverbose) { + if_printf(sc->hn_ifp, "RNDIS ver %u.%u, pktsz %u, pktcnt %u, " + "align %u\n", comp->rm_ver_major, comp->rm_ver_minor, + comp->rm_pktmaxsz, comp->rm_pktmaxcnt, + 1U << comp->rm_align); + } + error = 0; +done: + vmbus_xact_put(xact); + return (error); } -/* - * RNDIS filter on send request completion callback - */ -static void -hv_rf_on_send_request_completion(struct vmbus_channel *chan __unused, - void *context __unused) +static int +hn_rndis_halt(struct hn_softc *sc) { + struct vmbus_xact *xact; + struct rndis_halt_req *halt; + struct hn_send_ctx sndc; + size_t comp_len; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS halt\n"); + return (ENXIO); + } + halt = vmbus_xact_req_data(xact); + halt->rm_type = REMOTE_NDIS_HALT_MSG; + halt->rm_len = sizeof(*halt); + halt->rm_rid = hn_rndis_rid(sc); + + /* No RNDIS completion; rely on NVS message send completion */ + hn_send_ctx_init_simple(&sndc, hn_nvs_sent_xact, xact); + hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len); + + vmbus_xact_put(xact); + if (bootverbose) + if_printf(sc->hn_ifp, "RNDIS halt done\n"); + return (0); } -/* - * RNDIS filter on send request (halt only) completion callback - */ -static void 
-hv_rf_on_send_request_halt_completion(struct vmbus_channel *chan __unused, - void *context) +static int +hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps) +{ + struct ndis_offload in; + size_t caps_len, size; + int error; + + memset(&in, 0, sizeof(in)); + in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD; + if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3; + size = NDIS_OFFLOAD_SIZE; + } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2; + size = NDIS_OFFLOAD_SIZE_6_1; + } else { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1; + size = NDIS_OFFLOAD_SIZE_6_0; + } + in.ndis_hdr.ndis_size = size; + + caps_len = NDIS_OFFLOAD_SIZE; + error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, + &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0); + if (error) + return (error); + + /* + * Preliminary verification. + */ + if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) { + if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", + caps->ndis_hdr.ndis_type); + return (EINVAL); + } + if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) { + if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", + caps->ndis_hdr.ndis_rev); + return (EINVAL); + } + if (caps->ndis_hdr.ndis_size > caps_len) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " + "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len); + return (EINVAL); + } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", + caps->ndis_hdr.ndis_size); + return (EINVAL); + } + + if (bootverbose) { + /* + * NOTE: + * caps->ndis_hdr.ndis_size MUST be checked before accessing + * NDIS 6.1+ specific fields. 
+ */ + if_printf(sc->hn_ifp, "hwcaps rev %u\n", + caps->ndis_hdr.ndis_rev); + + if_printf(sc->hn_ifp, "hwcaps csum: " + "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, " + "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n", + caps->ndis_csum.ndis_ip4_txcsum, + caps->ndis_csum.ndis_ip4_txenc, + caps->ndis_csum.ndis_ip4_rxcsum, + caps->ndis_csum.ndis_ip4_rxenc, + caps->ndis_csum.ndis_ip6_txcsum, + caps->ndis_csum.ndis_ip6_txenc, + caps->ndis_csum.ndis_ip6_rxcsum, + caps->ndis_csum.ndis_ip6_rxenc); + if_printf(sc->hn_ifp, "hwcaps lsov2: " + "ip4 maxsz %u minsg %u encap 0x%x, " + "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n", + caps->ndis_lsov2.ndis_ip4_maxsz, + caps->ndis_lsov2.ndis_ip4_minsg, + caps->ndis_lsov2.ndis_ip4_encap, + caps->ndis_lsov2.ndis_ip6_maxsz, + caps->ndis_lsov2.ndis_ip6_minsg, + caps->ndis_lsov2.ndis_ip6_encap, + caps->ndis_lsov2.ndis_ip6_opts); + } + return (0); +} + +int +hn_rndis_attach(struct hn_softc *sc, int mtu) { - rndis_request *request = context; + int error; + + /* + * Initialize RNDIS. + */ + error = hn_rndis_init(sc); + if (error) + return (error); /* - * Notify hv_rf_halt_device() about halt completion. - * The halt code must wait for completion before freeing - * the transaction resources. + * Configure NDIS offload settings. + * XXX no offloading, if error happened? */ - request->halt_complete_flag = 1; + hn_rndis_conf_offload(sc, mtu); + return (0); +} + +void +hn_rndis_detach(struct hn_softc *sc) +{ + + /* Halt the RNDIS. 
*/ + hn_rndis_halt(sc); } void hv_rf_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { - netvsc_channel_rollup(rxr, txr); + hn_chan_rollup(rxr, txr); } diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.h b/sys/dev/hyperv/netvsc/hv_rndis_filter.h index 2f940db..3ecda3b 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.h +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.h @@ -33,96 +33,16 @@ #include <sys/param.h> #include <net/ethernet.h> - -/* - * Defines - */ - -/* Destroy or preserve channel on filter/netvsc teardown */ -#define HV_RF_NV_DESTROY_CHANNEL TRUE -#define HV_RF_NV_RETAIN_CHANNEL FALSE - -/* - * Number of page buffers to reserve for the RNDIS filter packet in the - * transmitted message. - */ -#define HV_RF_NUM_TX_RESERVED_PAGE_BUFS 1 - - -/* - * Data types - */ - -typedef enum { - RNDIS_DEV_UNINITIALIZED = 0, - RNDIS_DEV_INITIALIZING, - RNDIS_DEV_INITIALIZED, - RNDIS_DEV_DATAINITIALIZED, -} rndis_device_state; - -typedef struct rndis_request_ { - STAILQ_ENTRY(rndis_request_) mylist_entry; - struct sema wait_sema; - - /* - * The max response size is sizeof(rndis_msg) + PAGE_SIZE. - * - * XXX - * This is ugly and should be cleaned up once we busdma-fy - * RNDIS request bits. - */ - rndis_msg response_msg; - uint8_t buf_resp[PAGE_SIZE]; - - /* Simplify allocation by having a netvsc packet inline */ - netvsc_packet pkt; - - /* - * The max request size is sizeof(rndis_msg) + PAGE_SIZE. - * - * NOTE: - * This is required for the large request like RSS settings. - * - * XXX - * This is ugly and should be cleaned up once we busdma-fy - * RNDIS request bits. - */ - rndis_msg request_msg; - uint8_t buf_req[PAGE_SIZE]; - - /* Fixme: Poor man's semaphore. 
*/ - uint32_t halt_complete_flag; -} rndis_request; - -typedef struct rndis_device_ { - netvsc_dev *net_dev; - - rndis_device_state state; - uint32_t link_status; - uint32_t new_request_id; - - struct mtx req_lock; - - STAILQ_HEAD(RQ, rndis_request_) myrequest_list; - - uint8_t hw_mac_addr[ETHER_ADDR_LEN]; -} rndis_device; +#include <dev/hyperv/netvsc/if_hnvar.h> /* * Externs */ -struct hn_softc; struct hn_rx_ring; -int hv_rf_on_receive(netvsc_dev *net_dev, - struct hn_rx_ring *rxr, netvsc_packet *pkt); -void hv_rf_receive_rollup(netvsc_dev *net_dev); +void hv_rf_on_receive(struct hn_softc *sc, struct hn_rx_ring *rxr, + const void *data, int dlen); void hv_rf_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr); -int hv_rf_on_device_add(struct hn_softc *sc, void *additl_info, int nchan, - struct hn_rx_ring *rxr); -int hv_rf_on_device_remove(struct hn_softc *sc, boolean_t destroy_channel); -int hv_rf_on_open(struct hn_softc *sc); -int hv_rf_on_close(struct hn_softc *sc); #endif /* __HV_RNDIS_FILTER_H__ */ diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h b/sys/dev/hyperv/netvsc/if_hnreg.h new file mode 100644 index 0000000..53f59ec --- /dev/null +++ b/sys/dev/hyperv/netvsc/if_hnreg.h @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IF_HNREG_H_ +#define _IF_HNREG_H_ + +#include <sys/param.h> +#include <sys/systm.h> + +/* + * NDIS protocol version numbers + */ +#define HN_NDIS_VERSION_6_1 0x00060001 +#define HN_NDIS_VERSION_6_30 0x0006001e +#define HN_NDIS_VERSION_MAJOR(ver) (((ver) & 0xffff0000) >> 16) +#define HN_NDIS_VERSION_MINOR(ver) ((ver) & 0xffff) + +/* + * NVS versions. + */ +#define HN_NVS_VERSION_1 0x00002 +#define HN_NVS_VERSION_2 0x30002 +#define HN_NVS_VERSION_4 0x40000 +#define HN_NVS_VERSION_5 0x50000 + +#define HN_NVS_RXBUF_SIG 0xcafe +#define HN_NVS_CHIM_SIG 0xface + +#define HN_NVS_CHIM_IDX_INVALID 0xffffffff + +#define HN_NVS_RNDIS_MTYPE_DATA 0 +#define HN_NVS_RNDIS_MTYPE_CTRL 1 + +/* + * NVS message transacion status codes. + */ +#define HN_NVS_STATUS_OK 1 +#define HN_NVS_STATUS_FAILED 2 + +/* + * NVS request/response message types. 
+ */ +#define HN_NVS_TYPE_INIT 1 +#define HN_NVS_TYPE_INIT_RESP 2 +#define HN_NVS_TYPE_NDIS_INIT 100 +#define HN_NVS_TYPE_RXBUF_CONN 101 +#define HN_NVS_TYPE_RXBUF_CONNRESP 102 +#define HN_NVS_TYPE_RXBUF_DISCONN 103 +#define HN_NVS_TYPE_CHIM_CONN 104 +#define HN_NVS_TYPE_CHIM_CONNRESP 105 +#define HN_NVS_TYPE_CHIM_DISCONN 106 +#define HN_NVS_TYPE_RNDIS 107 +#define HN_NVS_TYPE_RNDIS_ACK 108 +#define HN_NVS_TYPE_NDIS_CONF 125 +#define HN_NVS_TYPE_VFASSOC_NOTE 128 /* notification */ +#define HN_NVS_TYPE_SET_DATAPATH 129 +#define HN_NVS_TYPE_SUBCH_REQ 133 +#define HN_NVS_TYPE_SUBCH_RESP 133 /* same as SUBCH_REQ */ +#define HN_NVS_TYPE_TXTBL_NOTE 134 /* notification */ + +/* + * Any size less than this one will _not_ work, e.g. hn_nvs_init + * only has 12B valid data, however, if only 12B data were sent, + * Hypervisor would never reply. + */ +#define HN_NVS_REQSIZE_MIN 32 + +/* NVS message common header */ +struct hn_nvs_hdr { + uint32_t nvs_type; +} __packed; + +struct hn_nvs_init { + uint32_t nvs_type; /* HN_NVS_TYPE_INIT */ + uint32_t nvs_ver_min; + uint32_t nvs_ver_max; + uint8_t nvs_rsvd[20]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_init) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_init_resp { + uint32_t nvs_type; /* HN_NVS_TYPE_INIT_RESP */ + uint32_t nvs_ver; /* deprecated */ + uint32_t nvs_rsvd; + uint32_t nvs_status; /* HN_NVS_STATUS_ */ +} __packed; + +/* No reponse */ +struct hn_nvs_ndis_conf { + uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_CONF */ + uint32_t nvs_mtu; + uint32_t nvs_rsvd; + uint64_t nvs_caps; /* HN_NVS_NDIS_CONF_ */ + uint8_t nvs_rsvd1[12]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_ndis_conf) >= HN_NVS_REQSIZE_MIN); + +#define HN_NVS_NDIS_CONF_SRIOV 0x0004 +#define HN_NVS_NDIS_CONF_VLAN 0x0008 + +/* No response */ +struct hn_nvs_ndis_init { + uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_INIT */ + uint32_t nvs_ndis_major; /* NDIS_VERSION_MAJOR_ */ + uint32_t nvs_ndis_minor; /* NDIS_VERSION_MINOR_ */ + uint8_t nvs_rsvd[20]; +} __packed; 
+CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rxbuf_conn { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONN */ + uint32_t nvs_gpadl; /* RXBUF vmbus GPADL */ + uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ + uint8_t nvs_rsvd[22]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rxbuf_conn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rxbuf_sect { + uint32_t nvs_start; + uint32_t nvs_slotsz; + uint32_t nvs_slotcnt; + uint32_t nvs_end; +} __packed; + +struct hn_nvs_rxbuf_connresp { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONNRESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_nsect; /* # of elem in nvs_sect */ + struct hn_nvs_rxbuf_sect nvs_sect[]; +} __packed; + +/* No response */ +struct hn_nvs_rxbuf_disconn { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_DISCONN */ + uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ + uint8_t nvs_rsvd[26]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rxbuf_disconn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_chim_conn { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONN */ + uint32_t nvs_gpadl; /* chimney buf vmbus GPADL */ + uint16_t nvs_sig; /* NDIS_NVS_CHIM_SIG */ + uint8_t nvs_rsvd[22]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_chim_conn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_chim_connresp { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONNRESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_sectsz; /* section size */ +} __packed; + +/* No response */ +struct hn_nvs_chim_disconn { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_DISCONN */ + uint16_t nvs_sig; /* HN_NVS_CHIM_SIG */ + uint8_t nvs_rsvd[26]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_chim_disconn) >= HN_NVS_REQSIZE_MIN); + +#define HN_NVS_SUBCH_OP_ALLOC 1 + +struct hn_nvs_subch_req { + uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_REQ */ + uint32_t nvs_op; /* HN_NVS_SUBCH_OP_ */ + uint32_t nvs_nsubch; + uint8_t nvs_rsvd[20]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_subch_req) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_subch_resp { + uint32_t 
nvs_type; /* HN_NVS_TYPE_SUBCH_RESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_nsubch; +} __packed; + +struct hn_nvs_rndis { + uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS */ + uint32_t nvs_rndis_mtype;/* HN_NVS_RNDIS_MTYPE_ */ + /* + * Chimney sending buffer index and size. + * + * NOTE: + * If nvs_chim_idx is set to HN_NVS_CHIM_IDX_INVALID + * and nvs_chim_sz is set to 0, then chimney sending + * buffer is _not_ used by this RNDIS message. + */ + uint32_t nvs_chim_idx; + uint32_t nvs_chim_sz; + uint8_t nvs_rsvd[16]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rndis) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rndis_ack { + uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS_ACK */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint8_t nvs_rsvd[24]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN); + +/* + * RNDIS extension + */ + +/* Per-packet hash info */ +#define HN_NDIS_HASH_INFO_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHINF NDIS_PKTINFO_TYPE_ORIG_NBLIST +/* NDIS_HASH_ */ + +/* Per-packet hash value */ +#define HN_NDIS_HASH_VALUE_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHVAL NDIS_PKTINFO_TYPE_PKT_CANCELID + +/* Per-packet-info size */ +#define HN_RNDIS_PKTINFO_SIZE(dlen) \ + __offsetof(struct rndis_pktinfo, rm_data[dlen]) + +#endif /* !_IF_HNREG_H_ */ diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h new file mode 100644 index 0000000..517d281 --- /dev/null +++ b/sys/dev/hyperv/netvsc/if_hnvar.h @@ -0,0 +1,145 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IF_HNVAR_H_ +#define _IF_HNVAR_H_ + +#include <sys/param.h> + +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/netvsc/if_hnreg.h> + +struct hn_softc; + +struct vmbus_channel; +struct hn_send_ctx; + +typedef void (*hn_sent_callback_t) + (struct hn_send_ctx *, struct hn_softc *, + struct vmbus_channel *, const void *, int); + +struct hn_send_ctx { + hn_sent_callback_t hn_cb; + void *hn_cbarg; + uint32_t hn_chim_idx; + int hn_chim_sz; +}; + +struct rndis_hash_info; +struct rndix_hash_value; +struct ndis_8021q_info_; +struct rndis_tcp_ip_csum_info_; + +#define HN_NDIS_VLAN_INFO_INVALID 0xffffffff +#define HN_NDIS_RXCSUM_INFO_INVALID 0 +#define HN_NDIS_HASH_INFO_INVALID 0 + +struct hn_recvinfo { + uint32_t vlan_info; + uint32_t csum_info; + uint32_t hash_info; + uint32_t hash_value; +}; + +#define HN_SEND_CTX_INITIALIZER(cb, cbarg) \ +{ \ + .hn_cb = cb, \ + .hn_cbarg = cbarg, \ + .hn_chim_idx = HN_NVS_CHIM_IDX_INVALID, \ + .hn_chim_sz = 0 \ +} + +static __inline void 
+hn_send_ctx_init(struct hn_send_ctx *sndc, hn_sent_callback_t cb, + void *cbarg, uint32_t chim_idx, int chim_sz) +{ + + sndc->hn_cb = cb; + sndc->hn_cbarg = cbarg; + sndc->hn_chim_idx = chim_idx; + sndc->hn_chim_sz = chim_sz; +} + +static __inline void +hn_send_ctx_init_simple(struct hn_send_ctx *sndc, hn_sent_callback_t cb, + void *cbarg) +{ + + hn_send_ctx_init(sndc, cb, cbarg, HN_NVS_CHIM_IDX_INVALID, 0); +} + +static __inline int +hn_nvs_send(struct vmbus_channel *chan, uint16_t flags, + void *nvs_msg, int nvs_msglen, struct hn_send_ctx *sndc) +{ + + return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags, + nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); +} + +static __inline int +hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, + void *nvs_msg, int nvs_msglen, struct hn_send_ctx *sndc) +{ + + return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen, + (uint64_t)(uintptr_t)sndc)); +} + +struct vmbus_xact; +struct rndis_packet_msg; + +uint32_t hn_chim_alloc(struct hn_softc *sc); +void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx); + +int hn_rndis_attach(struct hn_softc *sc, int mtu); +void hn_rndis_detach(struct hn_softc *sc); +int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags); +void *hn_rndis_pktinfo_append(struct rndis_packet_msg *, + size_t pktsize, size_t pi_dlen, uint32_t pi_type); +int hn_rndis_get_rsscaps(struct hn_softc *sc, int *rxr_cnt); +int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr); +int hn_rndis_get_linkstatus(struct hn_softc *sc, + uint32_t *link_status); +/* filter: NDIS_PACKET_TYPE_ or 0. 
*/ +int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter); + +int hn_nvs_attach(struct hn_softc *sc, int mtu); +void hn_nvs_detach(struct hn_softc *sc); +int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch); +void hn_nvs_sent_xact(struct hn_send_ctx *sndc, struct hn_softc *sc, + struct vmbus_channel *chan, const void *data, int dlen); + +int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, + const struct hn_recvinfo *info); +void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr); +void hn_link_status_update(struct hn_softc *sc); + +extern struct hn_send_ctx hn_send_ctx_none; + +#endif /* !_IF_HNVAR_H_ */ diff --git a/sys/dev/hyperv/netvsc/ndis.h b/sys/dev/hyperv/netvsc/ndis.h new file mode 100644 index 0000000..fed262d --- /dev/null +++ b/sys/dev/hyperv/netvsc/ndis.h @@ -0,0 +1,386 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_NDIS_H_ +#define _NET_NDIS_H_ + +#define NDIS_MEDIA_STATE_CONNECTED 0 +#define NDIS_MEDIA_STATE_DISCONNECTED 1 + +#define NDIS_OFFLOAD_SET_NOCHG 0 +#define NDIS_OFFLOAD_SET_ON 1 +#define NDIS_OFFLOAD_SET_OFF 2 + +/* a.k.a GRE MAC */ +#define NDIS_ENCAP_TYPE_NVGRE 0x00000001 + +#define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */ +#define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */ + +/* hash function */ +#define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001 + +/* hash type */ +#define NDIS_HASH_IPV4 0x00000100 +#define NDIS_HASH_TCP_IPV4 0x00000200 +#define NDIS_HASH_IPV6 0x00000400 +#define NDIS_HASH_IPV6_EX 0x00000800 +#define NDIS_HASH_TCP_IPV6 0x00001000 +#define NDIS_HASH_TCP_IPV6_EX 0x00002000 + +#define NDIS_HASH_KEYSIZE_TOEPLITZ 40 +#define NDIS_HASH_INDCNT 128 + +#define NDIS_OBJTYPE_DEFAULT 0x80 +#define NDIS_OBJTYPE_RSS_CAPS 0x88 +#define NDIS_OBJTYPE_RSS_PARAMS 0x89 +#define NDIS_OBJTYPE_OFFLOAD 0xa7 + +struct ndis_object_hdr { + uint8_t ndis_type; /* NDIS_OBJTYPE_ */ + uint8_t ndis_rev; /* type specific */ + uint16_t ndis_size; /* incl. 
this hdr */ +}; + +/* + * OID_TCP_OFFLOAD_PARAMETERS + * ndis_type: NDIS_OBJTYPE_DEFAULT + */ +struct ndis_offload_params { + struct ndis_object_hdr ndis_hdr; + uint8_t ndis_ip4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_tcp4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_udp4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_tcp6csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_udp6csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_lsov1; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_ipsecv1; /* NDIS_OFFLOAD_IPSECV1_ */ + uint8_t ndis_lsov2_ip4; /* NDIS_OFFLOAD_LSOV2_ */ + uint8_t ndis_lsov2_ip6; /* NDIS_OFFLOAD_LSOV2_ */ + uint8_t ndis_tcp4conn; /* 0 */ + uint8_t ndis_tcp6conn; /* 0 */ + uint32_t ndis_flags; /* 0 */ + /* NDIS >= 6.1 */ + uint8_t ndis_ipsecv2; /* NDIS_OFFLOAD_IPSECV2_ */ + uint8_t ndis_ipsecv2_ip4;/* NDIS_OFFLOAD_IPSECV2_ */ + /* NDIS >= 6.30 */ + uint8_t ndis_rsc_ip4; /* NDIS_OFFLOAD_RSC_ */ + uint8_t ndis_rsc_ip6; /* NDIS_OFFLOAD_RSC_ */ + uint8_t ndis_encap; /* NDIS_OFFLOAD_SET_ */ + uint8_t ndis_encap_types;/* NDIS_ENCAP_TYPE_ */ +}; + +#define NDIS_OFFLOAD_PARAMS_SIZE sizeof(struct ndis_offload_params) +#define NDIS_OFFLOAD_PARAMS_SIZE_6_1 \ + __offsetof(struct ndis_offload_params, ndis_rsc_ip4) + +#define NDIS_OFFLOAD_PARAMS_REV_2 2 /* NDIS 6.1 */ +#define NDIS_OFFLOAD_PARAMS_REV_3 3 /* NDIS 6.30 */ + +#define NDIS_OFFLOAD_PARAM_NOCHG 0 /* common */ +#define NDIS_OFFLOAD_PARAM_OFF 1 +#define NDIS_OFFLOAD_PARAM_TX 2 +#define NDIS_OFFLOAD_PARAM_RX 3 +#define NDIS_OFFLOAD_PARAM_TXRX 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_LSOV1_OFF 1 +#define NDIS_OFFLOAD_LSOV1_ON 2 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_IPSECV1_OFF 1 +#define NDIS_OFFLOAD_IPSECV1_AH 2 +#define NDIS_OFFLOAD_IPSECV1_ESP 3 +#define NDIS_OFFLOAD_IPSECV1_AH_ESP 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_LSOV2_OFF 1 +#define NDIS_OFFLOAD_LSOV2_ON 2 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_IPSECV2_OFF 1 +#define 
NDIS_OFFLOAD_IPSECV2_AH 2 +#define NDIS_OFFLOAD_IPSECV2_ESP 3 +#define NDIS_OFFLOAD_IPSECV2_AH_ESP 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_RSC_OFF 1 +#define NDIS_OFFLOAD_RSC_ON 2 + +/* + * OID_GEN_RECEIVE_SCALE_CAPABILITIES + * ndis_type: NDIS_OBJTYPE_RSS_CAPS + */ +struct ndis_rss_caps { + struct ndis_object_hdr ndis_hdr; + uint32_t ndis_flags; /* NDIS_RSS_CAP_ */ + uint32_t ndis_nmsi; /* # of MSIs */ + uint32_t ndis_nrxr; /* # of RX rings */ + /* NDIS >= 6.30 */ + uint16_t ndis_nind; /* # of indtbl ent. */ + uint16_t ndis_pad; +}; + +#define NDIS_RSS_CAPS_SIZE \ + __offsetof(struct ndis_rss_caps, ndis_pad) +#define NDIS_RSS_CAPS_SIZE_6_0 \ + __offsetof(struct ndis_rss_caps, ndis_nind) + +#define NDIS_RSS_CAPS_REV_1 1 /* NDIS 6.{0,1,20} */ +#define NDIS_RSS_CAPS_REV_2 2 /* NDIS 6.30 */ + +#define NDIS_RSS_CAP_MSI 0x01000000 +#define NDIS_RSS_CAP_CLASSIFY_ISR 0x02000000 +#define NDIS_RSS_CAP_CLASSIFY_DPC 0x04000000 +#define NDIS_RSS_CAP_MSIX 0x08000000 +#define NDIS_RSS_CAP_IPV4 0x00000100 +#define NDIS_RSS_CAP_IPV6 0x00000200 +#define NDIS_RSS_CAP_IPV6_EX 0x00000400 +#define NDIS_RSS_CAP_HASH_TOEPLITZ 0x00000001 + +/* + * OID_GEN_RECEIVE_SCALE_PARAMETERS + * ndis_type: NDIS_OBJTYPE_RSS_PARAMS + */ +struct ndis_rss_params { + struct ndis_object_hdr ndis_hdr; + uint16_t ndis_flags; /* NDIS_RSS_FLAG_ */ + uint16_t ndis_bcpu; /* base cpu 0 */ + uint32_t ndis_hash; /* NDIS_HASH_ */ + uint16_t ndis_indsize; /* indirect table */ + uint32_t ndis_indoffset; + uint16_t ndis_keysize; /* hash key */ + uint32_t ndis_keyoffset; + /* NDIS >= 6.20 */ + uint32_t ndis_cpumaskoffset; + uint32_t ndis_cpumaskcnt; + uint32_t ndis_cpumaskentsz; +}; + +#define NDIS_RSS_PARAMS_SIZE sizeof(struct ndis_rss_params) +#define NDIS_RSS_PARAMS_SIZE_6_0 \ + __offsetof(struct ndis_rss_params, ndis_cpumaskoffset) + +#define NDIS_RSS_PARAMS_REV_1 1 /* NDIS 6.0 */ +#define NDIS_RSS_PARAMS_REV_2 2 /* NDIS 6.20 */ + +#define NDIS_RSS_FLAG_NONE 0x0000 +#define NDIS_RSS_FLAG_BCPU_UNCHG 
0x0001 +#define NDIS_RSS_FLAG_HASH_UNCHG 0x0002 +#define NDIS_RSS_FLAG_IND_UNCHG 0x0004 +#define NDIS_RSS_FLAG_KEY_UNCHG 0x0008 +#define NDIS_RSS_FLAG_DISABLE 0x0010 + +/* non-standard convenient struct */ +struct ndis_rssprm_toeplitz { + struct ndis_rss_params rss_params; + /* Toeplitz hash key */ + uint8_t rss_key[NDIS_HASH_KEYSIZE_TOEPLITZ]; + /* Indirect table */ + uint32_t rss_ind[NDIS_HASH_INDCNT]; +}; + +/* + * OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES + * ndis_type: NDIS_OBJTYPE_OFFLOAD + */ + +#define NDIS_OFFLOAD_ENCAP_NONE 0x0000 +#define NDIS_OFFLOAD_ENCAP_NULL 0x0001 +#define NDIS_OFFLOAD_ENCAP_8023 0x0002 +#define NDIS_OFFLOAD_ENCAP_8023PQ 0x0004 +#define NDIS_OFFLOAD_ENCAP_8023PQ_OOB 0x0008 +#define NDIS_OFFLOAD_ENCAP_RFC1483 0x0010 + +struct ndis_csum_offload { + uint32_t ndis_ip4_txenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_txcsum; +#define NDIS_TXCSUM_CAP_IP4OPT 0x001 +#define NDIS_TXCSUM_CAP_TCP4OPT 0x004 +#define NDIS_TXCSUM_CAP_TCP4 0x010 +#define NDIS_TXCSUM_CAP_UDP4 0x040 +#define NDIS_TXCSUM_CAP_IP4 0x100 + uint32_t ndis_ip4_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_rxcsum; +#define NDIS_RXCSUM_CAP_IP4OPT 0x001 +#define NDIS_RXCSUM_CAP_TCP4OPT 0x004 +#define NDIS_RXCSUM_CAP_TCP4 0x010 +#define NDIS_RXCSUM_CAP_UDP4 0x040 +#define NDIS_RXCSUM_CAP_IP4 0x100 + uint32_t ndis_ip6_txenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_txcsum; +#define NDIS_TXCSUM_CAP_IP6EXT 0x001 +#define NDIS_TXCSUM_CAP_TCP6OPT 0x004 +#define NDIS_TXCSUM_CAP_TCP6 0x010 +#define NDIS_TXCSUM_CAP_UDP6 0x040 + uint32_t ndis_ip6_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_rxcsum; +#define NDIS_RXCSUM_CAP_IP6EXT 0x001 +#define NDIS_RXCSUM_CAP_TCP6OPT 0x004 +#define NDIS_RXCSUM_CAP_TCP6 0x010 +#define NDIS_RXCSUM_CAP_UDP6 0x040 +}; + +struct ndis_lsov1_offload { + uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_maxsize; + uint32_t ndis_minsegs; + uint32_t ndis_opts; +}; + +struct ndis_ipsecv1_offload { + uint32_t ndis_encap; 
/*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ah_esp; + uint32_t ndis_xport_tun; + uint32_t ndis_ip4_opts; + uint32_t ndis_flags; + uint32_t ndis_ip4_ah; + uint32_t ndis_ip4_esp; +}; + +struct ndis_lsov2_offload { + uint32_t ndis_ip4_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_maxsz; + uint32_t ndis_ip4_minsg; + uint32_t ndis_ip6_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_maxsz; + uint32_t ndis_ip6_minsg; + uint32_t ndis_ip6_opts; +#define NDIS_LSOV2_CAP_IP6EXT 0x001 +#define NDIS_LSOV2_CAP_TCP6OPT 0x004 +}; + +struct ndis_ipsecv2_offload { + uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint16_t ndis_ip6; + uint16_t ndis_ip4opt; + uint16_t ndis_ip6ext; + uint16_t ndis_ah; + uint16_t ndis_esp; + uint16_t ndis_ah_esp; + uint16_t ndis_xport; + uint16_t ndis_tun; + uint16_t ndis_xport_tun; + uint16_t ndis_lso; + uint16_t ndis_extseq; + uint32_t ndis_udp_esp; + uint32_t ndis_auth; + uint32_t ndis_crypto; + uint32_t ndis_sa_caps; +}; + +struct ndis_rsc_offload { + uint16_t ndis_ip4; + uint16_t ndis_ip6; +}; + +struct ndis_encap_offload { + uint32_t ndis_flags; + uint32_t ndis_maxhdr; +}; + +struct ndis_offload { + struct ndis_object_hdr ndis_hdr; + struct ndis_csum_offload ndis_csum; + struct ndis_lsov1_offload ndis_lsov1; + struct ndis_ipsecv1_offload ndis_ipsecv1; + struct ndis_lsov2_offload ndis_lsov2; + uint32_t ndis_flags; + /* NDIS >= 6.1 */ + struct ndis_ipsecv2_offload ndis_ipsecv2; + /* NDIS >= 6.30 */ + struct ndis_rsc_offload ndis_rsc; + struct ndis_encap_offload ndis_encap_gre; +}; + +#define NDIS_OFFLOAD_SIZE sizeof(struct ndis_offload) +#define NDIS_OFFLOAD_SIZE_6_0 \ + __offsetof(struct ndis_offload, ndis_ipsecv2) +#define NDIS_OFFLOAD_SIZE_6_1 \ + __offsetof(struct ndis_offload, ndis_rsc) + +#define NDIS_OFFLOAD_REV_1 1 /* NDIS 6.0 */ +#define NDIS_OFFLOAD_REV_2 2 /* NDIS 6.1 */ +#define NDIS_OFFLOAD_REV_3 3 /* NDIS 6.30 */ + +/* + * Per-packet-info + */ + +/* VLAN */ +#define NDIS_VLAN_INFO_SIZE sizeof(uint32_t) +#define 
NDIS_VLAN_INFO_PRI_MASK 0x0007 +#define NDIS_VLAN_INFO_CFI_MASK 0x0008 +#define NDIS_VLAN_INFO_ID_MASK 0xfff0 +#define NDIS_VLAN_INFO_MAKE(id, pri, cfi) \ + (((pri) & NDIS_VLAN_INFO_PRI_MASK) | \ + (((cfi) & 0x1) << 3) | (((id) & 0xfff) << 4)) +#define NDIS_VLAN_INFO_ID(inf) (((inf) & NDIS_VLAN_INFO_ID_MASK) >> 4) +#define NDIS_VLAN_INFO_CFI(inf) (((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3) +#define NDIS_VLAN_INFO_PRI(inf) ((inf) & NDIS_VLAN_INFO_PRI_MASK) + +/* Reception checksum */ +#define NDIS_RXCSUM_INFO_SIZE sizeof(uint32_t) +#define NDIS_RXCSUM_INFO_TCPCS_FAILED 0x0001 +#define NDIS_RXCSUM_INFO_UDPCS_FAILED 0x0002 +#define NDIS_RXCSUM_INFO_IPCS_FAILED 0x0004 +#define NDIS_RXCSUM_INFO_TCPCS_OK 0x0008 +#define NDIS_RXCSUM_INFO_UDPCS_OK 0x0010 +#define NDIS_RXCSUM_INFO_IPCS_OK 0x0020 +#define NDIS_RXCSUM_INFO_LOOPBACK 0x0040 +#define NDIS_RXCSUM_INFO_TCPCS_INVAL 0x0080 +#define NDIS_RXCSUM_INFO_IPCS_INVAL 0x0100 + +/* LSOv2 */ +#define NDIS_LSO2_INFO_SIZE sizeof(uint32_t) +#define NDIS_LSO2_INFO_MSS_MASK 0x000fffff +#define NDIS_LSO2_INFO_THOFF_MASK 0x3ff00000 +#define NDIS_LSO2_INFO_ISLSO2 0x40000000 +#define NDIS_LSO2_INFO_ISIPV6 0x80000000 + +#define NDIS_LSO2_INFO_MAKE(thoff, mss) \ + ((((uint32_t)(mss)) & NDIS_LSO2_INFO_MSS_MASK) | \ + ((((uint32_t)(thoff)) & 0x3ff) << 20) | \ + NDIS_LSO2_INFO_ISLSO2) + +#define NDIS_LSO2_INFO_MAKEIPV4(thoff, mss) \ + NDIS_LSO2_INFO_MAKE((thoff), (mss)) + +#define NDIS_LSO2_INFO_MAKEIPV6(thoff, mss) \ + (NDIS_LSO2_INFO_MAKE((thoff), (mss)) | NDIS_LSO2_INFO_ISIPV6) + +/* Transmission checksum */ +#define NDIS_TXCSUM_INFO_SIZE sizeof(uint32_t) +#define NDIS_TXCSUM_INFO_IPV4 0x00000001 +#define NDIS_TXCSUM_INFO_IPV6 0x00000002 +#define NDIS_TXCSUM_INFO_TCPCS 0x00000004 +#define NDIS_TXCSUM_INFO_UDPCS 0x00000008 +#define NDIS_TXCSUM_INFO_IPCS 0x00000010 +#define NDIS_TXCSUM_INFO_THOFF 0x03ff0000 + +#endif /* !_NET_NDIS_H_ */ diff --git a/sys/dev/hyperv/stordisengage/hv_ata_pci_disengage.c 
b/sys/dev/hyperv/stordisengage/hv_ata_pci_disengage.c deleted file mode 100644 index 8022026..0000000 --- a/sys/dev/hyperv/stordisengage/hv_ata_pci_disengage.c +++ /dev/null @@ -1,157 +0,0 @@ -/*- - * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer, - * without modification, immediately at the beginning of the file. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -/*- - * Copyright (c) 2009-2013 Microsoft Corp. - * Copyright (c) 2012 NetApp Inc. - * Copyright (c) 2012 Citrix Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/module.h> -#include <sys/ata.h> -#include <sys/bus.h> -#include <sys/conf.h> -#include <sys/malloc.h> -#include <sys/sema.h> -#include <sys/taskqueue.h> -#include <vm/uma.h> -#include <machine/stdarg.h> -#include <machine/resource.h> -#include <machine/bus.h> -#include <sys/rman.h> -#include <dev/pci/pcivar.h> -#include <dev/pci/pcireg.h> -#include <dev/ata/ata-all.h> -#include <dev/ata/ata-pci.h> -#include <ata_if.h> - -/* prototypes */ -static int hv_ata_pci_probe(device_t dev); -static int hv_ata_pci_attach(device_t dev); -static int hv_ata_pci_detach(device_t dev); - -/* - * generic PCI ATA device probe - */ -static int -hv_ata_pci_probe(device_t dev) -{ - device_t parent = device_get_parent(dev); - int ata_disk_enable; - - ata_disk_enable = 0; - - /* - * Don't probe if not running in a Hyper-V environment - */ - if (vm_guest != VM_GUEST_HV) - return (ENXIO); - - if (device_get_unit(parent) != 0 || device_get_ivars(dev) != 0) - return (ENXIO); - - /* - * On Hyper-V the default is to use the enlightened driver for - * IDE disks. However, if the user wishes to use the native - * ATA driver, the environment variable - * hw_ata.disk_enable must be explicitly set to 1. 
- */ - if (getenv_int("hw.ata.disk_enable", &ata_disk_enable)) { - if (bootverbose) - device_printf(dev, - "hw.ata.disk_enable flag is disabling Hyper-V" - " ATA driver support\n"); - return (ENXIO); - } - - device_set_desc(dev, "Hyper-V ATA storage disengage driver"); - - return (BUS_PROBE_DEFAULT); -} - -static int -hv_ata_pci_attach(device_t dev) -{ - - return (0); -} - -static int -hv_ata_pci_detach(device_t dev) -{ - - return (0); -} - -static device_method_t hv_ata_pci_methods[] = { - /* device interface */ - DEVMETHOD(device_probe, hv_ata_pci_probe), - DEVMETHOD(device_attach, hv_ata_pci_attach), - DEVMETHOD(device_detach, hv_ata_pci_detach), - DEVMETHOD(device_shutdown, bus_generic_shutdown), - - DEVMETHOD_END -}; - -devclass_t hv_ata_pci_devclass; - -static driver_t hv_ata_pci_disengage_driver = { - "ata", - hv_ata_pci_methods, - 0, -}; - -DRIVER_MODULE(atapci_dis, atapci, hv_ata_pci_disengage_driver, - hv_ata_pci_devclass, NULL, NULL); -MODULE_VERSION(atapci_dis, 1); -MODULE_DEPEND(atapci_dis, ata, 1, 1, 1); diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index 3264225..33cbed7 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/sema.h> #include <sys/sglist.h> +#include <sys/eventhandler.h> #include <machine/bus.h> #include <sys/bus_dma.h> @@ -75,11 +76,9 @@ __FBSDID("$FreeBSD$"); #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/vmbus.h> - #include "hv_vstorage.h" #include "vmbus_if.h" -#define STORVSC_RINGBUFFER_SIZE (20*PAGE_SIZE) #define STORVSC_MAX_LUNS_PER_TARGET (64) #define STORVSC_MAX_IO_REQUESTS (STORVSC_MAX_LUNS_PER_TARGET * 2) #define BLKVSC_MAX_IDE_DISKS_PER_TARGET (1) @@ -121,8 +120,6 @@ struct hv_sgl_page_pool{ boolean_t is_init; } g_hv_sgl_page_pool; -#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * 
STORVSC_DATA_SEGCNT_MAX - enum storvsc_request_type { WRITE_TYPE, READ_TYPE, @@ -130,17 +127,35 @@ enum storvsc_request_type { }; SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, - "Hyper-V storage interface"); + "Hyper-V storage interface"); + +static u_int hv_storvsc_use_win8ext_flags = 1; +SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW, + &hv_storvsc_use_win8ext_flags, 0, + "Use win8 extension flags or not"); static u_int hv_storvsc_use_pim_unmapped = 1; -SYSCTL_INT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN, - &hv_storvsc_use_pim_unmapped, 0, - "Optimize storvsc by using unmapped I/O"); +SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN, + &hv_storvsc_use_pim_unmapped, 0, + "Optimize storvsc by using unmapped I/O"); + +static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE); +SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN, + &hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size"); + +static u_int hv_storvsc_max_io = 512; +SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN, + &hv_storvsc_max_io, 0, "Hyper-V storage max io limit"); + +#define STORVSC_MAX_IO \ + vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size, \ + STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE) struct hv_storvsc_sysctl { u_long data_bio_cnt; u_long data_vaddr_cnt; u_long data_sg_cnt; + u_long chan_send_cnt[MAXCPU]; }; struct storvsc_gpa_range { @@ -184,10 +199,19 @@ struct storvsc_softc { device_t hs_dev; bus_dma_tag_t storvsc_req_dtag; struct hv_storvsc_sysctl sysctl_data; - - struct vmbus_channel *hs_cpu2chan[MAXCPU]; + uint32_t hs_nchan; + struct vmbus_channel *hs_sel_chan[MAXCPU]; }; +static eventhandler_tag storvsc_handler_tag; +/* + * The size of the vmscsi_request has changed in win8. The + * additional size is for the newly added elements in the + * structure. These elements are valid only when we are talking + * to a win8 host. + * Track the correct size we need to apply. 
+ */ +static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension); /** * HyperV storvsc timeout testing cases: @@ -211,7 +235,7 @@ struct storvsc_driver_props { char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; - uint8_t drv_max_ios_per_target; + uint32_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; @@ -240,10 +264,10 @@ static const struct hyperv_guid gBlkVscDeviceType={ static struct storvsc_driver_props g_drv_props_table[] = { {"blkvsc", "Hyper-V IDE Storage Interface", BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS, - STORVSC_RINGBUFFER_SIZE}, + 20*PAGE_SIZE}, {"storvsc", "Hyper-V SCSI Storage Interface", STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS, - STORVSC_RINGBUFFER_SIZE} + 20*PAGE_SIZE} }; /* @@ -253,14 +277,6 @@ static struct storvsc_driver_props g_drv_props_table[] = { static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; /* - * The size of the vmscsi_request has changed in win8. The - * additional size is for the newly added elements in the - * structure. These elements are valid only when we are talking - * to a win8 host. - * Track the correct size we need to apply. - */ -static int vmscsi_size_delta; -/* * The storage protocol version is determined during the * initial exchange with the host. It will indicate which * storage functionality is available in the host. @@ -413,6 +429,9 @@ storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_chans) return; } + /* Update channel count */ + sc->hs_nchan = request_channels_cnt + 1; + /* Wait for sub-channels setup to complete. 
*/ subchan = vmbus_subchan_get(sc->hs_chan, request_channels_cnt); @@ -585,7 +604,6 @@ hv_storvsc_channel_init(struct storvsc_softc *sc) */ if (support_multichannel) storvsc_send_multichannel_request(sc, max_chans); - cleanup: sema_destroy(&request->synch_sema); return (ret); @@ -624,7 +642,6 @@ hv_storvsc_connect_vsp(struct storvsc_softc *sc) } ret = hv_storvsc_channel_init(sc); - return (ret); } @@ -686,7 +703,7 @@ hv_storvsc_io_request(struct storvsc_softc *sc, { struct vstor_packet *vstor_packet = &request->vstor_packet; struct vmbus_channel* outgoing_channel = NULL; - int ret = 0; + int ret = 0, ch_sel; vstor_packet->flags |= REQUEST_COMPLETION_FLAG; @@ -700,7 +717,8 @@ hv_storvsc_io_request(struct storvsc_softc *sc, vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; - outgoing_channel = sc->hs_cpu2chan[curcpu]; + ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan; + outgoing_channel = sc->hs_sel_chan[ch_sel]; mtx_unlock(&request->softc->hs_lock); if (request->prp_list.gpa_range.gpa_len) { @@ -712,6 +730,10 @@ hv_storvsc_io_request(struct storvsc_softc *sc, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); } + /* statistic for successful request sending on each channel */ + if (!ret) { + sc->sysctl_data.chan_send_cnt[ch_sel]++; + } mtx_lock(&request->softc->hs_lock); if (ret != 0) { @@ -878,21 +900,15 @@ hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc) static int storvsc_probe(device_t dev) { - int ata_disk_enable = 0; int ret = ENXIO; switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) - device_printf(dev, "DRIVER_BLKVSC-Emulated ATA/IDE probe\n"); - if (!getenv_int("hw.ata.disk_enable", &ata_disk_enable)) { - if(bootverbose) - device_printf(dev, - "Enlightened ATA/IDE detected\n"); - device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); - ret = BUS_PROBE_DEFAULT; - } else if(bootverbose) - device_printf(dev, "Emulated ATA/IDE 
set (hw.ata.disk_enable set)\n"); + device_printf(dev, + "Enlightened ATA/IDE detected\n"); + device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); + ret = BUS_PROBE_DEFAULT; break; case DRIVER_STORVSC: if(bootverbose) @@ -907,17 +923,20 @@ storvsc_probe(device_t dev) } static void -storvsc_create_cpu2chan(struct storvsc_softc *sc) +storvsc_create_chan_sel(struct storvsc_softc *sc) { - int cpu; + struct vmbus_channel **subch; + int i, nsubch; - CPU_FOREACH(cpu) { - sc->hs_cpu2chan[cpu] = vmbus_chan_cpu2chan(sc->hs_chan, cpu); - if (bootverbose) { - device_printf(sc->hs_dev, "cpu%d -> chan%u\n", - cpu, vmbus_chan_id(sc->hs_cpu2chan[cpu])); - } - } + sc->hs_sel_chan[0] = sc->hs_chan; + nsubch = sc->hs_nchan - 1; + if (nsubch == 0) + return; + + subch = vmbus_subchan_get(sc->hs_chan, nsubch); + for (i = 0; i < nsubch; i++) + sc->hs_sel_chan[i + 1] = subch[i]; + vmbus_subchan_rel(subch, nsubch); } static int @@ -977,7 +996,10 @@ storvsc_sysctl(device_t dev) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; + struct sysctl_oid *ch_tree, *chid_tree; struct storvsc_softc *sc; + char name[16]; + int i; sc = device_get_softc(dev); ctx = device_get_sysctl_ctx(dev); @@ -989,6 +1011,28 @@ storvsc_sysctl(device_t dev) &sc->sysctl_data.data_vaddr_cnt, "# of vaddr data block"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW, &sc->sysctl_data.data_sg_cnt, "# of sg data block"); + + /* dev.storvsc.UNIT.channel */ + ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + if (ch_tree == NULL) + return; + + for (i = 0; i < sc->hs_nchan; i++) { + uint32_t ch_id; + + ch_id = vmbus_chan_id(sc->hs_sel_chan[i]); + snprintf(name, sizeof(name), "%d", ch_id); + /* dev.storvsc.UNIT.channel.CHID */ + chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + if (chid_tree == NULL) + return; + /* dev.storvsc.UNIT.channel.CHID.send_req */ + 
SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, + "send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i], + "# of request sending from this channel"); + } } /** @@ -1019,6 +1063,7 @@ storvsc_attach(device_t dev) root_mount_token = root_mount_hold("storvsc"); sc = device_get_softc(dev); + sc->hs_nchan = 1; sc->hs_chan = vmbus_get_channel(dev); stor_type = storvsc_get_storage_type(dev); @@ -1030,7 +1075,14 @@ storvsc_attach(device_t dev) /* fill in driver specific properties */ sc->hs_drv_props = &g_drv_props_table[stor_type]; - + sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size; + sc->hs_drv_props->drv_max_ios_per_target = + MIN(STORVSC_MAX_IO, hv_storvsc_max_io); + if (bootverbose) { + printf("storvsc ringbuffer size: %d, max_io: %d\n", + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_max_ios_per_target); + } /* fill in device specific properties */ sc->hs_unit = device_get_unit(dev); sc->hs_dev = dev; @@ -1052,7 +1104,7 @@ storvsc_attach(device_t dev) * STORVSC_DATA_SEGCNT_MAX segments, each * segment has one page buffer */ - for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) { + for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) { sgl_node = malloc(sizeof(struct hv_sgl_node), M_DEVBUF, M_WAITOK|M_ZERO); @@ -1083,7 +1135,7 @@ storvsc_attach(device_t dev) } /* Construct cpu to channel mapping */ - storvsc_create_cpu2chan(sc); + storvsc_create_chan_sel(sc); /* * Create the device queue. 
@@ -1840,19 +1892,37 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) csio->cdb_len); } + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60; + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DISABLE_SYNCH_TRANSFER; + } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_OUT: - reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DATA_OUT; + } break; case CAM_DIR_IN: reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DATA_IN; + } break; case CAM_DIR_NONE: reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_NO_DATA_TRANSFER; + } break; default: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; + printf("Error: unexpected data direction: 0x%x\n", + ccb->ccb_h.flags & CAM_DIR_MASK); + return (EINVAL); } reqp->sense_data = &csio->sense_data; @@ -2081,27 +2151,45 @@ storvsc_io_done(struct hv_storvsc_request *reqp) ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (vm_srb->scsi_status == SCSI_STATUS_OK) { const struct scsi_generic *cmd; - + cmd = (const struct scsi_generic *) + ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? + csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); if (vm_srb->srb_status != SRB_STATUS_SUCCESS) { - if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) { - xpt_print(ccb->ccb_h.path, "invalid LUN %d\n", - vm_srb->lun); - } else { - xpt_print(ccb->ccb_h.path, "Unknown SRB flag: %d\n", - vm_srb->srb_status); - } /* * If there are errors, for example, invalid LUN, * host will inform VM through SRB status. 
*/ - ccb->ccb_h.status |= CAM_SEL_TIMEOUT; + if (bootverbose) { + if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) { + xpt_print(ccb->ccb_h.path, + "invalid LUN %d for op: %s\n", + vm_srb->lun, + scsi_op_desc(cmd->opcode, NULL)); + } else { + xpt_print(ccb->ccb_h.path, + "Unknown SRB flag: %d for op: %s\n", + vm_srb->srb_status, + scsi_op_desc(cmd->opcode, NULL)); + } + } + + /* + * XXX For a selection timeout, all of the LUNs + * on the target will be gone. It works for SCSI + * disks, but does not work for IDE disks. + * + * For CAM_DEV_NOT_THERE, CAM will only get + * rid of the device(s) specified by the path. + */ + if (storvsc_get_storage_type(sc->hs_dev) == + DRIVER_STORVSC) + ccb->ccb_h.status |= CAM_SEL_TIMEOUT; + else + ccb->ccb_h.status |= CAM_DEV_NOT_THERE; } else { ccb->ccb_h.status |= CAM_REQ_CMP; } - cmd = (const struct scsi_generic *) - ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? - csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); if (cmd->opcode == INQUIRY) { struct scsi_inquiry_data *inq_data = (struct scsi_inquiry_data *)csio->data_ptr; @@ -2122,7 +2210,7 @@ storvsc_io_done(struct hv_storvsc_request *reqp) resp_buf[3], resp_buf[4]); } if (vm_srb->srb_status == SRB_STATUS_SUCCESS && - data_len > SHORT_INQUIRY_LENGTH) { + data_len >= SHORT_INQUIRY_LENGTH) { char vendor[16]; cam_strvis(vendor, inq_data->vendor, @@ -2213,3 +2301,58 @@ storvsc_get_storage_type(device_t dev) return DRIVER_STORVSC; return DRIVER_UNKNOWN; } + +#define PCI_VENDOR_INTEL 0x8086 +#define PCI_PRODUCT_PIIX4 0x7111 + +static void +storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path, + struct ata_params *ident_buf __unused, int *veto) +{ + + /* + * The ATA disks are shared with the controllers managed + * by this driver, so veto the ATA disks' attachment; the + * ATA disks will be attached as SCSI disks once this driver + * attached. 
+ */ + if (path->device->protocol == PROTO_ATA) { + struct ccb_pathinq cpi; + + bzero(&cpi, sizeof(cpi)); + xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE); + cpi.ccb_h.func_code = XPT_PATH_INQ; + xpt_action((union ccb *)&cpi); + if (cpi.ccb_h.status == CAM_REQ_CMP && + cpi.hba_vendor == PCI_VENDOR_INTEL && + cpi.hba_device == PCI_PRODUCT_PIIX4) { + (*veto)++; + if (bootverbose) { + xpt_print(path, + "Disable ATA disks on " + "simulated ATA controller (0x%04x%04x)\n", + cpi.hba_device, cpi.hba_vendor); + } + } + } +} + +static void +storvsc_sysinit(void *arg __unused) +{ + if (vm_guest == VM_GUEST_HV) { + storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto, + storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY); + } +} +SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit, + NULL); + +static void +storvsc_sysuninit(void *arg __unused) +{ + if (storvsc_handler_tag != NULL) + EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag); +} +SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, + storvsc_sysuninit, NULL); diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h index 9205e35..33e7df3 100644 --- a/sys/dev/hyperv/storvsc/hv_vstorage.h +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -253,6 +253,22 @@ struct vstor_packet { #define SRB_STATUS_AUTOSENSE_VALID 0x80 #define SRB_STATUS_INVALID_LUN 0X20 +/* + * SRB Flag Bits + */ + +#define SRB_FLAGS_QUEUE_ACTION_ENABLE 0x00000002 +#define SRB_FLAGS_DISABLE_DISCONNECT 0x00000004 +#define SRB_FLAGS_DISABLE_SYNCH_TRANSFER 0x00000008 +#define SRB_FLAGS_BYPASS_FROZEN_QUEUE 0x00000010 +#define SRB_FLAGS_DISABLE_AUTOSENSE 0x00000020 +#define SRB_FLAGS_DATA_IN 0x00000040 +#define SRB_FLAGS_DATA_OUT 0x00000080 +#define SRB_FLAGS_NO_DATA_TRANSFER 0x00000000 +#define SRB_FLAGS_UNSPECIFIED_DIRECTION (SRB_FLAGS_DATA_IN | SRB_FLAGS_DATA_OUT) +#define SRB_FLAGS_NO_QUEUE_FREEZE 0x00000100 +#define SRB_FLAGS_ADAPTER_CACHE_ENABLE 0x00000200 +#define 
SRB_FLAGS_FREE_SENSE_BUFFER 0x00000400 /** * Packet flags */ diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c index be0f3e6..3786714 100644 --- a/sys/dev/hyperv/utilities/hv_heartbeat.c +++ b/sys/dev/hyperv/utilities/hv_heartbeat.c @@ -22,99 +22,106 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + #include <sys/param.h> -#include <sys/kernel.h> #include <sys/bus.h> -#include <sys/malloc.h> +#include <sys/kernel.h> #include <sys/module.h> -#include <sys/timetc.h> -#include <sys/syscallsubr.h> #include <sys/systm.h> #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/vmbus.h> -#include <dev/hyperv/utilities/hv_utilreg.h> -#include "hv_util.h" +#include <dev/hyperv/utilities/hv_util.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> + #include "vmbus_if.h" -/* Heartbeat Service */ -static const struct hyperv_guid service_guid = { .hv_guid = - {0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, - 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} }; +static const struct vmbus_ic_desc vmbus_heartbeat_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, + 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} }, + .ic_desc = "Hyper-V Heartbeat" + }, + VMBUS_IC_DESC_END +}; -/** - * Process heartbeat message - */ static void -hv_heartbeat_cb(struct vmbus_channel *channel, void *context) +vmbus_heartbeat_cb(struct vmbus_channel *chan, void *xsc) { - uint8_t* buf; - int recvlen; - uint64_t requestid; - int ret; - - struct hv_vmbus_heartbeat_msg_data* heartbeat_msg; - struct hv_vmbus_icmsg_hdr* icmsghdrp; - hv_util_sc *softc; - - softc = (hv_util_sc*)context; - buf = softc->receive_buffer;; - - recvlen = PAGE_SIZE; - ret = vmbus_chan_recv(channel, buf, 
&recvlen, &requestid); - KASSERT(ret != ENOBUFS, ("hvheartbeat recvbuf is not large enough")); - /* XXX check recvlen to make sure that it contains enough data */ - - if ((ret == 0) && recvlen > 0) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - heartbeat_msg = - (struct hv_vmbus_heartbeat_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - heartbeat_msg->seq_num += 1; - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - vmbus_chan_send(channel, VMBUS_CHANPKT_TYPE_INBAND, 0, - buf, recvlen, requestid); + struct hv_util_sc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + int dlen, error; + uint64_t xactid; + void *data; + + /* + * Receive request. + */ + data = sc->receive_buffer; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; + + /* + * Update request, which will be echoed back as response. + */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen); + if (error) + return; + break; + + case VMBUS_ICMSG_TYPE_HEARTBEAT: + /* Only ic_seq is a must */ + if (dlen < VMBUS_ICMSG_HEARTBEAT_SIZE_MIN) { + device_printf(sc->ic_dev, "invalid heartbeat len %d\n", + dlen); + return; + } + ((struct vmbus_icmsg_heartbeat *)data)->ic_seq++; + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; } + + /* + * Send response by echoing the updated request back. 
+ */ + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); } static int hv_heartbeat_probe(device_t dev) { - if (resource_disabled("hvheartbeat", 0)) - return ENXIO; - if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &service_guid) == 0) { - device_set_desc(dev, "Hyper-V Heartbeat Service"); - return BUS_PROBE_DEFAULT; - } - return ENXIO; + return (vmbus_ic_probe(dev, vmbus_heartbeat_descs)); } static int hv_heartbeat_attach(device_t dev) { - hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); - - softc->callback = hv_heartbeat_cb; - return hv_util_attach(dev); + return (hv_util_attach(dev, vmbus_heartbeat_cb)); } static device_method_t heartbeat_methods[] = { diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index c6d3533..797451e 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -54,17 +54,13 @@ __FBSDID("$FreeBSD$"); #include <sys/un.h> #include <sys/endian.h> #include <sys/_null.h> +#include <sys/sema.h> #include <sys/signal.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/mutex.h> -#include <net/if.h> -#include <net/if_arp.h> -#include <net/if_var.h> - #include <dev/hyperv/include/hyperv.h> -#include <dev/hyperv/netvsc/hv_net_vsc.h> #include <dev/hyperv/utilities/hv_utilreg.h> #include "hv_util.h" @@ -91,9 +87,15 @@ static int hv_kvp_log = 0; log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ } while (0) -static const struct hyperv_guid service_guid = { .hv_guid = - {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, - 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6} }; +static const struct vmbus_ic_desc vmbus_kvp_descs[] = { + { + .ic_guid = { .hv_guid = { + 0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, + 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6 } }, + .ic_desc = "Hyper-V KVP" + }, + VMBUS_IC_DESC_END +}; /* 
character device prototypes */ static d_open_t hv_kvp_dev_open; @@ -215,10 +217,9 @@ hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, * hv_kvp - version neogtiation function */ static void -hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, - struct hv_vmbus_icmsg_negotiate *negop, - uint8_t *buf) +hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, uint8_t *buf) { + struct hv_vmbus_icmsg_negotiate *negop; int icframe_vercnt; int icmsg_vercnt; @@ -331,24 +332,25 @@ hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, if (devclass_get_devices(devclass_find("hn"), &devs, &devcnt) == 0) { for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) { - /* XXX access other driver's softc? are you kidding? */ device_t dev = devs[devcnt]; - struct hn_softc *sc = device_get_softc(dev); struct vmbus_channel *chan; char buf[HYPERV_GUID_STRLEN]; + int n; - /* - * Trying to find GUID of Network Device - * TODO: need vmbus interface. - */ chan = vmbus_get_channel(dev); - hyperv_guid2str(vmbus_chan_guid_inst(chan), - buf, sizeof(buf)); + n = hyperv_guid2str(vmbus_chan_guid_inst(chan), buf, + sizeof(buf)); - if (strncmp(buf, (char *)umsg->body.kvp_ip_val.adapter_id, - HYPERV_GUID_STRLEN - 1) == 0) { + /* + * The string in the 'kvp_ip_val.adapter_id' has + * braces around the GUID; skip the leading brace + * in 'kvp_ip_val.adapter_id'. 
+ */ + if (strncmp(buf, + ((char *)&umsg->body.kvp_ip_val.adapter_id) + 1, + n) == 0) { strlcpy((char *)umsg->body.kvp_ip_val.adapter_id, - sc->hn_ifp->if_xname, MAX_ADAPTER_ID_SIZE); + device_get_nameunit(dev), MAX_ADAPTER_ID_SIZE); break; } } @@ -629,7 +631,7 @@ hv_kvp_process_request(void *context, int pending) kvp_buf = sc->util_sc.receive_buffer;; channel = vmbus_get_channel(sc->dev); - recvlen = 2 * PAGE_SIZE; + recvlen = sc->util_sc.ic_buflen; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ @@ -641,7 +643,7 @@ hv_kvp_process_request(void *context, int pending) hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); + hv_kvp_negotiate_version(icmsghdrp, kvp_buf); hv_kvp_respond_host(sc, ret); /* @@ -696,7 +698,7 @@ hv_kvp_process_request(void *context, int pending) /* * Try reading next buffer */ - recvlen = 2 * PAGE_SIZE; + recvlen = sc->util_sc.ic_buflen; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ @@ -873,14 +875,8 @@ hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) static int hv_kvp_probe(device_t dev) { - if (resource_disabled("hvkvp", 0)) - return ENXIO; - if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &service_guid) == 0) { - device_set_desc(dev, "Hyper-V KVP Service"); - return BUS_PROBE_DEFAULT; - } - return ENXIO; + return (vmbus_ic_probe(dev, vmbus_kvp_descs)); } static int @@ -892,7 +888,6 @@ hv_kvp_attach(device_t dev) hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); - sc->util_sc.callback = hv_kvp_callback; sc->dev = dev; sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); mtx_init(&sc->pending_mutex, "hv-kvp pending 
mutex", @@ -920,7 +915,7 @@ hv_kvp_attach(device_t dev) return (error); sc->hv_kvp_dev->si_drv1 = sc; - return hv_util_attach(dev); + return hv_util_attach(dev, hv_kvp_callback); } static int diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c index 352df91..4580093 100644 --- a/sys/dev/hyperv/utilities/hv_shutdown.c +++ b/sys/dev/hyperv/utilities/hv_shutdown.c @@ -22,121 +22,121 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -/* - * A common driver for all hyper-V util services. - */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); #include <sys/param.h> -#include <sys/kernel.h> #include <sys/bus.h> -#include <sys/malloc.h> +#include <sys/kernel.h> #include <sys/module.h> #include <sys/reboot.h> -#include <sys/timetc.h> -#include <sys/syscallsubr.h> #include <sys/systm.h> #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/vmbus.h> -#include <dev/hyperv/utilities/hv_utilreg.h> -#include "hv_util.h" +#include <dev/hyperv/utilities/hv_util.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> + #include "vmbus_if.h" -static const struct hyperv_guid service_guid = { .hv_guid = - {0x31, 0x60, 0x0B, 0X0E, 0x13, 0x52, 0x34, 0x49, - 0x81, 0x8B, 0x38, 0XD9, 0x0C, 0xED, 0x39, 0xDB} }; +static const struct vmbus_ic_desc vmbus_shutdown_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x31, 0x60, 0x0b, 0x0e, 0x13, 0x52, 0x34, 0x49, + 0x81, 0x8b, 0x38, 0xd9, 0x0c, 0xed, 0x39, 0xdb } }, + .ic_desc = "Hyper-V Shutdown" + }, + VMBUS_IC_DESC_END +}; -/** - * Shutdown - */ static void -hv_shutdown_cb(struct vmbus_channel *channel, void *context) +vmbus_shutdown_cb(struct vmbus_channel *chan, void *xsc) { - uint8_t* buf; - uint8_t execute_shutdown = 0; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recv_len; - uint64_t request_id; - int ret; - 
hv_vmbus_shutdown_msg_data* shutdown_msg; - hv_util_sc *softc; - - softc = (hv_util_sc*)context; - buf = softc->receive_buffer;; - - recv_len = PAGE_SIZE; - ret = vmbus_chan_recv(channel, buf, &recv_len, &request_id); - KASSERT(ret != ENOBUFS, ("hvshutdown recvbuf is not large enough")); - /* XXX check recv_len to make sure that it contains enough data */ - - if ((ret == 0) && recv_len > 0) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, buf); - - } else { - shutdown_msg = - (struct hv_vmbus_shutdown_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - switch (shutdown_msg->flags) { - case 0: - case 1: - icmsghdrp->status = HV_S_OK; - execute_shutdown = 1; - if(bootverbose) - printf("Shutdown request received -" - " graceful shutdown initiated\n"); - break; - default: - icmsghdrp->status = HV_E_FAIL; - execute_shutdown = 0; - printf("Shutdown request received -" - " Invalid request\n"); - break; - } - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; - - vmbus_chan_send(channel, VMBUS_CHANPKT_TYPE_INBAND, 0, - buf, recv_len, request_id); + struct hv_util_sc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + struct vmbus_icmsg_shutdown *msg; + int dlen, error, do_shutdown = 0; + uint64_t xactid; + void *data; + + /* + * Receive request. + */ + data = sc->receive_buffer; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; + + /* + * Update request, which will be echoed back as response. 
+ */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen); + if (error) + return; + break; + + case VMBUS_ICMSG_TYPE_SHUTDOWN: + if (dlen < VMBUS_ICMSG_SHUTDOWN_SIZE_MIN) { + device_printf(sc->ic_dev, "invalid shutdown len %d\n", + dlen); + return; + } + msg = data; + + /* XXX ic_flags definition? */ + if (msg->ic_haltflags == 0 || msg->ic_haltflags == 1) { + device_printf(sc->ic_dev, "shutdown requested\n"); + hdr->ic_status = VMBUS_ICMSG_STATUS_OK; + do_shutdown = 1; + } else { + device_printf(sc->ic_dev, "unknown shutdown flags " + "0x%08x\n", msg->ic_haltflags); + hdr->ic_status = VMBUS_ICMSG_STATUS_FAIL; + } + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; } - if (execute_shutdown) - shutdown_nice(RB_POWEROFF); + /* + * Send response by echoing the updated request back. + */ + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); + + if (do_shutdown) + shutdown_nice(RB_POWEROFF); } static int hv_shutdown_probe(device_t dev) { - if (resource_disabled("hvshutdown", 0)) - return ENXIO; - if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &service_guid) == 0) { - device_set_desc(dev, "Hyper-V Shutdown Service"); - return BUS_PROBE_DEFAULT; - } - return ENXIO; + return (vmbus_ic_probe(dev, vmbus_shutdown_descs)); } static int hv_shutdown_attach(device_t dev) { - hv_util_sc *softc = (hv_util_sc*)device_get_softc(dev); - - softc->callback = hv_shutdown_cb; - return hv_util_attach(dev); + return (hv_util_attach(dev, vmbus_shutdown_cb)); } static device_method_t shutdown_methods[] = { diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c index 7ac4fb6..2d44026 100644 --- a/sys/dev/hyperv/utilities/hv_timesync.c +++ b/sys/dev/hyperv/utilities/hv_timesync.c @@ -22,197 
+22,199 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -/* - * A common driver for all hyper-V util services. - */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); #include <sys/param.h> -#include <sys/kernel.h> #include <sys/bus.h> -#include <sys/malloc.h> +#include <sys/kernel.h> #include <sys/module.h> -#include <sys/reboot.h> -#include <sys/timetc.h> #include <sys/syscallsubr.h> +#include <sys/sysctl.h> #include <sys/systm.h> -#include <sys/taskqueue.h> #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/vmbus.h> -#include <dev/hyperv/utilities/hv_utilreg.h> -#include "hv_util.h" +#include <dev/hyperv/utilities/hv_util.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> + #include "vmbus_if.h" -#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ -#define HV_ICTIMESYNCFLAG_PROBE 0 -#define HV_ICTIMESYNCFLAG_SYNC 1 -#define HV_ICTIMESYNCFLAG_SAMPLE 2 -#define HV_NANO_SEC_PER_SEC 1000000000 - -/* Time Sync data */ -typedef struct { - uint64_t data; -} time_sync_data; - - /* Time Synch Service */ -static const struct hyperv_guid service_guid = {.hv_guid = - {0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, - 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }; - -struct hv_ictimesync_data { - uint64_t parenttime; - uint64_t childtime; - uint64_t roundtriptime; - uint8_t flags; -} __packed; - -typedef struct hv_timesync_sc { - hv_util_sc util_sc; - struct task task; - time_sync_data time_msg; -} hv_timesync_sc; - -/** - * Set host time based on time sync message from host - */ -static void -hv_set_host_time(void *context, int pending) -{ - hv_timesync_sc *softc = (hv_timesync_sc*)context; - uint64_t hosttime = softc->time_msg.data; - struct timespec guest_ts, host_ts; - uint64_t host_tns; - int64_t diff; - int error; +static const struct vmbus_ic_desc 
vmbus_timesync_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, + 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }, + .ic_desc = "Hyper-V Timesync" + }, + VMBUS_IC_DESC_END +}; - host_tns = (hosttime - HV_WLTIMEDELTA) * 100; - host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); - host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); +SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + "Hyper-V timesync interface"); - nanotime(&guest_ts); +static int vmbus_ts_ignore_sync = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, CTLFLAG_RWTUN, + &vmbus_ts_ignore_sync, 0, "Ignore the sync request."); - diff = (int64_t)host_ts.tv_sec - (int64_t)guest_ts.tv_sec; +/* + * Trigger sample sync when drift exceeds threshold (ms). + * Ignore the sample request when set to 0. + */ +static int vmbus_ts_sample_thresh = 100; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN, + &vmbus_ts_sample_thresh, 0, + "Threshold that makes sample request trigger the sync (unit: ms)."); - /* - * If host differs by 5 seconds then make the guest catch up - */ - if (diff > 5 || diff < -5) { - error = kern_clock_settime(curthread, CLOCK_REALTIME, - &host_ts); - } -} +static int vmbus_ts_sample_verbose = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN, + &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity."); -/** - * @brief Synchronize time with host after reboot, restore, etc. - * - * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. - * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time - * message after the timesync channel is opened. Since the hv_utils module is - * loaded after hv_vmbus, the first message is usually missed. The other - * thing is, systime is automatically set to emulated hardware clock which may - * not be UTC time or in the same time zone. 
So, to override these effects, we - * use the first 50 time samples for initial system time setting. - */ -static inline -void hv_adj_guesttime(hv_timesync_sc *sc, uint64_t hosttime, uint8_t flags) +static void +vmbus_timesync(struct hv_util_sc *sc, uint64_t hvtime, uint8_t tsflags) { - sc->time_msg.data = hosttime; + struct timespec vm_ts; + uint64_t hv_ns, vm_ns; + + hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE) * VMBUS_ICMSG_TS_FACTOR; + nanotime(&vm_ts); + vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; + + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) { + struct timespec hv_ts; + + if (bootverbose) { + device_printf(sc->ic_dev, "apply sync request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); + /* Done! */ + return; + } - if (((flags & HV_ICTIMESYNCFLAG_SYNC) != 0) || - ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0)) { - taskqueue_enqueue(taskqueue_thread, &sc->task); + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) && + vmbus_ts_sample_thresh > 0) { + int64_t diff; + + if (vmbus_ts_sample_verbose) { + device_printf(sc->ic_dev, "sample request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + + if (hv_ns > vm_ns) + diff = hv_ns - vm_ns; + else + diff = vm_ns - hv_ns; + /* nanosec -> millisec */ + diff /= 1000000; + + if (diff > vmbus_ts_sample_thresh) { + struct timespec hv_ts; + + if (bootverbose) { + device_printf(sc->ic_dev, + "apply sample request, hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); + } + /* Done */ + return; } } -/** - * Time Sync Channel message handler - */ static void -hv_timesync_cb(struct vmbus_channel *channel, void *context) +vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc) { - hv_vmbus_icmsg_hdr* icmsghdrp; - 
uint32_t recvlen; - uint64_t requestId; - int ret; - uint8_t* time_buf; - struct hv_ictimesync_data* timedatap; - hv_timesync_sc *softc; - - softc = (hv_timesync_sc*)context; - time_buf = softc->util_sc.receive_buffer; - - recvlen = PAGE_SIZE; - ret = vmbus_chan_recv(channel, time_buf, &recvlen, &requestId); - KASSERT(ret != ENOBUFS, ("hvtimesync recvbuf is not large enough")); - /* XXX check recvlen to make sure that it contains enough data */ - - if ((ret == 0) && recvlen > 0) { - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - hv_negotiate_version(icmsghdrp, NULL, time_buf); - } else { - timedatap = (struct hv_ictimesync_data *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - hv_adj_guesttime(softc, timedatap->parenttime, timedatap->flags); - } - - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION - | HV_ICMSGHDRFLAG_RESPONSE; - - vmbus_chan_send(channel, VMBUS_CHANPKT_TYPE_INBAND, 0, - time_buf, recvlen, requestId); - } -} + struct hv_util_sc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + const struct vmbus_icmsg_timesync *msg; + int dlen, error; + uint64_t xactid; + void *data; -static int -hv_timesync_probe(device_t dev) -{ - if (resource_disabled("hvtimesync", 0)) - return ENXIO; + /* + * Receive request. + */ + data = sc->receive_buffer; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; - if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &service_guid) == 0) { - device_set_desc(dev, "Hyper-V Time Synch Service"); - return BUS_PROBE_DEFAULT; + /* + * Update request, which will be echoed back as response. 
+ */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen); + if (error) + return; + break; + + case VMBUS_ICMSG_TYPE_TIMESYNC: + if (dlen < sizeof(*msg)) { + device_printf(sc->ic_dev, "invalid timesync len %d\n", + dlen); + return; + } + msg = data; + vmbus_timesync(sc, msg->ic_hvtime, msg->ic_tsflags); + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; } - return ENXIO; + + /* + * Send response by echoing the updated request back. + */ + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); } static int -hv_timesync_attach(device_t dev) +hv_timesync_probe(device_t dev) { - hv_timesync_sc *softc = device_get_softc(dev); - softc->util_sc.callback = hv_timesync_cb; - TASK_INIT(&softc->task, 1, hv_set_host_time, softc); - - return hv_util_attach(dev); + return (vmbus_ic_probe(dev, vmbus_timesync_descs)); } static int -hv_timesync_detach(device_t dev) +hv_timesync_attach(device_t dev) { - hv_timesync_sc *softc = device_get_softc(dev); - taskqueue_drain(taskqueue_thread, &softc->task); - return hv_util_detach(dev); + return (hv_util_attach(dev, vmbus_timesync_cb)); } static device_method_t timesync_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hv_timesync_probe), DEVMETHOD(device_attach, hv_timesync_attach), - DEVMETHOD(device_detach, hv_timesync_detach), + DEVMETHOD(device_detach, hv_util_detach), { 0, 0 } }; -static driver_t timesync_driver = { "hvtimesync", timesync_methods, sizeof(hv_timesync_sc)}; +static driver_t timesync_driver = { "hvtimesync", timesync_methods, sizeof(hv_util_sc)}; static devclass_t timesync_devclass; diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index e398faa..3fc16c9 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ 
b/sys/dev/hyperv/utilities/hv_util.c @@ -36,54 +36,100 @@ #include <sys/malloc.h> #include <sys/module.h> #include <sys/reboot.h> +#include <sys/systm.h> #include <sys/timetc.h> -#include <sys/syscallsubr.h> #include <dev/hyperv/include/hyperv.h> #include <dev/hyperv/include/vmbus.h> -#include <dev/hyperv/utilities/hv_utilreg.h> -#include "hv_util.h" - -void -hv_negotiate_version( - struct hv_vmbus_icmsg_hdr* icmsghdrp, - struct hv_vmbus_icmsg_negotiate* negop, - uint8_t* buf) +#include <dev/hyperv/utilities/hv_util.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> + +#include "vmbus_if.h" + +#define VMBUS_IC_BRSIZE (4 * PAGE_SIZE) + +#define VMBUS_IC_VERCNT 2 +#define VMBUS_IC_NEGOSZ \ + __offsetof(struct vmbus_icmsg_negotiate, ic_ver[VMBUS_IC_VERCNT]) +CTASSERT(VMBUS_IC_NEGOSZ < VMBUS_IC_BRSIZE); + +int +vmbus_ic_negomsg(struct hv_util_sc *sc, void *data, int *dlen0) { - icmsghdrp->icmsgsize = 0x10; - - negop = (struct hv_vmbus_icmsg_negotiate *)&buf[ - sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - - if (negop->icframe_vercnt >= 2 && - negop->icversion_data[1].major == 3) { - negop->icversion_data[0].major = 3; - negop->icversion_data[0].minor = 0; - negop->icversion_data[1].major = 3; - negop->icversion_data[1].minor = 0; - } else { - negop->icversion_data[0].major = 1; - negop->icversion_data[0].minor = 0; - negop->icversion_data[1].major = 1; - negop->icversion_data[1].minor = 0; + struct vmbus_icmsg_negotiate *nego; + int cnt, major, dlen = *dlen0; + + /* + * Preliminary message size verification + */ + if (dlen < sizeof(*nego)) { + device_printf(sc->ic_dev, "truncated ic negotiate, len %d\n", + dlen); + return EINVAL; + } + nego = data; + + cnt = nego->ic_fwver_cnt + nego->ic_msgver_cnt; + if (dlen < __offsetof(struct vmbus_icmsg_negotiate, ic_ver[cnt])) { + device_printf(sc->ic_dev, "ic negotiate does not contain " + "versions %d\n", dlen); + return EINVAL; } - negop->icframe_vercnt = 1; - negop->icmsg_vercnt = 1; + /* Select 
major version; XXX looks wrong. */ + if (nego->ic_fwver_cnt >= 2 && VMBUS_ICVER_MAJOR(nego->ic_ver[1]) == 3) + major = 3; + else + major = 1; + + /* One framework version */ + nego->ic_fwver_cnt = 1; + nego->ic_ver[0] = VMBUS_IC_VERSION(major, 0); + + /* One message version */ + nego->ic_msgver_cnt = 1; + nego->ic_ver[1] = VMBUS_IC_VERSION(major, 0); + + /* Update data size */ + nego->ic_hdr.ic_dsize = VMBUS_IC_NEGOSZ - + sizeof(struct vmbus_icmsg_hdr); + + /* Update total size, if necessary */ + if (dlen < VMBUS_IC_NEGOSZ) + *dlen0 = VMBUS_IC_NEGOSZ; + + return 0; } int -hv_util_attach(device_t dev) +vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]) { - struct hv_util_sc* softc; - struct vmbus_channel *chan; - int ret; + device_t bus = device_get_parent(dev); + const struct vmbus_ic_desc *d; + + if (resource_disabled(device_get_name(dev), 0)) + return (ENXIO); - softc = device_get_softc(dev); - softc->receive_buffer = - malloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); - chan = vmbus_get_channel(dev); + for (d = descs; d->ic_desc != NULL; ++d) { + if (VMBUS_PROBE_GUID(bus, dev, &d->ic_guid) == 0) { + device_set_desc(dev, d->ic_desc); + return (BUS_PROBE_DEFAULT); + } + } + return (ENXIO); +} + +int +hv_util_attach(device_t dev, vmbus_chan_callback_t cb) +{ + struct hv_util_sc *sc = device_get_softc(dev); + struct vmbus_channel *chan = vmbus_get_channel(dev); + int error; + + sc->ic_dev = dev; + sc->ic_buflen = VMBUS_IC_BRSIZE; + sc->receive_buffer = malloc(VMBUS_IC_BRSIZE, M_DEVBUF, + M_WAITOK | M_ZERO); /* * These services are not performance critical and do not need @@ -94,17 +140,13 @@ hv_util_attach(device_t dev) */ vmbus_chan_set_readbatch(chan, false); - ret = vmbus_chan_open(chan, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0, - softc->callback, softc); - - if (ret) - goto error0; - + error = vmbus_chan_open(chan, VMBUS_IC_BRSIZE, VMBUS_IC_BRSIZE, NULL, 0, + cb, sc); + if (error) { + free(sc->receive_buffer, M_DEVBUF); + return (error); + } return 
(0); - -error0: - free(softc->receive_buffer, M_DEVBUF); - return (ret); } int diff --git a/sys/dev/hyperv/utilities/hv_util.h b/sys/dev/hyperv/utilities/hv_util.h index 09202e7..012cdee 100644 --- a/sys/dev/hyperv/utilities/hv_util.h +++ b/sys/dev/hyperv/utilities/hv_util.h @@ -31,23 +31,29 @@ #ifndef _HVUTIL_H_ #define _HVUTIL_H_ +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> + /** * hv_util related structures * */ typedef struct hv_util_sc { - /* - * function to process Hyper-V messages - */ - void (*callback)(struct vmbus_channel *, void *); + device_t ic_dev; uint8_t *receive_buffer; + int ic_buflen; } hv_util_sc; -void hv_negotiate_version( - struct hv_vmbus_icmsg_hdr* icmsghdrp, - struct hv_vmbus_icmsg_negotiate* negop, - uint8_t* buf); +struct vmbus_ic_desc { + const struct hyperv_guid ic_guid; + const char *ic_desc; +}; + +#define VMBUS_IC_DESC_END { .ic_desc = NULL } + +int hv_util_attach(device_t dev, vmbus_chan_callback_t cb); +int hv_util_detach(device_t dev); +int vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]); +int vmbus_ic_negomsg(struct hv_util_sc *, void *data, int *dlen); -int hv_util_attach(device_t dev); -int hv_util_detach(device_t dev); #endif diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h b/sys/dev/hyperv/utilities/vmbus_icreg.h new file mode 100644 index 0000000..683e2f8 --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_icreg.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_ICREG_H_ +#define _VMBUS_ICREG_H_ + +#define VMBUS_ICMSG_TYPE_NEGOTIATE 0 +#define VMBUS_ICMSG_TYPE_HEARTBEAT 1 +#define VMBUS_ICMSG_TYPE_KVP 2 +#define VMBUS_ICMSG_TYPE_SHUTDOWN 3 +#define VMBUS_ICMSG_TYPE_TIMESYNC 4 +#define VMBUS_ICMSG_TYPE_VSS 5 + +#define VMBUS_ICMSG_STATUS_OK 0x00000000 +#define VMBUS_ICMSG_STATUS_FAIL 0x80004005 + +#define VMBUS_IC_VERSION(major, minor) ((major) | (((uint32_t)(minor)) << 16)) +#define VMBUS_ICVER_MAJOR(ver) ((ver) & 0xffff) +#define VMBUS_ICVER_MINOR(ver) (((ver) & 0xffff0000) >> 16) + +struct vmbus_pipe_hdr { + uint32_t ph_flags; + uint32_t ph_msgsz; +} __packed; + +struct vmbus_icmsg_hdr { + struct vmbus_pipe_hdr ic_pipe; + uint32_t ic_fwver; /* framework version */ + uint16_t ic_type; + uint32_t ic_msgver; /* message version */ + uint16_t ic_dsize; /* data size */ + uint32_t ic_status; /* VMBUS_ICMSG_STATUS_ */ + uint8_t ic_xactid; + uint8_t ic_flags; /* VMBUS_ICMSG_FLAG_ */ + uint8_t ic_rsvd[2]; +} __packed; + +#define VMBUS_ICMSG_FLAG_XACT 
0x0001 +#define VMBUS_ICMSG_FLAG_REQ 0x0002 +#define VMBUS_ICMSG_FLAG_RESP 0x0004 + +/* VMBUS_ICMSG_TYPE_NEGOTIATE */ +struct vmbus_icmsg_negotiate { + struct vmbus_icmsg_hdr ic_hdr; + uint16_t ic_fwver_cnt; + uint16_t ic_msgver_cnt; + uint32_t ic_rsvd; + /* + * This version array contains two set of supported + * versions: + * - The first set consists of #ic_fwver_cnt supported framework + * versions. + * - The second set consists of #ic_msgver_cnt supported message + * versions. + */ + uint32_t ic_ver[]; +} __packed; + +/* VMBUS_ICMSG_TYPE_HEARTBEAT */ +struct vmbus_icmsg_heartbeat { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_seq; + uint32_t ic_rsvd[8]; +} __packed; + +#define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN \ + __offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0]) + +/* VMBUS_ICMSG_TYPE_SHUTDOWN */ +struct vmbus_icmsg_shutdown { + struct vmbus_icmsg_hdr ic_hdr; + uint32_t ic_code; + uint32_t ic_timeo; + uint32_t ic_haltflags; + uint8_t ic_msg[2048]; +} __packed; + +#define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN \ + __offsetof(struct vmbus_icmsg_shutdown, ic_msg[0]) + +/* VMBUS_ICMSG_TYPE_TIMESYNC */ +struct vmbus_icmsg_timesync { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_hvtime; + uint64_t ic_vmtime; + uint64_t ic_rtt; + uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */ +} __packed; + +#define VMBUS_ICMSG_TS_FLAG_SYNC 0x01 +#define VMBUS_ICMSG_TS_FLAG_SAMPLE 0x02 + +/* XXX consolidate w/ hyperv */ +#define VMBUS_ICMSG_TS_BASE 116444736000000000ULL +#define VMBUS_ICMSG_TS_FACTOR 100ULL +#ifndef NANOSEC +#define NANOSEC 1000000000ULL +#endif + +#endif /* !_VMBUS_ICREG_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c index 27e72fe..117e8fd 100644 --- a/sys/dev/hyperv/vmbus/vmbus.c +++ b/sys/dev/hyperv/vmbus/vmbus.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <contrib/dev/acpica/include/acpi.h> #include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus_xact.h> #include <dev/hyperv/vmbus/hyperv_reg.h> #include 
<dev/hyperv/vmbus/hyperv_var.h> #include <dev/hyperv/vmbus/vmbus_reg.h> @@ -64,25 +65,10 @@ __FBSDID("$FreeBSD$"); #define VMBUS_GPADL_START 0xe1e10 struct vmbus_msghc { - struct hypercall_postmsg_in *mh_inprm; + struct vmbus_xact *mh_xact; struct hypercall_postmsg_in mh_inprm_save; - struct hyperv_dma mh_inprm_dma; - - struct vmbus_message *mh_resp; - struct vmbus_message mh_resp0; -}; - -struct vmbus_msghc_ctx { - struct vmbus_msghc *mhc_free; - struct mtx mhc_free_lock; - uint32_t mhc_flags; - - struct vmbus_msghc *mhc_active; - struct mtx mhc_active_lock; }; -#define VMBUS_MSGHC_CTXF_DESTROY 0x0001 - static int vmbus_probe(device_t); static int vmbus_attach(device_t); static int vmbus_detach(device_t); @@ -99,9 +85,7 @@ static int vmbus_connect(struct vmbus_softc *, uint32_t); static int vmbus_req_channels(struct vmbus_softc *sc); static void vmbus_disconnect(struct vmbus_softc *); static int vmbus_scan(struct vmbus_softc *); -static void vmbus_scan_wait(struct vmbus_softc *); -static void vmbus_scan_newchan(struct vmbus_softc *); -static void vmbus_scan_newdev(struct vmbus_softc *); +static void vmbus_scan_teardown(struct vmbus_softc *); static void vmbus_scan_done(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chanmsg_handle(struct vmbus_softc *, @@ -118,15 +102,6 @@ static int vmbus_doattach(struct vmbus_softc *); static void vmbus_event_proc_dummy(struct vmbus_softc *, int); -static struct vmbus_msghc_ctx *vmbus_msghc_ctx_create(bus_dma_tag_t); -static void vmbus_msghc_ctx_destroy( - struct vmbus_msghc_ctx *); -static void vmbus_msghc_ctx_free(struct vmbus_msghc_ctx *); -static struct vmbus_msghc *vmbus_msghc_alloc(bus_dma_tag_t); -static void vmbus_msghc_free(struct vmbus_msghc *); -static struct vmbus_msghc *vmbus_msghc_get1(struct vmbus_msghc_ctx *, - uint32_t); - static struct vmbus_softc *vmbus_sc; extern inthand_t IDTVEC(rsvd), IDTVEC(vmbus_isr); @@ -184,85 +159,6 @@ vmbus_get_softc(void) return vmbus_sc; } -static struct 
vmbus_msghc * -vmbus_msghc_alloc(bus_dma_tag_t parent_dtag) -{ - struct vmbus_msghc *mh; - - mh = malloc(sizeof(*mh), M_DEVBUF, M_WAITOK | M_ZERO); - - mh->mh_inprm = hyperv_dmamem_alloc(parent_dtag, - HYPERCALL_PARAM_ALIGN, 0, HYPERCALL_POSTMSGIN_SIZE, - &mh->mh_inprm_dma, BUS_DMA_WAITOK); - if (mh->mh_inprm == NULL) { - free(mh, M_DEVBUF); - return NULL; - } - return mh; -} - -static void -vmbus_msghc_free(struct vmbus_msghc *mh) -{ - hyperv_dmamem_free(&mh->mh_inprm_dma, mh->mh_inprm); - free(mh, M_DEVBUF); -} - -static void -vmbus_msghc_ctx_free(struct vmbus_msghc_ctx *mhc) -{ - KASSERT(mhc->mhc_active == NULL, ("still have active msg hypercall")); - KASSERT(mhc->mhc_free == NULL, ("still have hypercall msg")); - - mtx_destroy(&mhc->mhc_free_lock); - mtx_destroy(&mhc->mhc_active_lock); - free(mhc, M_DEVBUF); -} - -static struct vmbus_msghc_ctx * -vmbus_msghc_ctx_create(bus_dma_tag_t parent_dtag) -{ - struct vmbus_msghc_ctx *mhc; - - mhc = malloc(sizeof(*mhc), M_DEVBUF, M_WAITOK | M_ZERO); - mtx_init(&mhc->mhc_free_lock, "vmbus msghc free", NULL, MTX_DEF); - mtx_init(&mhc->mhc_active_lock, "vmbus msghc act", NULL, MTX_DEF); - - mhc->mhc_free = vmbus_msghc_alloc(parent_dtag); - if (mhc->mhc_free == NULL) { - vmbus_msghc_ctx_free(mhc); - return NULL; - } - return mhc; -} - -static struct vmbus_msghc * -vmbus_msghc_get1(struct vmbus_msghc_ctx *mhc, uint32_t dtor_flag) -{ - struct vmbus_msghc *mh; - - mtx_lock(&mhc->mhc_free_lock); - - while ((mhc->mhc_flags & dtor_flag) == 0 && mhc->mhc_free == NULL) { - mtx_sleep(&mhc->mhc_free, &mhc->mhc_free_lock, 0, - "gmsghc", 0); - } - if (mhc->mhc_flags & dtor_flag) { - /* Being destroyed */ - mh = NULL; - } else { - mh = mhc->mhc_free; - KASSERT(mh != NULL, ("no free hypercall msg")); - KASSERT(mh->mh_resp == NULL, - ("hypercall msg has pending response")); - mhc->mhc_free = NULL; - } - - mtx_unlock(&mhc->mhc_free_lock); - - return mh; -} - void vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) { @@ -271,7 +167,7 @@ 
vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); - inprm = mh->mh_inprm; + inprm = vmbus_xact_req_data(mh->mh_xact); memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); inprm->hc_connid = VMBUS_CONNID_MESSAGE; inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; @@ -282,63 +178,50 @@ struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) { struct vmbus_msghc *mh; + struct vmbus_xact *xact; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); - mh = vmbus_msghc_get1(sc->vmbus_msg_hc, VMBUS_MSGHC_CTXF_DESTROY); - if (mh == NULL) - return NULL; + xact = vmbus_xact_get(sc->vmbus_xc, + dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0])); + if (xact == NULL) + return (NULL); + + mh = vmbus_xact_priv(xact, sizeof(*mh)); + mh->mh_xact = xact; vmbus_msghc_reset(mh, dsize); - return mh; + return (mh); } void -vmbus_msghc_put(struct vmbus_softc *sc, struct vmbus_msghc *mh) +vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { - struct vmbus_msghc_ctx *mhc = sc->vmbus_msg_hc; - KASSERT(mhc->mhc_active == NULL, ("msg hypercall is active")); - mh->mh_resp = NULL; - - mtx_lock(&mhc->mhc_free_lock); - KASSERT(mhc->mhc_free == NULL, ("has free hypercall msg")); - mhc->mhc_free = mh; - mtx_unlock(&mhc->mhc_free_lock); - wakeup(&mhc->mhc_free); + vmbus_xact_put(mh->mh_xact); } void * vmbus_msghc_dataptr(struct vmbus_msghc *mh) { - return mh->mh_inprm->hc_data; -} - -static void -vmbus_msghc_ctx_destroy(struct vmbus_msghc_ctx *mhc) -{ - struct vmbus_msghc *mh; - - mtx_lock(&mhc->mhc_free_lock); - mhc->mhc_flags |= VMBUS_MSGHC_CTXF_DESTROY; - mtx_unlock(&mhc->mhc_free_lock); - wakeup(&mhc->mhc_free); - - mh = vmbus_msghc_get1(mhc, 0); - if (mh == NULL) - panic("can't get msghc"); + struct hypercall_postmsg_in *inprm; - vmbus_msghc_free(mh); - vmbus_msghc_ctx_free(mhc); + inprm = vmbus_xact_req_data(mh->mh_xact); + return 
(inprm->hc_data); } int vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) { sbintime_t time = SBT_1MS; + struct hypercall_postmsg_in *inprm; + bus_addr_t inprm_paddr; int i; + inprm = vmbus_xact_req_data(mh->mh_xact); + inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); + /* * Save the input parameter so that we could restore the input * parameter if the Hypercall failed. @@ -347,7 +230,7 @@ vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) * Is this really necessary?! i.e. Will the Hypercall ever * overwrite the input parameter? */ - memcpy(&mh->mh_inprm_save, mh->mh_inprm, HYPERCALL_POSTMSGIN_SIZE); + memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); /* * In order to cope with transient failures, e.g. insufficient @@ -359,7 +242,7 @@ vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) for (i = 0; i < HC_RETRY_MAX; ++i) { uint64_t status; - status = hypercall_post_message(mh->mh_inprm_dma.hv_paddr); + status = hypercall_post_message(inprm_paddr); if (status == HYPERCALL_STATUS_SUCCESS) return 0; @@ -368,8 +251,7 @@ vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) time *= 2; /* Restore input parameter and try again */ - memcpy(mh->mh_inprm, &mh->mh_inprm_save, - HYPERCALL_POSTMSGIN_SIZE); + memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); } #undef HC_RETRY_MAX @@ -378,62 +260,30 @@ vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) } int -vmbus_msghc_exec(struct vmbus_softc *sc, struct vmbus_msghc *mh) +vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { - struct vmbus_msghc_ctx *mhc = sc->vmbus_msg_hc; int error; - KASSERT(mh->mh_resp == NULL, ("hypercall msg has pending response")); - - mtx_lock(&mhc->mhc_active_lock); - KASSERT(mhc->mhc_active == NULL, ("pending active msg hypercall")); - mhc->mhc_active = mh; - mtx_unlock(&mhc->mhc_active_lock); - + vmbus_xact_activate(mh->mh_xact); error = vmbus_msghc_exec_noresult(mh); - if (error) { - mtx_lock(&mhc->mhc_active_lock); - KASSERT(mhc->mhc_active == mh, ("msghc 
mismatch")); - mhc->mhc_active = NULL; - mtx_unlock(&mhc->mhc_active_lock); - } + if (error) + vmbus_xact_deactivate(mh->mh_xact); return error; } const struct vmbus_message * -vmbus_msghc_wait_result(struct vmbus_softc *sc, struct vmbus_msghc *mh) +vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { - struct vmbus_msghc_ctx *mhc = sc->vmbus_msg_hc; + size_t resp_len; - mtx_lock(&mhc->mhc_active_lock); - - KASSERT(mhc->mhc_active == mh, ("msghc mismatch")); - while (mh->mh_resp == NULL) { - mtx_sleep(&mhc->mhc_active, &mhc->mhc_active_lock, 0, - "wmsghc", 0); - } - mhc->mhc_active = NULL; - - mtx_unlock(&mhc->mhc_active_lock); - - return mh->mh_resp; + return (vmbus_xact_wait(mh->mh_xact, &resp_len)); } void vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) { - struct vmbus_msghc_ctx *mhc = sc->vmbus_msg_hc; - struct vmbus_msghc *mh; - mtx_lock(&mhc->mhc_active_lock); - - mh = mhc->mhc_active; - KASSERT(mh != NULL, ("no pending msg hypercall")); - memcpy(&mh->mh_resp0, msg, sizeof(mh->mh_resp0)); - mh->mh_resp = &mh->mh_resp0; - - mtx_unlock(&mhc->mhc_active_lock); - wakeup(&mhc->mhc_active); + vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); } uint32_t @@ -543,50 +393,22 @@ vmbus_req_channels(struct vmbus_softc *sc) } static void -vmbus_scan_newchan(struct vmbus_softc *sc) +vmbus_scan_done_task(void *xsc, int pending __unused) { - mtx_lock(&sc->vmbus_scan_lock); - if ((sc->vmbus_scan_chcnt & VMBUS_SCAN_CHCNT_DONE) == 0) - sc->vmbus_scan_chcnt++; - mtx_unlock(&sc->vmbus_scan_lock); + struct vmbus_softc *sc = xsc; + + mtx_lock(&Giant); + sc->vmbus_scandone = true; + mtx_unlock(&Giant); + wakeup(&sc->vmbus_scandone); } static void vmbus_scan_done(struct vmbus_softc *sc, const struct vmbus_message *msg __unused) { - mtx_lock(&sc->vmbus_scan_lock); - sc->vmbus_scan_chcnt |= VMBUS_SCAN_CHCNT_DONE; - mtx_unlock(&sc->vmbus_scan_lock); - wakeup(&sc->vmbus_scan_chcnt); -} -static void -vmbus_scan_newdev(struct 
vmbus_softc *sc) -{ - mtx_lock(&sc->vmbus_scan_lock); - sc->vmbus_scan_devcnt++; - mtx_unlock(&sc->vmbus_scan_lock); - wakeup(&sc->vmbus_scan_devcnt); -} - -static void -vmbus_scan_wait(struct vmbus_softc *sc) -{ - uint32_t chancnt; - - mtx_lock(&sc->vmbus_scan_lock); - while ((sc->vmbus_scan_chcnt & VMBUS_SCAN_CHCNT_DONE) == 0) { - mtx_sleep(&sc->vmbus_scan_chcnt, &sc->vmbus_scan_lock, 0, - "waitch", 0); - } - chancnt = sc->vmbus_scan_chcnt & ~VMBUS_SCAN_CHCNT_DONE; - - while (sc->vmbus_scan_devcnt != chancnt) { - mtx_sleep(&sc->vmbus_scan_devcnt, &sc->vmbus_scan_lock, 0, - "waitdev", 0); - } - mtx_unlock(&sc->vmbus_scan_lock); + taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); } static int @@ -595,31 +417,71 @@ vmbus_scan(struct vmbus_softc *sc) int error; /* + * Identify, probe and attach for non-channel devices. + */ + bus_generic_probe(sc->vmbus_dev); + bus_generic_attach(sc->vmbus_dev); + + /* + * This taskqueue serializes vmbus devices' attach and detach + * for channel offer and rescind messages. + */ + sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, + taskqueue_thread_enqueue, &sc->vmbus_devtq); + taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); + TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); + + /* + * This taskqueue handles sub-channel detach, so that vmbus + * device's detach running in vmbus_devtq can drain its sub- + * channels. + */ + sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, + taskqueue_thread_enqueue, &sc->vmbus_subchtq); + taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); + + /* * Start vmbus scanning. */ error = vmbus_req_channels(sc); if (error) { device_printf(sc->vmbus_dev, "channel request failed: %d\n", error); - return error; + return (error); } /* - * Wait for all devices are added to vmbus. + * Wait for all vmbus devices from the initial channel offers to be + * attached. */ - vmbus_scan_wait(sc); - - /* - * Identify, probe and attach. 
- */ - bus_generic_probe(sc->vmbus_dev); - bus_generic_attach(sc->vmbus_dev); + GIANT_REQUIRED; + while (!sc->vmbus_scandone) + mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0); if (bootverbose) { device_printf(sc->vmbus_dev, "device scan, probe and attach " "done\n"); } - return 0; + return (0); +} + +static void +vmbus_scan_teardown(struct vmbus_softc *sc) +{ + + GIANT_REQUIRED; + if (sc->vmbus_devtq != NULL) { + mtx_unlock(&Giant); + taskqueue_free(sc->vmbus_devtq); + mtx_lock(&Giant); + sc->vmbus_devtq = NULL; + } + if (sc->vmbus_subchtq != NULL) { + mtx_unlock(&Giant); + taskqueue_free(sc->vmbus_subchtq); + mtx_lock(&Giant); + sc->vmbus_subchtq = NULL; + } } static void @@ -1148,45 +1010,35 @@ vmbus_add_child(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; device_t parent = sc->vmbus_dev; - int error = 0; - /* New channel has been offered */ - vmbus_scan_newchan(sc); + mtx_lock(&Giant); chan->ch_dev = device_add_child(parent, NULL, -1); if (chan->ch_dev == NULL) { + mtx_unlock(&Giant); device_printf(parent, "device_add_child for chan%u failed\n", chan->ch_id); - error = ENXIO; - goto done; + return (ENXIO); } device_set_ivars(chan->ch_dev, chan); + device_probe_and_attach(chan->ch_dev); -done: - /* New device has been/should be added to vmbus. */ - vmbus_scan_newdev(sc); - return error; + mtx_unlock(&Giant); + return (0); } int vmbus_delete_child(struct vmbus_channel *chan) { - int error; - - if (chan->ch_dev == NULL) { - /* Failed to add a device. 
*/ - return 0; - } + int error = 0; - /* - * XXXKYS: Ensure that this is the opposite of - * device_add_child() - */ mtx_lock(&Giant); - error = device_delete_child(chan->ch_vmbus->vmbus_dev, chan->ch_dev); + if (chan->ch_dev != NULL) { + error = device_delete_child(chan->ch_vmbus->vmbus_dev, + chan->ch_dev); + } mtx_unlock(&Giant); - - return error; + return (error); } static int @@ -1258,10 +1110,11 @@ vmbus_doattach(struct vmbus_softc *sc) return (0); sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; - mtx_init(&sc->vmbus_scan_lock, "vmbus scan", NULL, MTX_DEF); sc->vmbus_gpadl = VMBUS_GPADL_START; mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_prichans); + mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); + TAILQ_INIT(&sc->vmbus_chans); sc->vmbus_chmap = malloc( sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, M_WAITOK | M_ZERO); @@ -1269,9 +1122,10 @@ vmbus_doattach(struct vmbus_softc *sc) /* * Create context for "post message" Hypercalls */ - sc->vmbus_msg_hc = vmbus_msghc_ctx_create( - bus_get_dma_tag(sc->vmbus_dev)); - if (sc->vmbus_msg_hc == NULL) { + sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), + HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, + sizeof(struct vmbus_msghc)); + if (sc->vmbus_xc == NULL) { ret = ENXIO; goto cleanup; } @@ -1324,15 +1178,16 @@ vmbus_doattach(struct vmbus_softc *sc) return (ret); cleanup: + vmbus_scan_teardown(sc); vmbus_intr_teardown(sc); vmbus_dma_free(sc); - if (sc->vmbus_msg_hc != NULL) { - vmbus_msghc_ctx_destroy(sc->vmbus_msg_hc); - sc->vmbus_msg_hc = NULL; + if (sc->vmbus_xc != NULL) { + vmbus_xact_ctx_destroy(sc->vmbus_xc); + sc->vmbus_xc = NULL; } free(sc->vmbus_chmap, M_DEVBUF); - mtx_destroy(&sc->vmbus_scan_lock); mtx_destroy(&sc->vmbus_prichan_lock); + mtx_destroy(&sc->vmbus_chan_lock); return (ret); } @@ -1372,8 +1227,11 @@ vmbus_detach(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); + bus_generic_detach(dev); 
vmbus_chan_destroy_all(sc); + vmbus_scan_teardown(sc); + vmbus_disconnect(sc); if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { @@ -1384,14 +1242,14 @@ vmbus_detach(device_t dev) vmbus_intr_teardown(sc); vmbus_dma_free(sc); - if (sc->vmbus_msg_hc != NULL) { - vmbus_msghc_ctx_destroy(sc->vmbus_msg_hc); - sc->vmbus_msg_hc = NULL; + if (sc->vmbus_xc != NULL) { + vmbus_xact_ctx_destroy(sc->vmbus_xc); + sc->vmbus_xc = NULL; } free(sc->vmbus_chmap, M_DEVBUF); - mtx_destroy(&sc->vmbus_scan_lock); mtx_destroy(&sc->vmbus_prichan_lock); + mtx_destroy(&sc->vmbus_chan_lock); return (0); } diff --git a/sys/dev/hyperv/vmbus/vmbus_brvar.h b/sys/dev/hyperv/vmbus/vmbus_brvar.h index da0ca9d..cbec3eb 100644 --- a/sys/dev/hyperv/vmbus/vmbus_brvar.h +++ b/sys/dev/hyperv/vmbus/vmbus_brvar.h @@ -74,6 +74,7 @@ struct sysctl_oid; static __inline int vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr) { + /* * - 64 bits for the trailing start index (- sizeof(uint64_t)). * - The rindex and windex can't be same (- 1). See @@ -82,6 +83,31 @@ vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr) return (tbr->txbr_dsize - sizeof(uint64_t) - 1); } +static __inline bool +vmbus_txbr_empty(const struct vmbus_txbr *tbr) +{ + + return (tbr->txbr_windex == tbr->txbr_rindex ? true : false); +} + +static __inline bool +vmbus_rxbr_empty(const struct vmbus_rxbr *rbr) +{ + + return (rbr->rxbr_windex == rbr->rxbr_rindex ? 
true : false); +} + +static __inline int +vmbus_br_nelem(int br_size, int elem_size) +{ + + /* Strip bufring header */ + br_size -= sizeof(struct vmbus_bufring); + /* Add per-element trailing index */ + elem_size += sizeof(uint64_t); + return (br_size / elem_size); +} + void vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree, struct vmbus_br *br, const char *name); diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c index bb88c0c..34b07ee 100644 --- a/sys/dev/hyperv/vmbus/vmbus_chan.c +++ b/sys/dev/hyperv/vmbus/vmbus_chan.c @@ -59,10 +59,30 @@ static struct vmbus_channel *vmbus_chan_alloc(struct vmbus_softc *); static void vmbus_chan_free(struct vmbus_channel *); static int vmbus_chan_add(struct vmbus_channel *); static void vmbus_chan_cpu_default(struct vmbus_channel *); +static int vmbus_chan_release(struct vmbus_channel *); +static void vmbus_chan_set_chmap(struct vmbus_channel *); +static void vmbus_chan_clear_chmap(struct vmbus_channel *); + +static void vmbus_chan_ins_prilist(struct vmbus_softc *, + struct vmbus_channel *); +static void vmbus_chan_rem_prilist(struct vmbus_softc *, + struct vmbus_channel *); +static void vmbus_chan_ins_list(struct vmbus_softc *, + struct vmbus_channel *); +static void vmbus_chan_rem_list(struct vmbus_softc *, + struct vmbus_channel *); +static void vmbus_chan_ins_sublist(struct vmbus_channel *, + struct vmbus_channel *); +static void vmbus_chan_rem_sublist(struct vmbus_channel *, + struct vmbus_channel *); static void vmbus_chan_task(void *, int); static void vmbus_chan_task_nobatch(void *, int); -static void vmbus_chan_detach_task(void *, int); +static void vmbus_chan_clrchmap_task(void *, int); +static void vmbus_prichan_attach_task(void *, int); +static void vmbus_subchan_attach_task(void *, int); +static void vmbus_prichan_detach_task(void *, int); +static void vmbus_subchan_detach_task(void *, int); static void vmbus_chan_msgproc_choffer(struct vmbus_softc *, 
const struct vmbus_message *); @@ -96,6 +116,83 @@ vmbus_chan_signal_tx(const struct vmbus_channel *chan) hypercall_signal_event(chan->ch_monprm_dma.hv_paddr); } +static void +vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) +{ + + mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); + if (atomic_testandset_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONPRIL_SHIFT)) + panic("channel is already on the prilist"); + TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink); +} + +static void +vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) +{ + + mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); + if (atomic_testandclear_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0) + panic("channel is not on the prilist"); + TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink); +} + +static void +vmbus_chan_ins_sublist(struct vmbus_channel *prichan, + struct vmbus_channel *chan) +{ + + mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); + + if (atomic_testandset_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONSUBL_SHIFT)) + panic("channel is already on the sublist"); + TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink); + + /* Bump sub-channel count. 
*/ + prichan->ch_subchan_cnt++; +} + +static void +vmbus_chan_rem_sublist(struct vmbus_channel *prichan, + struct vmbus_channel *chan) +{ + + mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); + + KASSERT(prichan->ch_subchan_cnt > 0, + ("invalid subchan_cnt %d", prichan->ch_subchan_cnt)); + prichan->ch_subchan_cnt--; + + if (atomic_testandclear_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0) + panic("channel is not on the sublist"); + TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink); +} + +static void +vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan) +{ + + mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); + if (atomic_testandset_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONLIST_SHIFT)) + panic("channel is already on the list"); + TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link); +} + +static void +vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan) +{ + + mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); + if (atomic_testandclear_int(&chan->ch_stflags, + VMBUS_CHAN_ST_ONLIST_SHIFT) == 0) + panic("channel is not on the list"); + TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link); +} + static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS) { @@ -196,13 +293,46 @@ int vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { + struct vmbus_chan_br cbr; + int error; + + /* + * Allocate the TX+RX bufrings. 
+ */ + KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated")); + chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev), + PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma, + BUS_DMA_WAITOK); + if (chan->ch_bufring == NULL) { + device_printf(chan->ch_dev, "bufring allocation failed\n"); + return (ENOMEM); + } + + cbr.cbr = chan->ch_bufring; + cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr; + cbr.cbr_txsz = txbr_size; + cbr.cbr_rxsz = rxbr_size; + + error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg); + if (error) { + hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); + chan->ch_bufring = NULL; + } + return (error); +} + +int +vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr, + const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) +{ struct vmbus_softc *sc = chan->ch_vmbus; const struct vmbus_chanmsg_chopen_resp *resp; const struct vmbus_message *msg; struct vmbus_chanmsg_chopen *req; struct vmbus_msghc *mh; uint32_t status; - int error; + int error, txbr_size, rxbr_size; + task_fn_t *task_fn; uint8_t *br; if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) { @@ -210,10 +340,21 @@ vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, "invalid udata len %d for chan%u\n", udlen, chan->ch_id); return EINVAL; } + + br = cbr->cbr; + txbr_size = cbr->cbr_txsz; + rxbr_size = cbr->cbr_rxsz; KASSERT((txbr_size & PAGE_MASK) == 0, ("send bufring size is not multiple page")); KASSERT((rxbr_size & PAGE_MASK) == 0, ("recv bufring size is not multiple page")); + KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0, + ("bufring is not page aligned")); + + /* + * Zero out the TX/RX bufrings, in case that they were used before. 
+ */ + memset(br, 0, txbr_size + rxbr_size); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED_SHIFT)) @@ -226,23 +367,10 @@ vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) - TASK_INIT(&chan->ch_task, 0, vmbus_chan_task, chan); + task_fn = vmbus_chan_task; else - TASK_INIT(&chan->ch_task, 0, vmbus_chan_task_nobatch, chan); - - /* - * Allocate the TX+RX bufrings. - * XXX should use ch_dev dtag - */ - br = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), - PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); - if (br == NULL) { - device_printf(sc->vmbus_dev, "bufring allocation failed\n"); - error = ENOMEM; - goto failed; - } - chan->ch_bufring = br; + task_fn = vmbus_chan_task_nobatch; + TASK_INIT(&chan->ch_task, 0, task_fn, chan); /* TX bufring comes first */ vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size); @@ -255,7 +383,7 @@ vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, /* * Connect the bufrings, both RX and TX, to this channel. */ - error = vmbus_chan_gpadl_connect(chan, chan->ch_bufring_dma.hv_paddr, + error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr, txbr_size + rxbr_size, &chan->ch_bufring_gpadl); if (error) { device_printf(sc->vmbus_dev, @@ -264,6 +392,12 @@ vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, } /* + * Install this channel, before it is opened, but after everything + * else has been setup. + */ + vmbus_chan_set_chmap(chan); + + /* * Open channel w/ the bufring GPADL on the target CPU. 
*/ mh = vmbus_msghc_get(sc, sizeof(*req)); @@ -312,14 +446,11 @@ vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, error = ENXIO; failed: + vmbus_chan_clear_chmap(chan); if (chan->ch_bufring_gpadl) { vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); chan->ch_bufring_gpadl = 0; } - if (chan->ch_bufring != NULL) { - hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); - chan->ch_bufring = NULL; - } atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED); return error; } @@ -492,12 +623,38 @@ vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl) } static void +vmbus_chan_clrchmap_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + + critical_enter(); + chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; + critical_exit(); +} + +static void +vmbus_chan_clear_chmap(struct vmbus_channel *chan) +{ + struct task chmap_task; + + TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan); + taskqueue_enqueue(chan->ch_tq, &chmap_task); + taskqueue_drain(chan->ch_tq, &chmap_task); +} + +static void +vmbus_chan_set_chmap(struct vmbus_channel *chan) +{ + __compiler_membar(); + chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; +} + +static void vmbus_chan_close_internal(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_chclose *req; - struct taskqueue *tq = chan->ch_tq; int error; /* TODO: stringent check */ @@ -510,12 +667,14 @@ vmbus_chan_close_internal(struct vmbus_channel *chan) sysctl_ctx_free(&chan->ch_sysctl_ctx); /* - * Set ch_tq to NULL to avoid more requests be scheduled. - * XXX pretty broken; need rework. + * NOTE: + * Order is critical. This channel _must_ be uninstalled first, + * else the channel task may be enqueued by the IDT after it has + * been drained. 
*/ + vmbus_chan_clear_chmap(chan); + taskqueue_drain(chan->ch_tq, &chan->ch_task); chan->ch_tq = NULL; - taskqueue_drain(tq, &chan->ch_task); - chan->ch_cb = NULL; /* * Close this channel. @@ -597,6 +756,13 @@ vmbus_chan_close(struct vmbus_channel *chan) vmbus_chan_close_internal(chan); } +void +vmbus_chan_intr_drain(struct vmbus_channel *chan) +{ + + taskqueue_drain(chan->ch_tq, &chan->ch_task); +} + int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid) @@ -721,7 +887,20 @@ vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0, error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) - return error; + return (error); + + if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { + device_printf(chan->ch_dev, "invalid hlen %u\n", + pkt.cph_hlen); + /* XXX this channel is dead actually. */ + return (EIO); + } + if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) { + device_printf(chan->ch_dev, "invalid hlen %u and tlen %u\n", + pkt.cph_hlen, pkt.cph_tlen); + /* XXX this channel is dead actually. */ + return (EIO); + } hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen); dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen; @@ -729,7 +908,7 @@ vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0, if (*dlen0 < dlen) { /* Return the size of this packet's data. 
*/ *dlen0 = dlen; - return ENOBUFS; + return (ENOBUFS); } *xactid = pkt.cph_xactid; @@ -739,7 +918,7 @@ vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0, error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen); KASSERT(!error, ("vmbus_rxbr_read failed")); - return 0; + return (0); } int @@ -751,13 +930,26 @@ vmbus_chan_recv_pkt(struct vmbus_channel *chan, error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) - return error; + return (error); + + if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { + device_printf(chan->ch_dev, "invalid hlen %u\n", + pkt.cph_hlen); + /* XXX this channel is dead actually. */ + return (EIO); + } + if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) { + device_printf(chan->ch_dev, "invalid hlen %u and tlen %u\n", + pkt.cph_hlen, pkt.cph_tlen); + /* XXX this channel is dead actually. */ + return (EIO); + } pktlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen); if (*pktlen0 < pktlen) { /* Return the size of this packet. */ *pktlen0 = pktlen; - return ENOBUFS; + return (ENOBUFS); } *pktlen0 = pktlen; @@ -765,7 +957,7 @@ vmbus_chan_recv_pkt(struct vmbus_channel *chan, error = vmbus_rxbr_read(&chan->ch_rxbr, pkt0, pktlen, 0); KASSERT(!error, ("vmbus_rxbr_read failed")); - return 0; + return (0); } static void @@ -833,10 +1025,11 @@ vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, flags &= ~(1UL << chid_ofs); chan = sc->vmbus_chmap[chid_base + chid_ofs]; - - /* if channel is closed or closing */ - if (chan == NULL || chan->ch_tq == NULL) + if (__predict_false(chan == NULL)) { + /* Channel is closed. 
*/ continue; + } + __compiler_membar(); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) vmbus_rxbr_intr_mask(&chan->ch_rxbr); @@ -917,7 +1110,6 @@ vmbus_chan_alloc(struct vmbus_softc *sc) chan->ch_vmbus = sc; mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF); TAILQ_INIT(&chan->ch_subchans); - TASK_INIT(&chan->ch_detach_task, 0, vmbus_chan_detach_task, chan); vmbus_rxbr_init(&chan->ch_rxbr); vmbus_txbr_init(&chan->ch_txbr); @@ -927,9 +1119,14 @@ vmbus_chan_alloc(struct vmbus_softc *sc) static void vmbus_chan_free(struct vmbus_channel *chan) { - /* TODO: assert sub-channel list is empty */ - /* TODO: asset no longer on the primary channel's sub-channel list */ - /* TODO: asset no longer on the vmbus channel list */ + + KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0, + ("still owns sub-channels")); + KASSERT((chan->ch_stflags & + (VMBUS_CHAN_ST_OPENED | + VMBUS_CHAN_ST_ONPRIL | + VMBUS_CHAN_ST_ONSUBL | + VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel")); hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm); mtx_destroy(&chan->ch_subchan_lock); vmbus_rxbr_deinit(&chan->ch_rxbr); @@ -956,7 +1153,6 @@ vmbus_chan_add(struct vmbus_channel *newchan) newchan->ch_id); return EINVAL; } - sc->vmbus_chmap[newchan->ch_id] = newchan; if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u subidx%u offer\n", @@ -978,10 +1174,9 @@ vmbus_chan_add(struct vmbus_channel *newchan) if (VMBUS_CHAN_ISPRIMARY(newchan)) { if (prichan == NULL) { /* Install the new primary channel */ - TAILQ_INSERT_TAIL(&sc->vmbus_prichans, newchan, - ch_prilink); + vmbus_chan_ins_prilist(sc, newchan); mtx_unlock(&sc->vmbus_prichan_lock); - return 0; + goto done; } else { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "duplicated primary " @@ -1015,16 +1210,20 @@ vmbus_chan_add(struct vmbus_channel *newchan) newchan->ch_dev = prichan->ch_dev; mtx_lock(&prichan->ch_subchan_lock); - TAILQ_INSERT_TAIL(&prichan->ch_subchans, newchan, ch_sublink); + 
vmbus_chan_ins_sublist(prichan, newchan); + mtx_unlock(&prichan->ch_subchan_lock); /* - * Bump up sub-channel count and notify anyone that is - * interested in this sub-channel, after this sub-channel - * is setup. + * Notify anyone that is interested in this sub-channel, + * after this sub-channel is setup. */ - prichan->ch_subchan_cnt++; - mtx_unlock(&prichan->ch_subchan_lock); wakeup(prichan); - +done: + /* + * Hook this channel up for later rescind. + */ + mtx_lock(&sc->vmbus_chan_lock); + vmbus_chan_ins_list(sc, newchan); + mtx_unlock(&sc->vmbus_chan_lock); return 0; } @@ -1075,6 +1274,7 @@ vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, { const struct vmbus_chanmsg_choffer *offer; struct vmbus_channel *chan; + task_fn_t *detach_fn, *attach_fn; int error; offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data; @@ -1123,6 +1323,21 @@ vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT]; chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK); + /* + * Setup attach and detach tasks. + */ + if (VMBUS_CHAN_ISPRIMARY(chan)) { + chan->ch_mgmt_tq = sc->vmbus_devtq; + attach_fn = vmbus_prichan_attach_task; + detach_fn = vmbus_prichan_detach_task; + } else { + chan->ch_mgmt_tq = sc->vmbus_subchtq; + attach_fn = vmbus_subchan_attach_task; + detach_fn = vmbus_subchan_detach_task; + } + TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan); + TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan); + /* Select default cpu for this channel. */ vmbus_chan_cpu_default(chan); @@ -1133,22 +1348,9 @@ vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, vmbus_chan_free(chan); return; } - - if (VMBUS_CHAN_ISPRIMARY(chan)) { - /* - * Add device for this primary channel. - * - * NOTE: - * Error is ignored here; don't have much to do if error - * really happens. - */ - vmbus_add_child(chan); - } + taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task); } -/* - * XXX pretty broken; need rework. 
- */ static void vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, const struct vmbus_message *msg) @@ -1168,91 +1370,162 @@ vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, note->chm_chanid); } - chan = sc->vmbus_chmap[note->chm_chanid]; - if (chan == NULL) + /* + * Find and remove the target channel from the channel list. + */ + mtx_lock(&sc->vmbus_chan_lock); + TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { + if (chan->ch_id == note->chm_chanid) + break; + } + if (chan == NULL) { + mtx_unlock(&sc->vmbus_chan_lock); + device_printf(sc->vmbus_dev, "chan%u is not offered\n", + note->chm_chanid); return; - sc->vmbus_chmap[note->chm_chanid] = NULL; + } + vmbus_chan_rem_list(sc, chan); + mtx_unlock(&sc->vmbus_chan_lock); - taskqueue_enqueue(taskqueue_thread, &chan->ch_detach_task); + if (VMBUS_CHAN_ISPRIMARY(chan)) { + /* + * The target channel is a primary channel; remove the + * target channel from the primary channel list now, + * instead of later, so that it will not be found by + * other sub-channel offers, which are processed in + * this thread. + */ + mtx_lock(&sc->vmbus_prichan_lock); + vmbus_chan_rem_prilist(sc, chan); + mtx_unlock(&sc->vmbus_prichan_lock); + } + + /* Detach the target channel. 
*/ + taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } -static void -vmbus_chan_detach_task(void *xchan, int pending __unused) +static int +vmbus_chan_release(struct vmbus_channel *chan) { - struct vmbus_channel *chan = xchan; + struct vmbus_softc *sc = chan->ch_vmbus; + struct vmbus_chanmsg_chfree *req; + struct vmbus_msghc *mh; + int error; - if (VMBUS_CHAN_ISPRIMARY(chan)) { - /* Only primary channel owns the device */ - vmbus_delete_child(chan); - /* NOTE: DO NOT free primary channel for now */ + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + device_printf(sc->vmbus_dev, "can not get msg hypercall for " + "chfree(chan%u)\n", chan->ch_id); + return (ENXIO); + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; + req->chm_chanid = chan->ch_id; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + device_printf(sc->vmbus_dev, "chfree(chan%u) failed: %d", + chan->ch_id, error); } else { - struct vmbus_softc *sc = chan->ch_vmbus; - struct vmbus_channel *pri_chan = chan->ch_prichan; - struct vmbus_chanmsg_chfree *req; - struct vmbus_msghc *mh; - int error; - - mh = vmbus_msghc_get(sc, sizeof(*req)); - if (mh == NULL) { - device_printf(sc->vmbus_dev, - "can not get msg hypercall for chfree(chan%u)\n", + if (bootverbose) { + device_printf(sc->vmbus_dev, "chan%u freed\n", chan->ch_id); - goto remove; } + } + return (error); +} - req = vmbus_msghc_dataptr(mh); - req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; - req->chm_chanid = chan->ch_id; +static void +vmbus_prichan_detach_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; - error = vmbus_msghc_exec_noresult(mh); - vmbus_msghc_put(sc, mh); + KASSERT(VMBUS_CHAN_ISPRIMARY(chan), + ("chan%u is not primary channel", chan->ch_id)); - if (error) { - device_printf(sc->vmbus_dev, - "chfree(chan%u) failed: %d", - chan->ch_id, error); - /* NOTE: Move on! 
*/ - } else { - if (bootverbose) { - device_printf(sc->vmbus_dev, "chan%u freed\n", - chan->ch_id); - } - } -remove: - mtx_lock(&pri_chan->ch_subchan_lock); - TAILQ_REMOVE(&pri_chan->ch_subchans, chan, ch_sublink); - KASSERT(pri_chan->ch_subchan_cnt > 0, - ("invalid subchan_cnt %d", pri_chan->ch_subchan_cnt)); - pri_chan->ch_subchan_cnt--; - mtx_unlock(&pri_chan->ch_subchan_lock); - wakeup(pri_chan); + /* Delete and detach the device associated with this channel. */ + vmbus_delete_child(chan); - vmbus_chan_free(chan); - } + /* Release this channel (back to vmbus). */ + vmbus_chan_release(chan); + + /* Free this channel's resource. */ + vmbus_chan_free(chan); +} + +static void +vmbus_subchan_detach_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + struct vmbus_channel *pri_chan = chan->ch_prichan; + + KASSERT(!VMBUS_CHAN_ISPRIMARY(chan), + ("chan%u is primary channel", chan->ch_id)); + + /* Release this channel (back to vmbus). */ + vmbus_chan_release(chan); + + /* Unlink from its primary channel's sub-channel list. */ + mtx_lock(&pri_chan->ch_subchan_lock); + vmbus_chan_rem_sublist(pri_chan, chan); + mtx_unlock(&pri_chan->ch_subchan_lock); + /* Notify anyone that is waiting for this sub-channel to vanish. */ + wakeup(pri_chan); + + /* Free this channel's resource. */ + vmbus_chan_free(chan); +} + +static void +vmbus_prichan_attach_task(void *xchan, int pending __unused) +{ + + /* + * Add device for this primary channel. + */ + vmbus_add_child(xchan); +} + +static void +vmbus_subchan_attach_task(void *xchan __unused, int pending __unused) +{ + + /* Nothing */ } -/* - * Detach all devices and destroy the corresponding primary channels. 
- */ void vmbus_chan_destroy_all(struct vmbus_softc *sc) { - struct vmbus_channel *chan; - mtx_lock(&sc->vmbus_prichan_lock); - while ((chan = TAILQ_FIRST(&sc->vmbus_prichans)) != NULL) { - KASSERT(VMBUS_CHAN_ISPRIMARY(chan), ("not primary channel")); - TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink); - mtx_unlock(&sc->vmbus_prichan_lock); + /* + * Detach all devices and destroy the corresponding primary + * channels. + */ + for (;;) { + struct vmbus_channel *chan; - vmbus_delete_child(chan); - vmbus_chan_free(chan); + mtx_lock(&sc->vmbus_chan_lock); + TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { + if (VMBUS_CHAN_ISPRIMARY(chan)) + break; + } + if (chan == NULL) { + /* No more primary channels; done. */ + mtx_unlock(&sc->vmbus_chan_lock); + break; + } + vmbus_chan_rem_list(sc, chan); + mtx_unlock(&sc->vmbus_chan_lock); mtx_lock(&sc->vmbus_prichan_lock); + vmbus_chan_rem_prilist(sc, chan); + mtx_unlock(&sc->vmbus_prichan_lock); + + taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } - bzero(sc->vmbus_chmap, - sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX); - mtx_unlock(&sc->vmbus_prichan_lock); } /* @@ -1320,6 +1593,8 @@ vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt) struct vmbus_channel **ret, *chan; int i; + KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt)); + ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP, M_WAITOK); @@ -1411,3 +1686,38 @@ vmbus_chan_guid_inst(const struct vmbus_channel *chan) { return &chan->ch_guid_inst; } + +int +vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max) +{ + int elem_size; + + elem_size = __offsetof(struct vmbus_chanpkt_prplist, + cp_range[0].gpa_page[prpcnt_max]); + elem_size += dlen_max; + elem_size = VMBUS_CHANPKT_TOTLEN(elem_size); + + return (vmbus_br_nelem(br_size, elem_size)); +} + +bool +vmbus_chan_tx_empty(const struct vmbus_channel *chan) +{ + + return (vmbus_txbr_empty(&chan->ch_txbr)); +} + +bool +vmbus_chan_rx_empty(const 
struct vmbus_channel *chan) +{ + + return (vmbus_rxbr_empty(&chan->ch_rxbr)); +} + +void +vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task) +{ + + taskqueue_enqueue(chan->ch_tq, task); + taskqueue_drain(chan->ch_tq, task); +} diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h index 68a134d3..b415d00 100644 --- a/sys/dev/hyperv/vmbus/vmbus_chanvar.h +++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h @@ -124,8 +124,14 @@ struct vmbus_channel { struct hyperv_dma ch_bufring_dma; uint32_t ch_bufring_gpadl; - struct task ch_detach_task; + struct task ch_attach_task; /* run in ch_mgmt_tq */ + struct task ch_detach_task; /* run in ch_mgmt_tq */ + struct taskqueue *ch_mgmt_tq; + + /* If this is a primary channel */ TAILQ_ENTRY(vmbus_channel) ch_prilink; /* primary chan link */ + + TAILQ_ENTRY(vmbus_channel) ch_link; /* channel link */ uint32_t ch_subidx; /* subchan index */ volatile uint32_t ch_stflags; /* atomic-op */ /* VMBUS_CHAN_ST_ */ @@ -150,7 +156,13 @@ struct vmbus_channel { #define VMBUS_CHAN_TXF_HASMNF 0x0001 #define VMBUS_CHAN_ST_OPENED_SHIFT 0 +#define VMBUS_CHAN_ST_ONPRIL_SHIFT 1 +#define VMBUS_CHAN_ST_ONSUBL_SHIFT 2 +#define VMBUS_CHAN_ST_ONLIST_SHIFT 3 #define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) +#define VMBUS_CHAN_ST_ONPRIL (1 << VMBUS_CHAN_ST_ONPRIL_SHIFT) +#define VMBUS_CHAN_ST_ONSUBL (1 << VMBUS_CHAN_ST_ONSUBL_SHIFT) +#define VMBUS_CHAN_ST_ONLIST (1 << VMBUS_CHAN_ST_ONLIST_SHIFT) struct vmbus_softc; struct vmbus_message; diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h index b1a2de0..522e68c 100644 --- a/sys/dev/hyperv/vmbus/vmbus_reg.h +++ b/sys/dev/hyperv/vmbus/vmbus_reg.h @@ -153,6 +153,9 @@ do { \ #define VMBUS_CHANPKT_TOTLEN(tlen) \ roundup2((tlen), VMBUS_CHANPKT_SIZE_ALIGN) +#define VMBUS_CHANPKT_HLEN_MIN \ + (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) + struct vmbus_chanpkt { struct vmbus_chanpkt_hdr cp_hdr; } __packed; diff --git 
a/sys/dev/hyperv/vmbus/vmbus_var.h b/sys/dev/hyperv/vmbus/vmbus_var.h index c278c15..df4553a 100644 --- a/sys/dev/hyperv/vmbus/vmbus_var.h +++ b/sys/dev/hyperv/vmbus/vmbus_var.h @@ -86,7 +86,7 @@ struct vmbus_softc { u_long *vmbus_rx_evtflags; /* compat evtflgs from host */ struct vmbus_channel **vmbus_chmap; - struct vmbus_msghc_ctx *vmbus_msg_hc; + struct vmbus_xact_ctx *vmbus_xc; struct vmbus_pcpu_data vmbus_pcpu[MAXCPU]; /* @@ -107,14 +107,19 @@ struct vmbus_softc { struct hyperv_dma vmbus_mnf1_dma; struct hyperv_dma vmbus_mnf2_dma; - struct mtx vmbus_scan_lock; - uint32_t vmbus_scan_chcnt; -#define VMBUS_SCAN_CHCNT_DONE 0x80000000 - uint32_t vmbus_scan_devcnt; + bool vmbus_scandone; + struct task vmbus_scandone_task; + + struct taskqueue *vmbus_devtq; /* for dev attach/detach */ + struct taskqueue *vmbus_subchtq; /* for sub-chan attach/detach */ /* Primary channels */ struct mtx vmbus_prichan_lock; TAILQ_HEAD(, vmbus_channel) vmbus_prichans; + + /* Complete channel list */ + struct mtx vmbus_chan_lock; + TAILQ_HEAD(, vmbus_channel) vmbus_chans; }; #define VMBUS_FLAG_ATTACHED 0x0001 /* vmbus was attached */ diff --git a/sys/dev/hyperv/vmbus/vmbus_xact.c b/sys/dev/hyperv/vmbus/vmbus_xact.c new file mode 100644 index 0000000..642c165 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_xact.c @@ -0,0 +1,313 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/systm.h> + +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus_xact.h> + +struct vmbus_xact { + struct vmbus_xact_ctx *x_ctx; + void *x_priv; + + void *x_req; + struct hyperv_dma x_req_dma; + + const void *x_resp; + size_t x_resp_len; + void *x_resp0; +}; + +struct vmbus_xact_ctx { + uint32_t xc_flags; + size_t xc_req_size; + size_t xc_resp_size; + size_t xc_priv_size; + + struct vmbus_xact *xc_free; + struct mtx xc_free_lock; + + struct vmbus_xact *xc_active; + struct mtx xc_active_lock; +}; + +#define VMBUS_XACT_CTXF_DESTROY 0x0001 + +static struct vmbus_xact *vmbus_xact_alloc(struct vmbus_xact_ctx *, + bus_dma_tag_t); +static void vmbus_xact_free(struct vmbus_xact *); +static struct vmbus_xact *vmbus_xact_get1(struct vmbus_xact_ctx *, + uint32_t); + +static struct vmbus_xact * +vmbus_xact_alloc(struct vmbus_xact_ctx *ctx, bus_dma_tag_t parent_dtag) +{ + struct vmbus_xact *xact; + + xact = malloc(sizeof(*xact), M_DEVBUF, M_WAITOK | M_ZERO); + xact->x_ctx = ctx; + + /* XXX assume that page aligned is enough */ + 
xact->x_req = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + ctx->xc_req_size, &xact->x_req_dma, BUS_DMA_WAITOK); + if (xact->x_req == NULL) { + free(xact, M_DEVBUF); + return (NULL); + } + if (ctx->xc_priv_size != 0) + xact->x_priv = malloc(ctx->xc_priv_size, M_DEVBUF, M_WAITOK); + xact->x_resp0 = malloc(ctx->xc_resp_size, M_DEVBUF, M_WAITOK); + + return (xact); +} + +static void +vmbus_xact_free(struct vmbus_xact *xact) +{ + + hyperv_dmamem_free(&xact->x_req_dma, xact->x_req); + free(xact->x_resp0, M_DEVBUF); + if (xact->x_priv != NULL) + free(xact->x_priv, M_DEVBUF); + free(xact, M_DEVBUF); +} + +static struct vmbus_xact * +vmbus_xact_get1(struct vmbus_xact_ctx *ctx, uint32_t dtor_flag) +{ + struct vmbus_xact *xact; + + mtx_lock(&ctx->xc_free_lock); + + while ((ctx->xc_flags & dtor_flag) == 0 && ctx->xc_free == NULL) + mtx_sleep(&ctx->xc_free, &ctx->xc_free_lock, 0, "gxact", 0); + if (ctx->xc_flags & dtor_flag) { + /* Being destroyed */ + xact = NULL; + } else { + xact = ctx->xc_free; + KASSERT(xact != NULL, ("no free xact")); + KASSERT(xact->x_resp == NULL, ("xact has pending response")); + ctx->xc_free = NULL; + } + + mtx_unlock(&ctx->xc_free_lock); + + return (xact); +} + +struct vmbus_xact_ctx * +vmbus_xact_ctx_create(bus_dma_tag_t dtag, size_t req_size, size_t resp_size, + size_t priv_size) +{ + struct vmbus_xact_ctx *ctx; + + ctx = malloc(sizeof(*ctx), M_DEVBUF, M_WAITOK | M_ZERO); + ctx->xc_req_size = req_size; + ctx->xc_resp_size = resp_size; + ctx->xc_priv_size = priv_size; + + ctx->xc_free = vmbus_xact_alloc(ctx, dtag); + if (ctx->xc_free == NULL) { + free(ctx, M_DEVBUF); + return (NULL); + } + + mtx_init(&ctx->xc_free_lock, "vmbus xact free", NULL, MTX_DEF); + mtx_init(&ctx->xc_active_lock, "vmbus xact active", NULL, MTX_DEF); + + return (ctx); +} + +void +vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx) +{ + struct vmbus_xact *xact; + + mtx_lock(&ctx->xc_free_lock); + ctx->xc_flags |= VMBUS_XACT_CTXF_DESTROY; + mtx_unlock(&ctx->xc_free_lock); + 
wakeup(&ctx->xc_free); + + xact = vmbus_xact_get1(ctx, 0); + if (xact == NULL) + panic("can't get xact"); + + vmbus_xact_free(xact); + mtx_destroy(&ctx->xc_free_lock); + mtx_destroy(&ctx->xc_active_lock); + free(ctx, M_DEVBUF); +} + +struct vmbus_xact * +vmbus_xact_get(struct vmbus_xact_ctx *ctx, size_t req_len) +{ + struct vmbus_xact *xact; + + if (req_len > ctx->xc_req_size) + panic("invalid request size %zu", req_len); + + xact = vmbus_xact_get1(ctx, VMBUS_XACT_CTXF_DESTROY); + if (xact == NULL) + return (NULL); + + memset(xact->x_req, 0, req_len); + return (xact); +} + +void +vmbus_xact_put(struct vmbus_xact *xact) +{ + struct vmbus_xact_ctx *ctx = xact->x_ctx; + + KASSERT(ctx->xc_active == NULL, ("pending active xact")); + xact->x_resp = NULL; + + mtx_lock(&ctx->xc_free_lock); + KASSERT(ctx->xc_free == NULL, ("has free xact")); + ctx->xc_free = xact; + mtx_unlock(&ctx->xc_free_lock); + wakeup(&ctx->xc_free); +} + +void * +vmbus_xact_req_data(const struct vmbus_xact *xact) +{ + + return (xact->x_req); +} + +bus_addr_t +vmbus_xact_req_paddr(const struct vmbus_xact *xact) +{ + + return (xact->x_req_dma.hv_paddr); +} + +void * +vmbus_xact_priv(const struct vmbus_xact *xact, size_t priv_len) +{ + + if (priv_len > xact->x_ctx->xc_priv_size) + panic("invalid priv size %zu", priv_len); + return (xact->x_priv); +} + +void +vmbus_xact_activate(struct vmbus_xact *xact) +{ + struct vmbus_xact_ctx *ctx = xact->x_ctx; + + KASSERT(xact->x_resp == NULL, ("xact has pending response")); + + mtx_lock(&ctx->xc_active_lock); + KASSERT(ctx->xc_active == NULL, ("pending active xact")); + ctx->xc_active = xact; + mtx_unlock(&ctx->xc_active_lock); +} + +void +vmbus_xact_deactivate(struct vmbus_xact *xact) +{ + struct vmbus_xact_ctx *ctx = xact->x_ctx; + + mtx_lock(&ctx->xc_active_lock); + KASSERT(ctx->xc_active == xact, ("xact mismatch")); + ctx->xc_active = NULL; + mtx_unlock(&ctx->xc_active_lock); +} + +const void * +vmbus_xact_wait(struct vmbus_xact *xact, size_t *resp_len) +{ + 
struct vmbus_xact_ctx *ctx = xact->x_ctx; + const void *resp; + + mtx_lock(&ctx->xc_active_lock); + + KASSERT(ctx->xc_active == xact, ("xact mismatch")); + while (xact->x_resp == NULL) { + mtx_sleep(&ctx->xc_active, &ctx->xc_active_lock, 0, + "wxact", 0); + } + ctx->xc_active = NULL; + + resp = xact->x_resp; + *resp_len = xact->x_resp_len; + + mtx_unlock(&ctx->xc_active_lock); + + return (resp); +} + +static void +vmbus_xact_save_resp(struct vmbus_xact *xact, const void *data, size_t dlen) +{ + struct vmbus_xact_ctx *ctx = xact->x_ctx; + size_t cplen = dlen; + + mtx_assert(&ctx->xc_active_lock, MA_OWNED); + + if (cplen > ctx->xc_resp_size) { + printf("vmbus: xact response truncated %zu -> %zu\n", + cplen, ctx->xc_resp_size); + cplen = ctx->xc_resp_size; + } + + KASSERT(ctx->xc_active == xact, ("xact mismatch")); + memcpy(xact->x_resp0, data, cplen); + xact->x_resp_len = cplen; + xact->x_resp = xact->x_resp0; +} + +void +vmbus_xact_wakeup(struct vmbus_xact *xact, const void *data, size_t dlen) +{ + struct vmbus_xact_ctx *ctx = xact->x_ctx; + + mtx_lock(&ctx->xc_active_lock); + vmbus_xact_save_resp(xact, data, dlen); + mtx_unlock(&ctx->xc_active_lock); + wakeup(&ctx->xc_active); +} + +void +vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, const void *data, size_t dlen) +{ + mtx_lock(&ctx->xc_active_lock); + KASSERT(ctx->xc_active != NULL, ("no pending xact")); + vmbus_xact_save_resp(ctx->xc_active, data, dlen); + mtx_unlock(&ctx->xc_active_lock); + wakeup(&ctx->xc_active); +} diff --git a/sys/dev/iscsi/icl.c b/sys/dev/iscsi/icl.c index 7b085bd..07b1ec6 100644 --- a/sys/dev/iscsi/icl.c +++ b/sys/dev/iscsi/icl.c @@ -746,10 +746,6 @@ icl_receive_thread(void *arg) ic = arg; so = ic->ic_socket; - ICL_CONN_LOCK(ic); - ic->ic_receive_running = true; - ICL_CONN_UNLOCK(ic); - for (;;) { if (ic->ic_disconnecting) { //ICL_DEBUG("terminating"); @@ -971,8 +967,6 @@ icl_send_thread(void *arg) STAILQ_INIT(&queue); ICL_CONN_LOCK(ic); - ic->ic_send_running = true; - for (;;) { for 
(;;) { /* @@ -1224,35 +1218,45 @@ icl_conn_start(struct icl_conn *ic) } /* + * Register socket upcall, to get notified about incoming PDUs + * and free space to send outgoing ones. + */ + SOCKBUF_LOCK(&ic->ic_socket->so_snd); + soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); + SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); + SOCKBUF_LOCK(&ic->ic_socket->so_rcv); + soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); + SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); + + /* * Start threads. */ + ICL_CONN_LOCK(ic); + ic->ic_send_running = ic->ic_receive_running = true; + ICL_CONN_UNLOCK(ic); error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", ic->ic_name); if (error != 0) { ICL_WARN("kthread_add(9) failed with error %d", error); + ICL_CONN_LOCK(ic); + ic->ic_send_running = ic->ic_receive_running = false; + cv_signal(&ic->ic_send_cv); + ICL_CONN_UNLOCK(ic); icl_conn_close(ic); return (error); } - error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", ic->ic_name); if (error != 0) { ICL_WARN("kthread_add(9) failed with error %d", error); + ICL_CONN_LOCK(ic); + ic->ic_receive_running = false; + cv_signal(&ic->ic_send_cv); + ICL_CONN_UNLOCK(ic); icl_conn_close(ic); return (error); } - /* - * Register socket upcall, to get notified about incoming PDUs - * and free space to send outgoing ones. - */ - SOCKBUF_LOCK(&ic->ic_socket->so_snd); - soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic); - SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); - SOCKBUF_LOCK(&ic->ic_socket->so_rcv); - soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic); - SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); - return (0); } @@ -1306,46 +1310,42 @@ void icl_conn_close(struct icl_conn *ic) { struct icl_pdu *pdu; + struct socket *so; - ICL_CONN_LOCK_ASSERT_NOT(ic); - - ICL_CONN_LOCK(ic); - if (ic->ic_socket == NULL) { - ICL_CONN_UNLOCK(ic); - return; - } - - /* - * Deregister socket upcalls. 
- */ - ICL_CONN_UNLOCK(ic); - SOCKBUF_LOCK(&ic->ic_socket->so_snd); - if (ic->ic_socket->so_snd.sb_upcall != NULL) - soupcall_clear(ic->ic_socket, SO_SND); - SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); - SOCKBUF_LOCK(&ic->ic_socket->so_rcv); - if (ic->ic_socket->so_rcv.sb_upcall != NULL) - soupcall_clear(ic->ic_socket, SO_RCV); - SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); ICL_CONN_LOCK(ic); - ic->ic_disconnecting = true; - /* * Wake up the threads, so they can properly terminate. */ + ic->ic_disconnecting = true; while (ic->ic_receive_running || ic->ic_send_running) { - //ICL_DEBUG("waiting for send/receive threads to terminate"); cv_signal(&ic->ic_receive_cv); cv_signal(&ic->ic_send_cv); cv_wait(&ic->ic_send_cv, ic->ic_lock); } - //ICL_DEBUG("send/receive threads terminated"); + /* Some other thread could close the connection at the same time. */ + so = ic->ic_socket; + if (so == NULL) { + ICL_CONN_UNLOCK(ic); + return; + } + ic->ic_socket = NULL; + + /* + * Deregister socket upcalls. + */ ICL_CONN_UNLOCK(ic); - soclose(ic->ic_socket); + SOCKBUF_LOCK(&so->so_snd); + if (so->so_snd.sb_upcall != NULL) + soupcall_clear(so, SO_SND); + SOCKBUF_UNLOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_upcall != NULL) + soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + soclose(so); ICL_CONN_LOCK(ic); - ic->ic_socket = NULL; if (ic->ic_receive_pdu != NULL) { //ICL_DEBUG("freeing partially received PDU"); diff --git a/sys/dev/kbd/kbd.c b/sys/dev/kbd/kbd.c index f1a1b29..ea84d46 100644 --- a/sys/dev/kbd/kbd.c +++ b/sys/dev/kbd/kbd.c @@ -888,7 +888,7 @@ genkbd_commonioctl(keyboard_t *kbd, u_long cmd, caddr_t arg) case PIO_KEYMAP: /* set keyboard translation table */ case OPIO_KEYMAP: /* set keyboard translation table (compat) */ #ifndef KBD_DISABLE_KEYMAP_LOAD - mapp = malloc(sizeof *mapp, M_TEMP, M_NOWAIT); + mapp = malloc(sizeof *mapp, M_TEMP, M_WAITOK); if (cmd == OPIO_KEYMAP) { omapp = (okeymap_t *)arg; mapp->n_keys = omapp->n_keys; diff --git 
a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index bb90375c..9bf37bc 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -4576,6 +4576,7 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid, struct resource_list *rl = &dinfo->resources; struct resource *res; struct pci_map *pm; + uint16_t cmd; pci_addr_t map, testval; int mapsize; @@ -4660,8 +4661,17 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid, device_printf(child, "Lazy allocation of %#lx bytes rid %#x type %d at %#lx\n", count, *rid, type, rman_get_start(res)); + + /* Disable decoding via the CMD register before updating the BAR */ + cmd = pci_read_config(child, PCIR_COMMAND, 2); + pci_write_config(child, PCIR_COMMAND, + cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2); + map = rman_get_start(res); pci_write_bar(child, pm, map); + + /* Restore the original value of the CMD register */ + pci_write_config(child, PCIR_COMMAND, cmd, 2); out: return (res); } diff --git a/sys/dev/qlxgbe/ql_hw.c b/sys/dev/qlxgbe/ql_hw.c index ad8dba5..8c40aba 100644 --- a/sys/dev/qlxgbe/ql_hw.c +++ b/sys/dev/qlxgbe/ql_hw.c @@ -1128,12 +1128,21 @@ qla_config_intr_coalesce(qla_host_t *ha, uint16_t cntxt_id, int tenable, * Can be unicast, multicast or broadcast. */ static int -qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac) +qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac, + uint32_t num_mac) { q80_config_mac_addr_t *cmac; q80_config_mac_addr_rsp_t *cmac_rsp; uint32_t err; device_t dev = ha->pci_dev; + int i; + uint8_t *mac_cpy = mac_addr; + + if (num_mac > Q8_MAX_MAC_ADDRS) { + device_printf(dev, "%s: %s num_mac [0x%x] > Q8_MAX_MAC_ADDRS\n", + __func__, (add_mac ? 
"Add" : "Del"), num_mac); + return (-1); + } cmac = (q80_config_mac_addr_t *)ha->hw.mbox; bzero(cmac, (sizeof (q80_config_mac_addr_t))); @@ -1149,9 +1158,13 @@ qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac) cmac->cmd |= Q8_MBX_CMAC_CMD_CAM_INGRESS; - cmac->nmac_entries = 1; + cmac->nmac_entries = num_mac; cmac->cntxt_id = ha->hw.rcv_cntxt_id; - bcopy(mac_addr, cmac->mac_addr[0].addr, 6); + + for (i = 0; i < num_mac; i++) { + bcopy(mac_addr, cmac->mac_addr[i].addr, Q8_ETHER_ADDR_LEN); + mac_addr = mac_addr + ETHER_ADDR_LEN; + } if (qla_mbx_cmd(ha, (uint32_t *)cmac, (sizeof (q80_config_mac_addr_t) >> 2), @@ -1165,11 +1178,14 @@ qla_config_mac_addr(qla_host_t *ha, uint8_t *mac_addr, uint32_t add_mac) err = Q8_MBX_RSP_STATUS(cmac_rsp->regcnt_status); if (err) { - device_printf(dev, "%s: %s " - "%02x:%02x:%02x:%02x:%02x:%02x failed1 [0x%08x]\n", - __func__, (add_mac ? "Add" : "Del"), - mac_addr[0], mac_addr[1], mac_addr[2], - mac_addr[3], mac_addr[4], mac_addr[5], err); + device_printf(dev, "%s: %s failed1 [0x%08x]\n", __func__, + (add_mac ? 
"Add" : "Del"), err); + for (i = 0; i < num_mac; i++) { + device_printf(dev, "%s: %02x:%02x:%02x:%02x:%02x:%02x\n", + __func__, mac_cpy[0], mac_cpy[1], mac_cpy[2], + mac_cpy[3], mac_cpy[4], mac_cpy[5]); + mac_cpy += ETHER_ADDR_LEN; + } return (-1); } @@ -2254,6 +2270,7 @@ ql_del_hw_if(qla_host_t *ha) (void)qla_stop_nic_func(ha); qla_del_rcv_cntxt(ha); + qla_del_xmt_cntxt(ha); if (ha->hw.flags.init_intr_cnxt) { @@ -2270,6 +2287,7 @@ ql_del_hw_if(qla_host_t *ha) ha->hw.flags.init_intr_cnxt = 0; } + return; } @@ -2368,7 +2386,7 @@ ql_init_hw_if(qla_host_t *ha) } ha->hw.max_tx_segs = 0; - if (qla_config_mac_addr(ha, ha->hw.mac_addr, 1)) + if (qla_config_mac_addr(ha, ha->hw.mac_addr, 1, 1)) return(-1); ha->hw.flags.unicast_mac = 1; @@ -2376,7 +2394,7 @@ ql_init_hw_if(qla_host_t *ha) bcast_mac[0] = 0xFF; bcast_mac[1] = 0xFF; bcast_mac[2] = 0xFF; bcast_mac[3] = 0xFF; bcast_mac[4] = 0xFF; bcast_mac[5] = 0xFF; - if (qla_config_mac_addr(ha, bcast_mac, 1)) + if (qla_config_mac_addr(ha, bcast_mac, 1, 1)) return (-1); ha->hw.flags.bcast_mac = 1; @@ -2733,14 +2751,14 @@ qla_del_rcv_cntxt(qla_host_t *ha) bcast_mac[0] = 0xFF; bcast_mac[1] = 0xFF; bcast_mac[2] = 0xFF; bcast_mac[3] = 0xFF; bcast_mac[4] = 0xFF; bcast_mac[5] = 0xFF; - if (qla_config_mac_addr(ha, bcast_mac, 0)) + if (qla_config_mac_addr(ha, bcast_mac, 0, 1)) return; ha->hw.flags.bcast_mac = 0; } if (ha->hw.flags.unicast_mac) { - if (qla_config_mac_addr(ha, ha->hw.mac_addr, 0)) + if (qla_config_mac_addr(ha, ha->hw.mac_addr, 0, 1)) return; ha->hw.flags.unicast_mac = 0; } @@ -2926,12 +2944,20 @@ qla_init_xmt_cntxt(qla_host_t *ha) } static int -qla_hw_add_all_mcast(qla_host_t *ha) +qla_hw_all_mcast(qla_host_t *ha, uint32_t add_mcast) { int i, nmcast; + uint32_t count = 0; + uint8_t *mcast; nmcast = ha->hw.nmcast; + QL_DPRINT2(ha, (ha->pci_dev, + "%s:[0x%x] enter nmcast = %d \n", __func__, add_mcast, nmcast)); + + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); + for (i = 0 ; ((i < 
Q8_MAX_NUM_MULTICAST_ADDRS) && nmcast); i++) { if ((ha->hw.mcast[i].addr[0] != 0) || (ha->hw.mcast[i].addr[1] != 0) || @@ -2940,52 +2966,80 @@ qla_hw_add_all_mcast(qla_host_t *ha) (ha->hw.mcast[i].addr[4] != 0) || (ha->hw.mcast[i].addr[5] != 0)) { - if (qla_config_mac_addr(ha, ha->hw.mcast[i].addr, 1)) { - device_printf(ha->pci_dev, "%s: failed\n", - __func__); - return (-1); + bcopy(ha->hw.mcast[i].addr, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + + if (count == Q8_MAX_MAC_ADDRS) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, + add_mcast, count)) { + device_printf(ha->pci_dev, + "%s: failed\n", __func__); + return (-1); + } + + count = 0; + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, + (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); } nmcast--; } } + + if (count) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, add_mcast, + count)) { + device_printf(ha->pci_dev, "%s: failed\n", __func__); + return (-1); + } + } + QL_DPRINT2(ha, (ha->pci_dev, + "%s:[0x%x] exit nmcast = %d \n", __func__, add_mcast, nmcast)); + return 0; } static int -qla_hw_del_all_mcast(qla_host_t *ha) +qla_hw_add_all_mcast(qla_host_t *ha) { - int i, nmcast; + int ret; - nmcast = ha->hw.nmcast; + ret = qla_hw_all_mcast(ha, 1); - for (i = 0 ; ((i < Q8_MAX_NUM_MULTICAST_ADDRS) && nmcast); i++) { - if ((ha->hw.mcast[i].addr[0] != 0) || - (ha->hw.mcast[i].addr[1] != 0) || - (ha->hw.mcast[i].addr[2] != 0) || - (ha->hw.mcast[i].addr[3] != 0) || - (ha->hw.mcast[i].addr[4] != 0) || - (ha->hw.mcast[i].addr[5] != 0)) { + return (ret); +} - if (qla_config_mac_addr(ha, ha->hw.mcast[i].addr, 0)) - return (-1); +static int +qla_hw_del_all_mcast(qla_host_t *ha) +{ + int ret; - nmcast--; - } - } - return 0; + ret = qla_hw_all_mcast(ha, 0); + + bzero(ha->hw.mcast, (sizeof (qla_mcast_t) * Q8_MAX_NUM_MULTICAST_ADDRS)); + ha->hw.nmcast = 0; + + return (ret); } static int -qla_hw_add_mcast(qla_host_t *ha, uint8_t *mta) +qla_hw_mac_addr_present(qla_host_t *ha, uint8_t *mta) { int i; for (i 
= 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { - if (QL_MAC_CMP(ha->hw.mcast[i].addr, mta) == 0) - return 0; /* its been already added */ + return (0); /* its been already added */ } + return (-1); +} + +static int +qla_hw_add_mcast(qla_host_t *ha, uint8_t *mta, uint32_t nmcast) +{ + int i; for (i = 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { @@ -2996,29 +3050,28 @@ qla_hw_add_mcast(qla_host_t *ha, uint8_t *mta) (ha->hw.mcast[i].addr[4] == 0) && (ha->hw.mcast[i].addr[5] == 0)) { - if (qla_config_mac_addr(ha, mta, 1)) - return (-1); - bcopy(mta, ha->hw.mcast[i].addr, Q8_MAC_ADDR_LEN); ha->hw.nmcast++; - return 0; + mta = mta + ETHER_ADDR_LEN; + nmcast--; + + if (nmcast == 0) + break; } + } return 0; } static int -qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta) +qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta, uint32_t nmcast) { int i; for (i = 0; i < Q8_MAX_NUM_MULTICAST_ADDRS; i++) { if (QL_MAC_CMP(ha->hw.mcast[i].addr, mta) == 0) { - if (qla_config_mac_addr(ha, mta, 0)) - return (-1); - ha->hw.mcast[i].addr[0] = 0; ha->hw.mcast[i].addr[1] = 0; ha->hw.mcast[i].addr[2] = 0; @@ -3028,7 +3081,11 @@ qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta) ha->hw.nmcast--; - return 0; + mta = mta + ETHER_ADDR_LEN; + nmcast--; + + if (nmcast == 0) + break; } } return 0; @@ -3036,30 +3093,75 @@ qla_hw_del_mcast(qla_host_t *ha, uint8_t *mta) /* * Name: ql_hw_set_multi - * Function: Sets the Multicast Addresses provided the host O.S into the + * Function: Sets the Multicast Addresses provided by the host O.S into the * hardware (for the given interface) */ int -ql_hw_set_multi(qla_host_t *ha, uint8_t *mcast, uint32_t mcnt, +ql_hw_set_multi(qla_host_t *ha, uint8_t *mcast_addr, uint32_t mcnt, uint32_t add_mac) { + uint8_t *mta = mcast_addr; int i; - uint8_t *mta = mcast; int ret = 0; + uint32_t count = 0; + uint8_t *mcast; + + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); for (i = 0; i < mcnt; i++) { - if (add_mac) { - ret = qla_hw_add_mcast(ha, mta); - 
if (ret) - break; - } else { - ret = qla_hw_del_mcast(ha, mta); - if (ret) - break; + if (mta[0] || mta[1] || mta[2] || mta[3] || mta[4] || mta[5]) { + if (add_mac) { + if (qla_hw_mac_addr_present(ha, mta) != 0) { + bcopy(mta, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + } + } else { + if (qla_hw_mac_addr_present(ha, mta) == 0) { + bcopy(mta, mcast, ETHER_ADDR_LEN); + mcast = mcast + ETHER_ADDR_LEN; + count++; + } + } + } + if (count == Q8_MAX_MAC_ADDRS) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, + add_mac, count)) { + device_printf(ha->pci_dev, "%s: failed\n", + __func__); + return (-1); + } + + if (add_mac) { + qla_hw_add_mcast(ha, ha->hw.mac_addr_arr, + count); + } else { + qla_hw_del_mcast(ha, ha->hw.mac_addr_arr, + count); + } + + count = 0; + mcast = ha->hw.mac_addr_arr; + memset(mcast, 0, (Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)); } mta += Q8_MAC_ADDR_LEN; } + + if (count) { + if (qla_config_mac_addr(ha, ha->hw.mac_addr_arr, add_mac, + count)) { + device_printf(ha->pci_dev, "%s: failed\n", __func__); + return (-1); + } + if (add_mac) { + qla_hw_add_mcast(ha, ha->hw.mac_addr_arr, count); + } else { + qla_hw_del_mcast(ha, ha->hw.mac_addr_arr, count); + } + } + return (ret); } diff --git a/sys/dev/qlxgbe/ql_hw.h b/sys/dev/qlxgbe/ql_hw.h index ff33219..e50bc5e 100644 --- a/sys/dev/qlxgbe/ql_hw.h +++ b/sys/dev/qlxgbe/ql_hw.h @@ -210,7 +210,7 @@ #define Q8_NUM_MBOX 512 -#define Q8_MAX_NUM_MULTICAST_ADDRS 1023 +#define Q8_MAX_NUM_MULTICAST_ADDRS 1022 #define Q8_MAC_ADDR_LEN 6 /* @@ -511,8 +511,9 @@ typedef struct _q80_config_intr_coalesc_rsp { /* * Configure MAC Address */ +#define Q8_ETHER_ADDR_LEN 6 typedef struct _q80_mac_addr { - uint8_t addr[6]; + uint8_t addr[Q8_ETHER_ADDR_LEN]; uint16_t vlan_tci; } __packed q80_mac_addr_t; @@ -1548,7 +1549,7 @@ typedef struct _qla_hw_tx_cntxt { typedef struct _qla_mcast { uint16_t rsrvd; - uint8_t addr[6]; + uint8_t addr[ETHER_ADDR_LEN]; } __packed qla_mcast_t; typedef struct _qla_rdesc { @@ 
-1660,6 +1661,7 @@ typedef struct _qla_hw { /* multicast address list */ uint32_t nmcast; qla_mcast_t mcast[Q8_MAX_NUM_MULTICAST_ADDRS]; + uint8_t mac_addr_arr[(Q8_MAX_MAC_ADDRS * ETHER_ADDR_LEN)]; /* reset sequence */ #define Q8_MAX_RESET_SEQ_IDX 16 diff --git a/sys/dev/qlxgbe/ql_os.c b/sys/dev/qlxgbe/ql_os.c index 9ebd2ef..ecccdb81 100644 --- a/sys/dev/qlxgbe/ql_os.c +++ b/sys/dev/qlxgbe/ql_os.c @@ -243,6 +243,8 @@ qla_watchdog(void *arg) ha->flags.qla_watchdog_pause = 1; ha->qla_initiate_recovery = 0; ha->err_inject = 0; + device_printf(ha->pci_dev, + "%s: taskqueue_enqueue(err_task) \n", __func__); taskqueue_enqueue(ha->err_tq, &ha->err_task); } else if (ha->flags.qla_interface_up) { @@ -452,7 +454,7 @@ qla_pci_attach(device_t dev) TASK_INIT(&ha->tx_task, 0, qla_tx_done, ha); - ha->tx_tq = taskqueue_create_fast("qla_txq", M_NOWAIT, + ha->tx_tq = taskqueue_create("qla_txq", M_NOWAIT, taskqueue_thread_enqueue, &ha->tx_tq); taskqueue_start_threads(&ha->tx_tq, 1, PI_NET, "%s txq", device_get_nameunit(ha->pci_dev)); @@ -470,13 +472,13 @@ qla_pci_attach(device_t dev) qla_watchdog, ha); TASK_INIT(&ha->err_task, 0, qla_error_recovery, ha); - ha->err_tq = taskqueue_create_fast("qla_errq", M_NOWAIT, + ha->err_tq = taskqueue_create("qla_errq", M_NOWAIT, taskqueue_thread_enqueue, &ha->err_tq); taskqueue_start_threads(&ha->err_tq, 1, PI_NET, "%s errq", device_get_nameunit(ha->pci_dev)); TASK_INIT(&ha->async_event_task, 0, qla_async_event, ha); - ha->async_event_tq = taskqueue_create_fast("qla_asyncq", M_NOWAIT, + ha->async_event_tq = taskqueue_create("qla_asyncq", M_NOWAIT, taskqueue_thread_enqueue, &ha->async_event_tq); taskqueue_start_threads(&ha->async_event_tq, 1, PI_NET, "%s asyncq", device_get_nameunit(ha->pci_dev)); diff --git a/sys/dev/qlxgbe/ql_ver.h b/sys/dev/qlxgbe/ql_ver.h index de85173..182fa32 100644 --- a/sys/dev/qlxgbe/ql_ver.h +++ b/sys/dev/qlxgbe/ql_ver.h @@ -36,6 +36,6 @@ #define QLA_VERSION_MAJOR 3 #define QLA_VERSION_MINOR 10 -#define QLA_VERSION_BUILD 
30 +#define QLA_VERSION_BUILD 31 #endif /* #ifndef _QL_VER_H_ */ diff --git a/sys/dev/sfxge/common/ef10_nic.c b/sys/dev/sfxge/common/ef10_nic.c index 3c40588..6423154 100644 --- a/sys/dev/sfxge/common/ef10_nic.c +++ b/sys/dev/sfxge/common/ef10_nic.c @@ -164,6 +164,7 @@ ef10_nic_get_port_mode_bandwidth( break; case TLV_PORT_MODE_10G_10G_10G_10G: case TLV_PORT_MODE_10G_10G_10G_10G_Q: + case TLV_PORT_MODE_10G_10G_10G_10G_Q1_Q2: case TLV_PORT_MODE_10G_10G_10G_10G_Q2: bandwidth = 10000 * 4; break; @@ -1098,57 +1099,74 @@ fail1: /* - * The external port mapping is a one-based numbering of the external - * connectors on the board. It does not distinguish off-board separated - * outputs such as multi-headed cables. - * The number of ports that map to each external port connector - * on the board is determined by the chip family and the port modes to - * which the NIC can be configured. The mapping table lists modes with - * port numbering requirements in increasing order. + * Table of mapping schemes from port number to the number of the external + * connector on the board. The external numbering does not distinguish + * off-board separated outputs such as from multi-headed cables. + * + * The count of adjacent port numbers that map to each external port + * and the offset in the numbering, is determined by the chip family and + * current port mode. + * + * For the Huntington family, the current port mode cannot be discovered, + * so the mapping used is instead the last match in the table to the full + * set of port modes to which the NIC can be configured. Therefore the + * ordering of entries in the mapping table is significant. 
*/ static struct { efx_family_t family; uint32_t modes_mask; - uint32_t stride; + int32_t count; + int32_t offset; } __ef10_external_port_mappings[] = { - /* Supported modes requiring 1 output per port */ + /* Supported modes with 1 output per external port */ { EFX_FAMILY_HUNTINGTON, (1 << TLV_PORT_MODE_10G) | (1 << TLV_PORT_MODE_10G_10G) | (1 << TLV_PORT_MODE_10G_10G_10G_10G), + 1, 1 }, { EFX_FAMILY_MEDFORD, (1 << TLV_PORT_MODE_10G) | - (1 << TLV_PORT_MODE_10G_10G) | - (1 << TLV_PORT_MODE_10G_10G_10G_10G), + (1 << TLV_PORT_MODE_10G_10G), + 1, 1 }, - /* Supported modes requiring 2 outputs per port */ + /* Supported modes with 2 outputs per external port */ { EFX_FAMILY_HUNTINGTON, (1 << TLV_PORT_MODE_40G) | (1 << TLV_PORT_MODE_40G_40G) | (1 << TLV_PORT_MODE_40G_10G_10G) | (1 << TLV_PORT_MODE_10G_10G_40G), - 2 + 2, + 1 }, { EFX_FAMILY_MEDFORD, (1 << TLV_PORT_MODE_40G) | (1 << TLV_PORT_MODE_40G_40G) | (1 << TLV_PORT_MODE_40G_10G_10G) | - (1 << TLV_PORT_MODE_10G_10G_40G), - 2 + (1 << TLV_PORT_MODE_10G_10G_40G) | + (1 << TLV_PORT_MODE_10G_10G_10G_10G_Q1_Q2), + 2, + 1 }, - /* Supported modes requiring 4 outputs per port */ + /* Supported modes with 4 outputs per external port */ { EFX_FAMILY_MEDFORD, (1 << TLV_PORT_MODE_10G_10G_10G_10G_Q) | + (1 << TLV_PORT_MODE_10G_10G_10G_10G_Q1), + 4, + 1, + }, + { + EFX_FAMILY_MEDFORD, (1 << TLV_PORT_MODE_10G_10G_10G_10G_Q2), - 4 + 4, + 2 }, }; @@ -1162,11 +1180,26 @@ ef10_external_port_mapping( int i; uint32_t port_modes; uint32_t matches; - uint32_t stride = 1; /* default 1-1 mapping */ - - if ((rc = efx_mcdi_get_port_modes(enp, &port_modes, NULL)) != 0) { - /* No port mode information available - use default mapping */ - goto out; + uint32_t current; + int32_t count = 1; /* Default 1-1 mapping */ + int32_t offset = 1; /* Default starting external port number */ + + if ((rc = efx_mcdi_get_port_modes(enp, &port_modes, &current)) != 0) { + /* + * No current port mode information + * - infer mapping from available modes + */ + if ((rc = 
efx_mcdi_get_port_modes(enp, + &port_modes, NULL)) != 0) { + /* + * No port mode information available + * - use default mapping + */ + goto out; + } + } else { + /* Only need to scan the current mode */ + port_modes = 1 << current; } /* @@ -1180,7 +1213,8 @@ ef10_external_port_mapping( matches = (__ef10_external_port_mappings[i].modes_mask & port_modes); if (matches != 0) { - stride = __ef10_external_port_mappings[i].stride; + count = __ef10_external_port_mappings[i].count; + offset = __ef10_external_port_mappings[i].offset; port_modes &= ~matches; } } @@ -1194,9 +1228,9 @@ ef10_external_port_mapping( out: /* * Scale as required by last matched mode and then convert to - * one-based numbering + * correctly offset numbering */ - *external_portp = (uint8_t)(port / stride) + 1; + *external_portp = (uint8_t)((port / count) + offset); return (0); fail1: diff --git a/sys/dev/sfxge/common/ef10_tlv_layout.h b/sys/dev/sfxge/common/ef10_tlv_layout.h index 811d7c3..033cdf9 100644 --- a/sys/dev/sfxge/common/ef10_tlv_layout.h +++ b/sys/dev/sfxge/common/ef10_tlv_layout.h @@ -553,12 +553,14 @@ struct tlv_global_port_mode { #define TLV_PORT_MODE_40G (1) /* 40G, single QSFP/40G-KR */ #define TLV_PORT_MODE_10G_10G (2) /* 2x10G, dual SFP/10G-KR or single QSFP */ #define TLV_PORT_MODE_40G_40G (3) /* 40G + 40G, dual QSFP/40G-KR (Greenport, Medford) */ -#define TLV_PORT_MODE_10G_10G_10G_10G (4) /* 2x10G + 2x10G, quad SFP/10G-KR or dual QSFP (Greenport, Medford) */ -#define TLV_PORT_MODE_10G_10G_10G_10G_Q (5) /* 4x10G, single QSFP, cage 0 (Medford) */ +#define TLV_PORT_MODE_10G_10G_10G_10G (4) /* 2x10G + 2x10G, quad SFP/10G-KR or dual QSFP (Greenport) */ +#define TLV_PORT_MODE_10G_10G_10G_10G_Q1 (4) /* 4x10G, single QSFP, cage 0 (Medford) */ +#define TLV_PORT_MODE_10G_10G_10G_10G_Q (5) /* 4x10G, single QSFP, cage 0 (Medford) OBSOLETE DO NOT USE */ #define TLV_PORT_MODE_40G_10G_10G (6) /* 1x40G + 2x10G, dual QSFP (Greenport, Medford) */ #define TLV_PORT_MODE_10G_10G_40G (7) /* 2x10G + 
1x40G, dual QSFP (Greenport, Medford) */ #define TLV_PORT_MODE_10G_10G_10G_10G_Q2 (8) /* 4x10G, single QSFP, cage 1 (Medford) */ -#define TLV_PORT_MODE_MAX TLV_PORT_MODE_10G_10G_10G_10G_Q2 +#define TLV_PORT_MODE_10G_10G_10G_10G_Q1_Q2 (9) /* 2x10G + 2x10G, dual QSFP (Medford) */ +#define TLV_PORT_MODE_MAX TLV_PORT_MODE_10G_10G_10G_10G_Q1_Q2 }; /* Type of the v-switch created implicitly by the firmware */ @@ -765,8 +767,8 @@ struct tlv_rx_event_merging_config { #define TLV_RX_EVENT_MERGING_CONFIG_MAX_EVENTS_MAX ((1 << 4) - 1) uint32_t timeout_ns; }; -#define TLV_RX_EVENT_MERGING_MAX_EVENTS_DEFAULT 7 -#define TLV_RX_EVENT_MERGING_TIMEOUT_NS_DEFAULT 8740 +#define TLV_RX_EVENT_MERGING_MAX_EVENTS_DEFAULT (0xffffffff) +#define TLV_RX_EVENT_MERGING_TIMEOUT_NS_DEFAULT (0xffffffff) #define TLV_TAG_PCIE_LINK_SETTINGS (0x101f0000) struct tlv_pcie_link_settings { @@ -791,9 +793,9 @@ struct tlv_tx_event_merging_config { uint32_t timeout_ns; uint32_t qempty_timeout_ns; /* Medford only */ }; -#define TLV_TX_EVENT_MERGING_MAX_EVENTS_DEFAULT 7 -#define TLV_TX_EVENT_MERGING_TIMEOUT_NS_DEFAULT 1400 -#define TLV_TX_EVENT_MERGING_QEMPTY_TIMEOUT_NS_DEFAULT 700 +#define TLV_TX_EVENT_MERGING_MAX_EVENTS_DEFAULT (0xffffffff) +#define TLV_TX_EVENT_MERGING_TIMEOUT_NS_DEFAULT (0xffffffff) +#define TLV_TX_EVENT_MERGING_QEMPTY_TIMEOUT_NS_DEFAULT (0xffffffff) /* Tx vFIFO Low latency configuration * @@ -809,6 +811,20 @@ struct tlv_tx_vfifo_ull_mode { #define TLV_TX_VFIFO_ULL_MODE_DEFAULT 0 }; +/* BIU mode + * + * Medford2 tag for selecting VI window decode (see values below) + */ +#define TLV_TAG_BIU_VI_WINDOW_MODE (0x10280000) +struct tlv_biu_vi_window_mode { + uint32_t tag; + uint32_t length; + uint8_t mode; +#define TLV_BIU_VI_WINDOW_MODE_8K 0 /* 8k per VI, CTPIO not mapped, medford/hunt compatible */ +#define TLV_BIU_VI_WINDOW_MODE_16K 1 /* 16k per VI, CTPIO mapped */ +#define TLV_BIU_VI_WINDOW_MODE_64K 2 /* 64k per VI, CTPIO mapped, POWER-friendly */ +}; + #define TLV_TAG_LICENSE (0x30800000) 
typedef struct tlv_license { diff --git a/sys/dev/usb/net/if_urndis.c b/sys/dev/usb/net/if_urndis.c index 5d2a637..ceb6acf 100644 --- a/sys/dev/usb/net/if_urndis.c +++ b/sys/dev/usb/net/if_urndis.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/rndis.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -79,17 +80,17 @@ static uether_fn_t urndis_setmulti; static uether_fn_t urndis_setpromisc; static uint32_t urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid, - struct urndis_query_req *msg, uint16_t len, + struct rndis_query_req *msg, uint16_t len, const void **rbuf, uint16_t *rbufsz); static uint32_t urndis_ctrl_set(struct urndis_softc *sc, uint32_t oid, - struct urndis_set_req *msg, uint16_t len); + struct rndis_set_req *msg, uint16_t len); static uint32_t urndis_ctrl_handle_init(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr); + const struct rndis_comp_hdr *hdr); static uint32_t urndis_ctrl_handle_query(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr, const void **buf, + const struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz); static uint32_t urndis_ctrl_handle_reset(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr); + const struct rndis_comp_hdr *hdr); static uint32_t urndis_ctrl_init(struct urndis_softc *sc); static uint32_t urndis_ctrl_halt(struct urndis_softc *sc); @@ -211,8 +212,8 @@ urndis_attach(device_t dev) { static struct { union { - struct urndis_query_req query; - struct urndis_set_req set; + struct rndis_query_req query; + struct rndis_set_req set; } hdr; union { uint8_t eaddr[ETHER_ADDR_LEN]; @@ -290,8 +291,8 @@ urndis_attach(device_t dev) memcpy(&sc->sc_ue.ue_eaddr, buf, ETHER_ADDR_LEN); /* Initialize packet filter */ - sc->sc_filter = RNDIS_PACKET_TYPE_BROADCAST | - RNDIS_PACKET_TYPE_ALL_MULTICAST; + sc->sc_filter = NDIS_PACKET_TYPE_BROADCAST | + NDIS_PACKET_TYPE_ALL_MULTICAST; msg.ibuf.filter = htole32(sc->sc_filter); 
URNDIS_LOCK(sc); error = urndis_ctrl_set(sc, OID_GEN_CURRENT_PACKET_FILTER, @@ -452,10 +453,10 @@ urndis_ctrl_send(struct urndis_softc *sc, void *buf, uint16_t len) return (err); } -static struct urndis_comp_hdr * +static struct rndis_comp_hdr * urndis_ctrl_recv(struct urndis_softc *sc) { - struct urndis_comp_hdr *hdr; + struct rndis_comp_hdr *hdr; usb_error_t err; err = urndis_ctrl_msg(sc, UT_READ_CLASS_INTERFACE, @@ -465,7 +466,7 @@ urndis_ctrl_recv(struct urndis_softc *sc) if (err != USB_ERR_NORMAL_COMPLETION) return (NULL); - hdr = (struct urndis_comp_hdr *)sc->sc_response_buf; + hdr = (struct rndis_comp_hdr *)sc->sc_response_buf; DPRINTF("type 0x%x len %u\n", le32toh(hdr->rm_type), le32toh(hdr->rm_len)); @@ -479,7 +480,7 @@ urndis_ctrl_recv(struct urndis_softc *sc) } static uint32_t -urndis_ctrl_handle(struct urndis_softc *sc, struct urndis_comp_hdr *hdr, +urndis_ctrl_handle(struct urndis_softc *sc, struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz) { uint32_t rval; @@ -520,11 +521,11 @@ urndis_ctrl_handle(struct urndis_softc *sc, struct urndis_comp_hdr *hdr, static uint32_t urndis_ctrl_handle_init(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr) + const struct rndis_comp_hdr *hdr) { - const struct urndis_init_comp *msg; + const struct rndis_init_comp *msg; - msg = (const struct urndis_init_comp *)hdr; + msg = (const struct rndis_init_comp *)hdr; DPRINTF("len %u rid %u status 0x%x " "ver_major %u ver_minor %u devflags 0x%x medium 0x%x pktmaxcnt %u " @@ -563,12 +564,12 @@ urndis_ctrl_handle_init(struct urndis_softc *sc, static uint32_t urndis_ctrl_handle_query(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz) + const struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz) { - const struct urndis_query_comp *msg; + const struct rndis_query_comp *msg; uint64_t limit; - msg = (const struct urndis_query_comp *)hdr; + msg = (const struct rndis_query_comp *)hdr; DPRINTF("len %u rid %u 
status 0x%x " "buflen %u bufoff %u\n", @@ -608,12 +609,12 @@ urndis_ctrl_handle_query(struct urndis_softc *sc, static uint32_t urndis_ctrl_handle_reset(struct urndis_softc *sc, - const struct urndis_comp_hdr *hdr) + const struct rndis_comp_hdr *hdr) { - const struct urndis_reset_comp *msg; + const struct rndis_reset_comp *msg; uint32_t rval; - msg = (const struct urndis_reset_comp *)hdr; + msg = (const struct rndis_reset_comp *)hdr; rval = le32toh(msg->rm_status); @@ -629,7 +630,7 @@ urndis_ctrl_handle_reset(struct urndis_softc *sc, } if (msg->rm_adrreset != 0) { struct { - struct urndis_set_req hdr; + struct rndis_set_req hdr; uint32_t filter; } msg_filter; @@ -649,14 +650,14 @@ urndis_ctrl_handle_reset(struct urndis_softc *sc, static uint32_t urndis_ctrl_init(struct urndis_softc *sc) { - struct urndis_init_req msg; - struct urndis_comp_hdr *hdr; + struct rndis_init_req msg; + struct rndis_comp_hdr *hdr; uint32_t rval; msg.rm_type = htole32(REMOTE_NDIS_INITIALIZE_MSG); msg.rm_len = htole32(sizeof(msg)); msg.rm_rid = 0; - msg.rm_ver_major = htole32(1); + msg.rm_ver_major = htole32(RNDIS_VERSION_MAJOR); msg.rm_ver_minor = htole32(1); msg.rm_max_xfersz = htole32(RNDIS_RX_MAXLEN); @@ -687,7 +688,7 @@ urndis_ctrl_init(struct urndis_softc *sc) static uint32_t urndis_ctrl_halt(struct urndis_softc *sc) { - struct urndis_halt_req msg; + struct rndis_halt_req msg; uint32_t rval; msg.rm_type = htole32(REMOTE_NDIS_HALT_MSG); @@ -713,10 +714,10 @@ urndis_ctrl_halt(struct urndis_softc *sc) */ static uint32_t urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid, - struct urndis_query_req *msg, uint16_t len, const void **rbuf, + struct rndis_query_req *msg, uint16_t len, const void **rbuf, uint16_t *rbufsz) { - struct urndis_comp_hdr *hdr; + struct rndis_comp_hdr *hdr; uint32_t datalen, rval; msg->rm_type = htole32(REMOTE_NDIS_QUERY_MSG); @@ -760,9 +761,9 @@ urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid, static uint32_t urndis_ctrl_set(struct urndis_softc *sc, 
uint32_t oid, - struct urndis_set_req *msg, uint16_t len) + struct rndis_set_req *msg, uint16_t len) { - struct urndis_comp_hdr *hdr; + struct rndis_comp_hdr *hdr; uint32_t datalen, rval; msg->rm_type = htole32(REMOTE_NDIS_SET_MSG); @@ -812,7 +813,7 @@ urndis_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error) struct urndis_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc = usbd_xfer_get_frame(xfer, 0); struct ifnet *ifp = uether_getifp(&sc->sc_ue); - struct urndis_packet_msg msg; + struct rndis_packet_msg msg; struct mbuf *m; int actlen; int aframes; @@ -872,11 +873,11 @@ urndis_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error) "datalen %u\n", msg.rm_datalen, actlen); goto tr_setup; } else if ((msg.rm_dataoffset + msg.rm_datalen + - (uint32_t)__offsetof(struct urndis_packet_msg, + (uint32_t)__offsetof(struct rndis_packet_msg, rm_dataoffset)) > (uint32_t)actlen) { DPRINTF("invalid dataoffset %u larger than %u\n", msg.rm_dataoffset + msg.rm_datalen + - (uint32_t)__offsetof(struct urndis_packet_msg, + (uint32_t)__offsetof(struct rndis_packet_msg, rm_dataoffset), actlen); goto tr_setup; } else if (msg.rm_datalen < (uint32_t)sizeof(struct ether_header)) { @@ -902,7 +903,7 @@ urndis_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error) m_adj(m, ETHER_ALIGN); usbd_copy_out(pc, offset + msg.rm_dataoffset + - __offsetof(struct urndis_packet_msg, + __offsetof(struct rndis_packet_msg, rm_dataoffset), m->m_data, msg.rm_datalen); /* enqueue */ @@ -938,7 +939,7 @@ tr_setup: static void urndis_bulk_write_callback(struct usb_xfer *xfer, usb_error_t error) { - struct urndis_packet_msg msg; + struct rndis_packet_msg msg; struct urndis_softc *sc = usbd_xfer_softc(xfer); struct ifnet *ifp = uether_getifp(&sc->sc_ue); struct mbuf *m; diff --git a/sys/dev/usb/net/if_urndisreg.h b/sys/dev/usb/net/if_urndisreg.h index 6dc3660..c76807a 100644 --- a/sys/dev/usb/net/if_urndisreg.h +++ b/sys/dev/usb/net/if_urndisreg.h @@ -55,242 +55,4 @@ struct 
urndis_softc { #define URNDIS_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define URNDIS_LOCK_ASSERT(sc, what) mtx_assert(&(sc)->sc_mtx, (what)) -#define RNDIS_STATUS_BUFFER_OVERFLOW 0x80000005L -#define RNDIS_STATUS_FAILURE 0xC0000001L -#define RNDIS_STATUS_INVALID_DATA 0xC0010015L -#define RNDIS_STATUS_MEDIA_CONNECT 0x4001000BL -#define RNDIS_STATUS_MEDIA_DISCONNECT 0x4001000CL -#define RNDIS_STATUS_NOT_SUPPORTED 0xC00000BBL -#define RNDIS_STATUS_PENDING STATUS_PENDING /* XXX */ -#define RNDIS_STATUS_RESOURCES 0xC000009AL -#define RNDIS_STATUS_SUCCESS 0x00000000L - -#define OID_GEN_SUPPORTED_LIST 0x00010101 -#define OID_GEN_HARDWARE_STATUS 0x00010102 -#define OID_GEN_MEDIA_SUPPORTED 0x00010103 -#define OID_GEN_MEDIA_IN_USE 0x00010104 -#define OID_GEN_MAXIMUM_LOOKAHEAD 0x00010105 -#define OID_GEN_MAXIMUM_FRAME_SIZE 0x00010106 -#define OID_GEN_LINK_SPEED 0x00010107 -#define OID_GEN_TRANSMIT_BUFFER_SPACE 0x00010108 -#define OID_GEN_RECEIVE_BUFFER_SPACE 0x00010109 -#define OID_GEN_TRANSMIT_BLOCK_SIZE 0x0001010A -#define OID_GEN_RECEIVE_BLOCK_SIZE 0x0001010B -#define OID_GEN_VENDOR_ID 0x0001010C -#define OID_GEN_VENDOR_DESCRIPTION 0x0001010D -#define OID_GEN_CURRENT_PACKET_FILTER 0x0001010E -#define OID_GEN_CURRENT_LOOKAHEAD 0x0001010F -#define OID_GEN_DRIVER_VERSION 0x00010110 -#define OID_GEN_MAXIMUM_TOTAL_SIZE 0x00010111 -#define OID_GEN_PROTOCOL_OPTIONS 0x00010112 -#define OID_GEN_MAC_OPTIONS 0x00010113 -#define OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 -#define OID_GEN_MAXIMUM_SEND_PACKETS 0x00010115 -#define OID_GEN_VENDOR_DRIVER_VERSION 0x00010116 -#define OID_GEN_SUPPORTED_GUIDS 0x00010117 -#define OID_GEN_NETWORK_LAYER_ADDRESSES 0x00010118 -#define OID_GEN_TRANSPORT_HEADER_OFFSET 0x00010119 -#define OID_GEN_MACHINE_NAME 0x0001021A -#define OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B -#define OID_GEN_VLAN_ID 0x0001021C - -#define OID_802_3_PERMANENT_ADDRESS 0x01010101 -#define OID_802_3_CURRENT_ADDRESS 0x01010102 -#define OID_802_3_MULTICAST_LIST 0x01010103 -#define 
OID_802_3_MAXIMUM_LIST_SIZE 0x01010104 -#define OID_802_3_MAC_OPTIONS 0x01010105 -#define OID_802_3_RCV_ERROR_ALIGNMENT 0x01020101 -#define OID_802_3_XMIT_ONE_COLLISION 0x01020102 -#define OID_802_3_XMIT_MORE_COLLISIONS 0x01020103 -#define OID_802_3_XMIT_DEFERRED 0x01020201 -#define OID_802_3_XMIT_MAX_COLLISIONS 0x01020202 -#define OID_802_3_RCV_OVERRUN 0x01020203 -#define OID_802_3_XMIT_UNDERRUN 0x01020204 -#define OID_802_3_XMIT_HEARTBEAT_FAILURE 0x01020205 -#define OID_802_3_XMIT_TIMES_CRS_LOST 0x01020206 -#define OID_802_3_XMIT_LATE_COLLISIONS 0x01020207 - -#define RNDIS_MEDIUM_802_3 0x00000000 - -/* Device flags */ -#define RNDIS_DF_CONNECTIONLESS 0x00000001 -#define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 - -/* - * RNDIS data message - */ -#define REMOTE_NDIS_PACKET_MSG 0x00000001 - -struct urndis_packet_msg { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_dataoffset; - uint32_t rm_datalen; - uint32_t rm_oobdataoffset; - uint32_t rm_oobdatalen; - uint32_t rm_oobdataelements; - uint32_t rm_pktinfooffset; - uint32_t rm_pktinfolen; - uint32_t rm_vchandle; - uint32_t rm_reserved; -}; - -/* - * RNDIS control messages - */ -struct urndis_comp_hdr { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_status; -}; - -/* Initialize the device. */ -#define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 -#define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 - -struct urndis_init_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_ver_major; - uint32_t rm_ver_minor; - uint32_t rm_max_xfersz; -}; - -struct urndis_init_comp { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_status; - uint32_t rm_ver_major; - uint32_t rm_ver_minor; - uint32_t rm_devflags; - uint32_t rm_medium; - uint32_t rm_pktmaxcnt; - uint32_t rm_pktmaxsz; - uint32_t rm_align; - uint32_t rm_aflistoffset; - uint32_t rm_aflistsz; -}; - -/* Halt the device. No response sent. 
*/ -#define REMOTE_NDIS_HALT_MSG 0x00000003 - -struct urndis_halt_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; -}; - -/* Send a query object. */ -#define REMOTE_NDIS_QUERY_MSG 0x00000004 -#define REMOTE_NDIS_QUERY_CMPLT 0x80000004 - -struct urndis_query_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_oid; - uint32_t rm_infobuflen; - uint32_t rm_infobufoffset; - uint32_t rm_devicevchdl; -}; - -struct urndis_query_comp { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_status; - uint32_t rm_infobuflen; - uint32_t rm_infobufoffset; -}; - -/* Send a set object request. */ -#define REMOTE_NDIS_SET_MSG 0x00000005 -#define REMOTE_NDIS_SET_CMPLT 0x80000005 - -struct urndis_set_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_oid; - uint32_t rm_infobuflen; - uint32_t rm_infobufoffset; - uint32_t rm_devicevchdl; -}; - -struct urndis_set_comp { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_status; -}; - -#define REMOTE_NDIS_SET_PARAM_NUMERIC 0x00000000 -#define REMOTE_NDIS_SET_PARAM_STRING 0x00000002 - -struct urndis_set_parameter { - uint32_t rm_nameoffset; - uint32_t rm_namelen; - uint32_t rm_type; - uint32_t rm_valueoffset; - uint32_t rm_valuelen; -}; - -/* Perform a soft reset on the device. */ -#define REMOTE_NDIS_RESET_MSG 0x00000006 -#define REMOTE_NDIS_RESET_CMPLT 0x80000006 - -struct urndis_reset_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; -}; - -struct urndis_reset_comp { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_status; - uint32_t rm_adrreset; -}; - -/* 802.3 link-state or undefined message error. */ -#define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 - -/* Keepalive messsage. May be sent by device. 
*/ -#define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 -#define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 - -struct urndis_keepalive_req { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; -}; - -struct urndis_keepalive_comp { - uint32_t rm_type; - uint32_t rm_len; - uint32_t rm_rid; - uint32_t rm_status; -}; - -/* packet filter bits used by OID_GEN_CURRENT_PACKET_FILTER */ -#define RNDIS_PACKET_TYPE_DIRECTED 0x00000001 -#define RNDIS_PACKET_TYPE_MULTICAST 0x00000002 -#define RNDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 -#define RNDIS_PACKET_TYPE_BROADCAST 0x00000008 -#define RNDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 -#define RNDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 -#define RNDIS_PACKET_TYPE_SMT 0x00000040 -#define RNDIS_PACKET_TYPE_ALL_LOCAL 0x00000080 -#define RNDIS_PACKET_TYPE_GROUP 0x00001000 -#define RNDIS_PACKET_TYPE_ALL_FUNCTIONAL 0x00002000 -#define RNDIS_PACKET_TYPE_FUNCTIONAL 0x00004000 -#define RNDIS_PACKET_TYPE_MAC_FRAME 0x00008000 - -/* RNDIS offsets */ -#define RNDIS_HEADER_OFFSET 8 /* bytes */ -#define RNDIS_DATA_OFFSET ((uint32_t)(sizeof(struct urndis_packet_msg) - RNDIS_HEADER_OFFSET)) - #endif /* _IF_URNDISREG_H_ */ diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index c847afc..8d19f2e 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -848,7 +848,7 @@ __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; - if (panicstr != NULL || dumping) + if (panicstr != NULL || dumping || SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); diff --git a/sys/modules/hyperv/Makefile b/sys/modules/hyperv/Makefile index 25b32e3..3bae26a 100644 --- a/sys/modules/hyperv/Makefile +++ b/sys/modules/hyperv/Makefile @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR = vmbus netvsc stordisengage storvsc utilities +SUBDIR = vmbus netvsc storvsc utilities .include <bsd.subdir.mk> diff --git a/sys/modules/hyperv/stordisengage/Makefile b/sys/modules/hyperv/stordisengage/Makefile deleted file mode 100644 index 
497deb98..0000000 --- a/sys/modules/hyperv/stordisengage/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/hyperv/stordisengage - -KMOD= hv_ata_pci_disengage -SRCS= hv_ata_pci_disengage.c -SRCS+= ata_if.h bus_if.h device_if.h pci_if.h - -.include <bsd.kmod.mk> diff --git a/sys/modules/hyperv/vmbus/Makefile b/sys/modules/hyperv/vmbus/Makefile index ea7154f..462557c 100644 --- a/sys/modules/hyperv/vmbus/Makefile +++ b/sys/modules/hyperv/vmbus/Makefile @@ -10,7 +10,9 @@ SRCS= hyperv.c \ vmbus.c \ vmbus_br.c \ vmbus_chan.c \ - vmbus_et.c + vmbus_et.c \ + vmbus_if.c \ + vmbus_xact.c SRCS+= acpi_if.h bus_if.h device_if.h opt_acpi.h vmbus_if.h # XXX: for assym.s diff --git a/sys/net/if.c b/sys/net/if.c index e03584f..881cd60 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -2171,7 +2171,7 @@ do_link_state_change(void *arg, int pending) if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); - EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); + EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } diff --git a/sys/net/rndis.h b/sys/net/rndis.h new file mode 100644 index 0000000..9da76bc --- /dev/null +++ b/sys/net/rndis.h @@ -0,0 +1,369 @@ +/* $FreeBSD$ */ +/* $OpenBSD: if_urndisreg.h,v 1.19 2013/11/21 14:08:05 mpi Exp $ */ + +/* + * Copyright (c) 2010 Jonathan Armani <armani@openbsd.org> + * Copyright (c) 2010 Fabien Romano <fabien@openbsd.org> + * Copyright (c) 2010 Michael Knudsen <mk@openbsd.org> + * All rights reserved. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _NET_RNDIS_H_ +#define _NET_RNDIS_H_ + +/* Canonical major/minor version as of 22nd Aug. 2016. */ +#define RNDIS_VERSION_MAJOR 0x00000001 +#define RNDIS_VERSION_MINOR 0x00000000 + +#define RNDIS_STATUS_SUCCESS 0x00000000L +#define RNDIS_STATUS_PENDING 0x00000103L +#define RNDIS_STATUS_MEDIA_CONNECT 0x4001000BL +#define RNDIS_STATUS_MEDIA_DISCONNECT 0x4001000CL +#define RNDIS_STATUS_NETWORK_CHANGE 0x40010018L +#define RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG 0x40020006L +#define RNDIS_STATUS_BUFFER_OVERFLOW 0x80000005L +#define RNDIS_STATUS_FAILURE 0xC0000001L +#define RNDIS_STATUS_NOT_SUPPORTED 0xC00000BBL +#define RNDIS_STATUS_RESOURCES 0xC000009AL +#define RNDIS_STATUS_INVALID_DATA 0xC0010015L + +#define OID_GEN_SUPPORTED_LIST 0x00010101 +#define OID_GEN_HARDWARE_STATUS 0x00010102 +#define OID_GEN_MEDIA_SUPPORTED 0x00010103 +#define OID_GEN_MEDIA_IN_USE 0x00010104 +#define OID_GEN_MAXIMUM_LOOKAHEAD 0x00010105 +#define OID_GEN_MAXIMUM_FRAME_SIZE 0x00010106 +#define OID_GEN_LINK_SPEED 0x00010107 +#define OID_GEN_TRANSMIT_BUFFER_SPACE 0x00010108 +#define OID_GEN_RECEIVE_BUFFER_SPACE 0x00010109 +#define OID_GEN_TRANSMIT_BLOCK_SIZE 0x0001010A +#define OID_GEN_RECEIVE_BLOCK_SIZE 0x0001010B +#define OID_GEN_VENDOR_ID 0x0001010C +#define OID_GEN_VENDOR_DESCRIPTION 0x0001010D +#define OID_GEN_CURRENT_PACKET_FILTER 0x0001010E +#define OID_GEN_CURRENT_LOOKAHEAD 0x0001010F +#define OID_GEN_DRIVER_VERSION 0x00010110 +#define OID_GEN_MAXIMUM_TOTAL_SIZE 
0x00010111 +#define OID_GEN_PROTOCOL_OPTIONS 0x00010112 +#define OID_GEN_MAC_OPTIONS 0x00010113 +#define OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 +#define OID_GEN_MAXIMUM_SEND_PACKETS 0x00010115 +#define OID_GEN_VENDOR_DRIVER_VERSION 0x00010116 +#define OID_GEN_SUPPORTED_GUIDS 0x00010117 +#define OID_GEN_NETWORK_LAYER_ADDRESSES 0x00010118 +#define OID_GEN_TRANSPORT_HEADER_OFFSET 0x00010119 +#define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203 +#define OID_GEN_RECEIVE_SCALE_PARAMETERS 0x00010204 +#define OID_GEN_MACHINE_NAME 0x0001021A +#define OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B +#define OID_GEN_VLAN_ID 0x0001021C + +#define OID_802_3_PERMANENT_ADDRESS 0x01010101 +#define OID_802_3_CURRENT_ADDRESS 0x01010102 +#define OID_802_3_MULTICAST_LIST 0x01010103 +#define OID_802_3_MAXIMUM_LIST_SIZE 0x01010104 +#define OID_802_3_MAC_OPTIONS 0x01010105 +#define OID_802_3_RCV_ERROR_ALIGNMENT 0x01020101 +#define OID_802_3_XMIT_ONE_COLLISION 0x01020102 +#define OID_802_3_XMIT_MORE_COLLISIONS 0x01020103 +#define OID_802_3_XMIT_DEFERRED 0x01020201 +#define OID_802_3_XMIT_MAX_COLLISIONS 0x01020202 +#define OID_802_3_RCV_OVERRUN 0x01020203 +#define OID_802_3_XMIT_UNDERRUN 0x01020204 +#define OID_802_3_XMIT_HEARTBEAT_FAILURE 0x01020205 +#define OID_802_3_XMIT_TIMES_CRS_LOST 0x01020206 +#define OID_802_3_XMIT_LATE_COLLISIONS 0x01020207 + +#define OID_TCP_OFFLOAD_PARAMETERS 0xFC01020C +#define OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES 0xFC01020D + +#define RNDIS_MEDIUM_802_3 0x00000000 + +/* Device flags */ +#define RNDIS_DF_CONNECTIONLESS 0x00000001 +#define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 + +/* + * Common RNDIS message header. 
+ */ +struct rndis_msghdr { + uint32_t rm_type; + uint32_t rm_len; +}; + +/* + * RNDIS data message + */ +#define REMOTE_NDIS_PACKET_MSG 0x00000001 + +struct rndis_packet_msg { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_dataoffset; + uint32_t rm_datalen; + uint32_t rm_oobdataoffset; + uint32_t rm_oobdatalen; + uint32_t rm_oobdataelements; + uint32_t rm_pktinfooffset; + uint32_t rm_pktinfolen; + uint32_t rm_vchandle; + uint32_t rm_reserved; +}; + +/* + * Minimum value for rm_dataoffset, rm_oobdataoffset, and + * rm_pktinfooffset. + */ +#define RNDIS_PACKET_MSG_OFFSET_MIN \ + (sizeof(struct rndis_packet_msg) - \ + __offsetof(struct rndis_packet_msg, rm_dataoffset)) + +/* Offset from the beginning of rndis_packet_msg. */ +#define RNDIS_PACKET_MSG_OFFSET_ABS(ofs) \ + ((ofs) + __offsetof(struct rndis_packet_msg, rm_dataoffset)) + +#define RNDIS_PACKET_MSG_OFFSET_ALIGN 4 +#define RNDIS_PACKET_MSG_OFFSET_ALIGNMASK \ + (RNDIS_PACKET_MSG_OFFSET_ALIGN - 1) + +/* Per-packet-info for RNDIS data message */ +struct rndis_pktinfo { + uint32_t rm_size; + uint32_t rm_type; /* NDIS_PKTINFO_TYPE_ */ + uint32_t rm_pktinfooffset; + uint8_t rm_data[]; +}; + +#define RNDIS_PKTINFO_OFFSET \ + __offsetof(struct rndis_pktinfo, rm_data[0]) +#define RNDIS_PKTINFO_SIZE_ALIGN 4 +#define RNDIS_PKTINFO_SIZE_ALIGNMASK (RNDIS_PKTINFO_SIZE_ALIGN - 1) + +#define NDIS_PKTINFO_TYPE_CSUM 0 +#define NDIS_PKTINFO_TYPE_IPSEC 1 +#define NDIS_PKTINFO_TYPE_LSO 2 +#define NDIS_PKTINFO_TYPE_CLASSIFY 3 +/* reserved 4 */ +#define NDIS_PKTINFO_TYPE_SGLIST 5 +#define NDIS_PKTINFO_TYPE_VLAN 6 +#define NDIS_PKTINFO_TYPE_ORIG 7 +#define NDIS_PKTINFO_TYPE_PKT_CANCELID 8 +#define NDIS_PKTINFO_TYPE_ORIG_NBLIST 9 +#define NDIS_PKTINFO_TYPE_CACHE_NBLIST 10 +#define NDIS_PKTINFO_TYPE_PKT_PAD 11 + +/* + * RNDIS control messages + */ + +/* + * Common header for RNDIS completion messages. + * + * NOTE: It does not apply to REMOTE_NDIS_RESET_CMPLT. 
+ */ +struct rndis_comp_hdr { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_status; +}; + +/* Initialize the device. */ +#define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 +#define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 + +struct rndis_init_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_ver_major; + uint32_t rm_ver_minor; + uint32_t rm_max_xfersz; +}; + +struct rndis_init_comp { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_status; + uint32_t rm_ver_major; + uint32_t rm_ver_minor; + uint32_t rm_devflags; + uint32_t rm_medium; + uint32_t rm_pktmaxcnt; + uint32_t rm_pktmaxsz; + uint32_t rm_align; + uint32_t rm_aflistoffset; + uint32_t rm_aflistsz; +}; + +#define RNDIS_INIT_COMP_SIZE_MIN \ + __offsetof(struct rndis_init_comp, rm_aflistsz) + +/* Halt the device. No response sent. */ +#define REMOTE_NDIS_HALT_MSG 0x00000003 + +struct rndis_halt_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; +}; + +/* Send a query object. */ +#define REMOTE_NDIS_QUERY_MSG 0x00000004 +#define REMOTE_NDIS_QUERY_CMPLT 0x80000004 + +struct rndis_query_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_oid; + uint32_t rm_infobuflen; + uint32_t rm_infobufoffset; + uint32_t rm_devicevchdl; +}; + +#define RNDIS_QUERY_REQ_INFOBUFOFFSET \ + (sizeof(struct rndis_query_req) - \ + __offsetof(struct rndis_query_req, rm_rid)) + +struct rndis_query_comp { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_status; + uint32_t rm_infobuflen; + uint32_t rm_infobufoffset; +}; + +/* infobuf offset from the beginning of rndis_query_comp. */ +#define RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(ofs) \ + ((ofs) + __offsetof(struct rndis_query_req, rm_rid)) + +/* Send a set object request. 
*/ +#define REMOTE_NDIS_SET_MSG 0x00000005 +#define REMOTE_NDIS_SET_CMPLT 0x80000005 + +struct rndis_set_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_oid; + uint32_t rm_infobuflen; + uint32_t rm_infobufoffset; + uint32_t rm_devicevchdl; +}; + +#define RNDIS_SET_REQ_INFOBUFOFFSET \ + (sizeof(struct rndis_set_req) - \ + __offsetof(struct rndis_set_req, rm_rid)) + +struct rndis_set_comp { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_status; +}; + +/* + * Parameter used by OID_GEN_RNDIS_CONFIG_PARAMETER. + */ +#define REMOTE_NDIS_SET_PARAM_NUMERIC 0x00000000 +#define REMOTE_NDIS_SET_PARAM_STRING 0x00000002 + +struct rndis_set_parameter { + uint32_t rm_nameoffset; + uint32_t rm_namelen; + uint32_t rm_type; + uint32_t rm_valueoffset; + uint32_t rm_valuelen; +}; + +/* Perform a soft reset on the device. */ +#define REMOTE_NDIS_RESET_MSG 0x00000006 +#define REMOTE_NDIS_RESET_CMPLT 0x80000006 + +struct rndis_reset_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; +}; + +struct rndis_reset_comp { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_status; + uint32_t rm_adrreset; +}; + +/* 802.3 link-state or undefined message error. Sent by device. */ +#define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 + +struct rndis_status_msg { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_status; + uint32_t rm_stbuflen; + uint32_t rm_stbufoffset; + /* rndis_diag_info */ +}; + +/* + * Immediately after rndis_status_msg.rm_stbufoffset, if a control + * message is malformatted, or a packet message contains inappropriate + * content. + */ +struct rndis_diag_info { + uint32_t rm_diagstatus; + uint32_t rm_erroffset; +}; + +/* Keepalive message. May be sent by device. 
*/ +#define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 +#define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 + +struct rndis_keepalive_req { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; +}; + +struct rndis_keepalive_comp { + uint32_t rm_type; + uint32_t rm_len; + uint32_t rm_rid; + uint32_t rm_status; +}; + +/* packet filter bits used by OID_GEN_CURRENT_PACKET_FILTER */ +#define NDIS_PACKET_TYPE_DIRECTED 0x00000001 +#define NDIS_PACKET_TYPE_MULTICAST 0x00000002 +#define NDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 +#define NDIS_PACKET_TYPE_BROADCAST 0x00000008 +#define NDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 +#define NDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 +#define NDIS_PACKET_TYPE_SMT 0x00000040 +#define NDIS_PACKET_TYPE_ALL_LOCAL 0x00000080 +#define NDIS_PACKET_TYPE_GROUP 0x00001000 +#define NDIS_PACKET_TYPE_ALL_FUNCTIONAL 0x00002000 +#define NDIS_PACKET_TYPE_FUNCTIONAL 0x00004000 +#define NDIS_PACKET_TYPE_MAC_FRAME 0x00008000 + +/* RNDIS offsets */ +#define RNDIS_HEADER_OFFSET ((uint32_t)sizeof(struct rndis_msghdr)) +#define RNDIS_DATA_OFFSET \ + ((uint32_t)(sizeof(struct rndis_packet_msg) - RNDIS_HEADER_OFFSET)) + +#endif /* !_NET_RNDIS_H_ */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index f422d8a..b9a72ef 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -764,7 +764,6 @@ struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); -u_long tcp_seq_subtract(u_long, u_long ); int tcp_compute_pipe(struct tcpcb *); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h index cae08c0..16dfc75 100644 --- a/sys/sys/eventhandler.h +++ b/sys/sys/eventhandler.h @@ -283,4 +283,11 @@ typedef void (*unregister_framebuffer_fn)(void *, struct fb_info *); EVENTHANDLER_DECLARE(register_framebuffer, 
register_framebuffer_fn); EVENTHANDLER_DECLARE(unregister_framebuffer, unregister_framebuffer_fn); +/* Veto ada attachment */ +struct cam_path; +struct ata_params; +typedef void (*ada_probe_veto_fn)(void *, struct cam_path *, + struct ata_params *, int *); +EVENTHANDLER_DECLARE(ada_probe_veto, ada_probe_veto_fn); + #endif /* SYS_EVENTHANDLER_H */ diff --git a/sys/sys/param.h b/sys/sys/param.h index df3fd61..26ab838 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1003508 /* Master, propagated to newvers */ +#define __FreeBSD_version 1003509 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/queue.h b/sys/sys/queue.h index 1be9e9c..f26c492 100644 --- a/sys/sys/queue.h +++ b/sys/sys/queue.h @@ -76,6 +76,10 @@ * * For details on the use of these macros, see the queue(3) manual page. * + * Below is a summary of implemented functions where: + * + means the macro is available + * - means the macro is not available + * s means the macro is available but is slow (runs in O(n) time) * * SLIST LIST STAILQ TAILQ * _HEAD + + + + @@ -101,10 +105,10 @@ * _INSERT_BEFORE - + - + * _INSERT_AFTER + + + + * _INSERT_TAIL - - + + - * _CONCAT - - + + + * _CONCAT s s + + * _REMOVE_AFTER + - + - * _REMOVE_HEAD + - + - - * _REMOVE + + + + + * _REMOVE s + s + * _SWAP + + + + * */ @@ -183,6 +187,19 @@ struct { \ /* * Singly-linked List functions. 
*/ +#define SLIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \ + SLIST_INIT(head2); \ + } else if (SLIST_FIRST(head2) != NULL) { \ + while (SLIST_NEXT(curelm, field) != NULL) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = SLIST_FIRST(head2); \ + SLIST_INIT(head2); \ + } \ +} while (0) + #define SLIST_EMPTY(head) ((head)->slh_first == NULL) #define SLIST_FIRST(head) ((head)->slh_first) @@ -447,6 +464,23 @@ struct { \ #define QMD_LIST_CHECK_PREV(elm, field) #endif /* (_KERNEL && INVARIANTS) */ +#define LIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) { \ + LIST_FIRST(head2)->field.le_prev = \ + &LIST_FIRST((head1)); \ + LIST_INIT(head2); \ + } \ + } else if (LIST_FIRST(head2) != NULL) { \ + while (LIST_NEXT(curelm, field) != NULL) \ + curelm = LIST_NEXT(curelm, field); \ + LIST_NEXT(curelm, field) = LIST_FIRST(head2); \ + LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \ + LIST_INIT(head2); \ + } \ +} while (0) + #define LIST_EMPTY(head) ((head)->lh_first == NULL) #define LIST_FIRST(head) ((head)->lh_first) diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 42c3c62..18e609e 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -751,16 +751,16 @@ static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int sync_cgs(struct mount *, int); -static int handle_written_filepage(struct pagedep *, struct buf *); +static int handle_written_filepage(struct pagedep *, struct buf *, int); static int handle_written_sbdep(struct sbdep *, struct buf *); static void 
initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_indirdep(struct indirdep *, struct buf *, - struct buf**); -static int handle_written_inodeblock(struct inodedep *, struct buf *); + struct buf**, int); +static int handle_written_inodeblock(struct inodedep *, struct buf *, int); static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, uint8_t *); -static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); static void handle_written_jaddref(struct jaddref *); static void handle_written_jremref(struct jremref *); static void handle_written_jseg(struct jseg *, struct buf *); @@ -10868,6 +10868,10 @@ initiate_write_bmsafemap(bmsafemap, bp) struct fs *fs; ino_t ino; + /* + * If this is a background write, we did this at the time that + * the copy was made, so do not need to do it again. + */ if (bmsafemap->sm_state & IOSTARTED) return; bmsafemap->sm_state |= IOSTARTED; @@ -10941,10 +10945,39 @@ softdep_disk_write_complete(bp) /* * If an error occurred while doing the write, then the data - * has not hit the disk and the dependencies cannot be unrolled. + * has not hit the disk and the dependencies cannot be processed. + * But we do have to go through and roll forward any dependencies + * that were rolled back before the disk write. 
*/ - if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) + if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + switch (wk->wk_type) { + + case D_PAGEDEP: + handle_written_filepage(WK_PAGEDEP(wk), bp, 0); + continue; + + case D_INODEDEP: + handle_written_inodeblock(WK_INODEDEP(wk), + bp, 0); + continue; + + case D_BMSAFEMAP: + handle_written_bmsafemap(WK_BMSAFEMAP(wk), + bp, 0); + continue; + + case D_INDIRDEP: + handle_written_indirdep(WK_INDIRDEP(wk), + bp, &sbp, 0); + continue; + default: + /* nothing to roll forward */ + continue; + } + } return; + } if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return; ump = VFSTOUFS(wk->wk_mp); @@ -10964,17 +10997,20 @@ softdep_disk_write_complete(bp) switch (wk->wk_type) { case D_PAGEDEP: - if (handle_written_filepage(WK_PAGEDEP(wk), bp)) + if (handle_written_filepage(WK_PAGEDEP(wk), bp, + WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: - if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) + if (handle_written_inodeblock(WK_INODEDEP(wk), bp, + WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: - if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) + if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, + WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; @@ -10993,7 +11029,8 @@ softdep_disk_write_complete(bp) continue; case D_INDIRDEP: - if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) + if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, + WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; @@ -11293,12 +11330,17 @@ handle_bufwait(inodedep, refhd) * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further - * splbio interrupts blocked. + * interrupts from this device blocked. 
+ * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. */ static int -handle_written_inodeblock(inodedep, bp) +handle_written_inodeblock(inodedep, bp, flags) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ + int flags; { struct freefile *freefile; struct allocdirect *adp, *nextadp; @@ -11328,7 +11370,8 @@ handle_written_inodeblock(inodedep, bp) /* * Leave this inodeblock dirty until it's in the list. */ - if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) { + if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && + (flags & WRITESUCCEEDED)) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); @@ -11367,7 +11410,8 @@ handle_written_inodeblock(inodedep, bp) goto bufwait; return (1); } - inodedep->id_state |= COMPLETE; + if (flags & WRITESUCCEEDED) + inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. @@ -11482,6 +11526,13 @@ handle_written_inodeblock(inodedep, bp) bdirty(bp); bufwait: /* + * If the write did not succeed, we have done all the roll-forward + * operations, but we cannot take the actions that will allow its + * dependencies to be processed. + */ + if ((flags & WRITESUCCEEDED) == 0) + return (hadchanges); + /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) @@ -11538,11 +11589,20 @@ bufwait: return (hadchanges); } +/* + * Perform needed roll-forwards and kick off any dependencies that + * can now be processed. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. 
+ */ static int -handle_written_indirdep(indirdep, bp, bpp) +handle_written_indirdep(indirdep, bp, bpp, flags) struct indirdep *indirdep; struct buf *bp; struct buf **bpp; + int flags; { struct allocindir *aip; struct buf *sbp; @@ -11567,6 +11627,16 @@ handle_written_indirdep(indirdep, bp, bpp) indirdep->ir_state &= ~(UNDONE | IOSTARTED); indirdep->ir_state |= ATTACHED; /* + * If the write did not succeed, we have done all the roll-forward + * operations, but we cannot take the actions that will allow its + * dependencies to be processed. + */ + if ((flags & WRITESUCCEEDED) == 0) { + stat_indir_blk_ptrs++; + bdirty(bp); + return (1); + } + /* * Move allocindirs with written pointers to the completehd if * the indirdep's pointer is not yet written. Otherwise * free them here. @@ -11720,11 +11790,16 @@ jnewblk_rollforward(jnewblk, fs, cgp, blksfree) * Complete a write to a bmsafemap structure. Roll forward any bitmap * changes if it's not a background write. Set all written dependencies * to DEPCOMPLETE and free the structure if possible. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. */ static int -handle_written_bmsafemap(bmsafemap, bp) +handle_written_bmsafemap(bmsafemap, bp, flags) struct bmsafemap *bmsafemap; struct buf *bp; + int flags; { struct newblk *newblk; struct inodedep *inodedep; @@ -11740,15 +11815,20 @@ handle_written_bmsafemap(bmsafemap, bp) int chgs; if ((bmsafemap->sm_state & IOSTARTED) == 0) - panic("initiate_write_bmsafemap: Not started\n"); + panic("handle_written_bmsafemap: Not started\n"); ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); chgs = 0; bmsafemap->sm_state &= ~IOSTARTED; foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; /* - * Release journal work that was waiting on the write. + * If write was successful, release journal work that was waiting + * on the write. Otherwise move the work back. 
*/ - handle_jwork(&bmsafemap->sm_freewr); + if (flags & WRITESUCCEEDED) + handle_jwork(&bmsafemap->sm_freewr); + else + LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, + worklist, wk_list); /* * Restore unwritten inode allocation pending jaddref writes. @@ -11798,6 +11878,20 @@ handle_written_bmsafemap(bmsafemap, bp) free_jnewblk(jnewblk); } } + /* + * If the write did not succeed, we have done all the roll-forward + * operations, but we cannot take the actions that will allow its + * dependencies to be processed. + */ + if ((flags & WRITESUCCEEDED) == 0) { + LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); + LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, + worklist, wk_list); + if (foreground) + bdirty(bp); + return (1); + } while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_state &= ~ONDEPLIST; @@ -11901,12 +11995,17 @@ free_pagedep(pagedep) * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level - * with further splbio interrupts blocked. + * with further interrupts from this device blocked. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. */ static int -handle_written_filepage(pagedep, bp) +handle_written_filepage(pagedep, bp, flags) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ + int flags; { struct dirrem *dirrem; struct diradd *dap, *nextdap; @@ -11916,6 +12015,8 @@ handle_written_filepage(pagedep, bp) if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; + if ((flags & WRITESUCCEEDED) == 0) + goto rollforward; /* * Process any directory removals that have been committed. 
*/ @@ -11935,6 +12036,7 @@ handle_written_filepage(pagedep, bp) if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); +rollforward: /* * Uncommitted directory entries must be restored. */ @@ -11967,7 +12069,7 @@ handle_written_filepage(pagedep, bp) * marked dirty so that its will eventually get written back in * its correct form. */ - if (chgs) { + if (chgs || (flags & WRITESUCCEEDED) == 0) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index ce93279..6c07800 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -140,6 +140,7 @@ #define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */ #define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */ #define UNLINKLINKS (UNLINKNEXT | UNLINKPREV) +#define WRITESUCCEEDED 0x400000 /* the disk write completed successfully */ #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 433c875..f166649 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -515,15 +515,26 @@ void vm_page_busy_downgrade(vm_page_t m) { u_int x; + bool locked; vm_page_assert_xbusied(m); + locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; + if (x != 0 && !locked) + vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, - VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1) | x)) + VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; + if (x != 0 && !locked) + vm_page_unlock(m); + } + if (x != 0) { + wakeup(m); + if (!locked) + vm_page_unlock(m); } } diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c index cc50321..980da87 100644 --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -46,6 +46,7 @@ #include <sys/mutex.h> #include <sys/proc.h> #include <sys/smp.h> +#include <sys/sx.h> #include <sys/syslog.h> #include <sys/systm.h> 
#include <machine/clock.h> @@ -73,7 +74,8 @@ typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc *interrupt_sources[NUM_IO_INTS]; -static struct mtx intr_table_lock; +static struct sx intrsrc_lock; +static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; @@ -117,14 +119,14 @@ intr_register_pic(struct pic *pic) { int error; - mtx_lock(&intr_table_lock); + mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } - mtx_unlock(&intr_table_lock); + mtx_unlock(&intrpic_lock); return (error); } @@ -148,16 +150,16 @@ intr_register_source(struct intsrc *isrc) vector); if (error) return (error); - mtx_lock(&intr_table_lock); + sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); return (0); } @@ -181,14 +183,14 @@ intr_add_handler(const char *name, int vector, driver_filter_t filter, error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { - mtx_lock(&intr_table_lock); + sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); } return (error); } @@ -197,28 +199,19 @@ int intr_remove_handler(void *cookie) { struct intsrc *isrc; - int error, mtx_owned; + int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { - /* - * Recursion is needed here so PICs can remove interrupts - * while resuming. 
It was previously not possible due to - * intr_resume holding the intr_table_lock and - * intr_remove_handler recursing on it. - */ - mtx_owned = mtx_owned(&intr_table_lock); - if (mtx_owned == 0) - mtx_lock(&intr_table_lock); + sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); - if (mtx_owned == 0) - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); } return (error); } @@ -292,12 +285,12 @@ intr_resume(bool suspend_cancelled) #ifndef DEV_ATPIC atpic_reset(); #endif - mtx_lock(&intr_table_lock); + mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } - mtx_unlock(&intr_table_lock); + mtx_unlock(&intrpic_lock); } void @@ -305,12 +298,12 @@ intr_suspend(void) { struct pic *pic; - mtx_lock(&intr_table_lock); + mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } - mtx_unlock(&intr_table_lock); + mtx_unlock(&intrpic_lock); } static int @@ -326,9 +319,9 @@ intr_assign_cpu(void *arg, u_char cpu) */ if (assign_cpu && cpu != NOCPU) { isrc = arg; - mtx_lock(&intr_table_lock); + sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); } else error = 0; return (error); @@ -388,7 +381,8 @@ intr_init(void *dummy __unused) intrcnt_setname("???", 0); intrcnt_index = 1; TAILQ_INIT(&pics); - mtx_init(&intr_table_lock, "intr sources", NULL, MTX_DEF); + mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); + sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); @@ -536,7 +530,7 @@ intr_shuffle_irqs(void *arg __unused) return; /* Round-robin assign a CPU to each enabled source. 
*/ - mtx_lock(&intr_table_lock); + sx_xlock(&intrsrc_lock); assign_cpu = 1; for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; @@ -557,7 +551,7 @@ intr_shuffle_irqs(void *arg __unused) } } - mtx_unlock(&intr_table_lock); + sx_xunlock(&intrsrc_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); diff --git a/sys/x86/x86/io_apic.c b/sys/x86/x86/io_apic.c index 8dafa96..bf12881 100644 --- a/sys/x86/x86/io_apic.c +++ b/sys/x86/x86/io_apic.c @@ -325,6 +325,18 @@ ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) u_int old_id; /* + * On Hyper-V: + * - Stick to the first cpu for all I/O APIC pins. + * - And don't allow destination cpu changes. + */ + if (vm_guest == VM_GUEST_HV) { + if (intpin->io_vector) + return (EINVAL); + else + apic_id = 0; + } + + /* * keep 1st core as the destination for NMI */ if (intpin->io_irq == IRQ_NMI) diff --git a/tools/tools/ether_reflect/ether_reflect.1 b/tools/tools/ether_reflect/ether_reflect.1 index 45ee27a..e6539b1 100644 --- a/tools/tools/ether_reflect/ether_reflect.1 +++ b/tools/tools/ether_reflect/ether_reflect.1 @@ -103,6 +103,7 @@ program first appeared in .Sh AUTHORS This manual page was written by -.An George V. Neville-Neil Aq gnn@FreeBSD.org . +.An George V. Neville-Neil Aq Mt gnn@FreeBSD.org . .Sh BUGS -Should be reported to the author or to net@FreeBSD.org. +Should be reported to the author or to +.Aq Mt net@FreeBSD.org . diff --git a/tools/tools/fixwhite/fixwhite.1 b/tools/tools/fixwhite/fixwhite.1 index 54dae66..016e24b 100644 --- a/tools/tools/fixwhite/fixwhite.1 +++ b/tools/tools/fixwhite/fixwhite.1 @@ -45,4 +45,4 @@ If the whitespace at the beginning of a sentence is exactly a multiple of eight spaces, the whitespace is replaced by tabs. Also, spaces preceding tabs will be merged into the tab character. 
.Sh AUTHORS -.An Ed Schouten Aq ed@FreeBSD.org +.An Ed Schouten Aq Mt ed@FreeBSD.org diff --git a/tools/tools/mcgrab/mcgrab.1 b/tools/tools/mcgrab/mcgrab.1 index 1b83f50..2c6bfa6 100644 --- a/tools/tools/mcgrab/mcgrab.1 +++ b/tools/tools/mcgrab/mcgrab.1 @@ -77,7 +77,7 @@ program first appeared in .Sh AUTHORS This manual page was written by -.An George V. Neville-Neil Aq gnn@FreeBSD.org . +.An George V. Neville-Neil Aq Mt gnn@FreeBSD.org . .Sh BUGS Should be reported to the author or to -.Aq net@FreeBSD.org . +.Aq Mt net@FreeBSD.org . diff --git a/tools/tools/mctest/mctest.1 b/tools/tools/mctest/mctest.1 index c64a5f8..3cec879 100644 --- a/tools/tools/mctest/mctest.1 +++ b/tools/tools/mctest/mctest.1 @@ -115,7 +115,7 @@ program first appeared in .Sh AUTHORS This manual page was written by -.An George V. Neville-Neil Aq gnn@FreeBSD.org . +.An George V. Neville-Neil Aq Mt gnn@FreeBSD.org . .Sh BUGS Should be reported to the author or to -.Aq net@FreeBSD.org . +.Aq Mt net@FreeBSD.org . diff --git a/tools/tools/sysdoc/sysdoc.sh b/tools/tools/sysdoc/sysdoc.sh index b07c53d..872eabd 100644 --- a/tools/tools/sysdoc/sysdoc.sh +++ b/tools/tools/sysdoc/sysdoc.sh @@ -59,12 +59,12 @@ implementation, see the respecting manual pages. This manual page is automatically generated by a set of scripts written by .An -nosplit -.An Tom Rhodes Aq trhodes@FreeBSD.org , +.An Tom Rhodes Aq Mt trhodes@FreeBSD.org , with significant contributions from -.An Giorgos Keramidas Aq keramida@FreeBSD.org , -.An Ruslan Ermilov Aq ru@FreeBSD.org , +.An Giorgos Keramidas Aq Mt keramida@FreeBSD.org , +.An Ruslan Ermilov Aq Mt ru@FreeBSD.org , and -.An Marc Silver Aq marcs@draenor.org . +.An Marc Silver Aq Mt marcs@draenor.org . .Sh BUGS Sometimes .Fx diff --git a/tools/tools/vimage/vimage.8 b/tools/tools/vimage/vimage.8 index e9a24c4..306bf83 100644 --- a/tools/tools/vimage/vimage.8 +++ b/tools/tools/vimage/vimage.8 @@ -188,8 +188,8 @@ of the main FreeBSD tree. 
As a result of a project sponsored by the FreeBSD Foundation and Stiching NLNet, integrated virtualized network stack first appeared in FreeBSD 8.0. +.Sh AUTHORS +.An Marko Zec Aq Mt zec@fer.hr .Sh BUGS Deletion of vimages / vnets is known to leak kernel memory and fail at stopping various timers, hence may lead to system crashes. -.Sh AUTHOR -.An "Marko Zec" Aq zec@fer.hr diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 18ff908..fb6b734 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -851,7 +851,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) if (strncmp(devname, "vale", 4) == 0) pci_vtnet_netmap_setup(sc, devname); if ((strncmp(devname, "tap", 3) == 0) || - (strncmp(devname, "vmmnet", 5) == 0)) + (strncmp(devname, "vmnet", 5) == 0)) pci_vtnet_tap_setup(sc, devname); free(devname); |