diff options
35 files changed, 3283 insertions, 1502 deletions
diff --git a/contrib/gcc/ChangeLog.gcc43 b/contrib/gcc/ChangeLog.gcc43 index 3f893ac..05f0107 100644 --- a/contrib/gcc/ChangeLog.gcc43 +++ b/contrib/gcc/ChangeLog.gcc43 @@ -1,3 +1,9 @@ +2007-06-05 Joerg Wunsch <j.gnu@uriah.heep.sax.de> (r23479) + + PR preprocessor/23479 + * doc/extend.texi: Document the 0b-prefixed binary integer + constant extension. + 2007-05-01 Dwarakanath Rajagopal <dwarak.rajagopal@amd.com> (r124341) * doc/invoke.texi: Fix typo, 'AMD Family 10h core' instead of diff --git a/contrib/gcc/config/arm/libunwind.S b/contrib/gcc/config/arm/libunwind.S index 06e1310..81e4236 100644 --- a/contrib/gcc/config/arm/libunwind.S +++ b/contrib/gcc/config/arm/libunwind.S @@ -116,5 +116,6 @@ UNWIND_WRAPPER _Unwind_RaiseException 1 UNWIND_WRAPPER _Unwind_Resume 1 UNWIND_WRAPPER _Unwind_Resume_or_Rethrow 1 UNWIND_WRAPPER _Unwind_ForcedUnwind 3 +UNWIND_WRAPPER _Unwind_Backtrace 2 -#endif /* __symbian__ */ +#endif /* ndef __symbian__ */ diff --git a/contrib/gcc/config/arm/unwind-arm.c b/contrib/gcc/config/arm/unwind-arm.c index 9d2513b..47354b4 100644 --- a/contrib/gcc/config/arm/unwind-arm.c +++ b/contrib/gcc/config/arm/unwind-arm.c @@ -747,6 +747,66 @@ _Unwind_DeleteException (_Unwind_Exception * exc) } +/* Perform stack backtrace through unwind data. */ +_Unwind_Reason_Code +__gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument, + phase2_vrs * entry_vrs); +_Unwind_Reason_Code +__gnu_Unwind_Backtrace(_Unwind_Trace_Fn trace, void * trace_argument, + phase2_vrs * entry_vrs) +{ + phase1_vrs saved_vrs; + _Unwind_Reason_Code code; + + _Unwind_Control_Block ucb; + _Unwind_Control_Block *ucbp = &ucb; + + /* Set the pc to the call site. */ + entry_vrs->core.r[R_PC] = entry_vrs->core.r[R_LR]; + + /* Save the core registers. */ + saved_vrs.core = entry_vrs->core; + /* Set demand-save flags. */ + saved_vrs.demand_save_flags = ~(_uw) 0; + + do + { + /* Find the entry for this routine. 
*/ + if (get_eit_entry (ucbp, saved_vrs.core.r[R_PC]) != _URC_OK) + { + code = _URC_FAILURE; + break; + } + + /* The dwarf unwinder assumes the context structure holds things + like the function and LSDA pointers. The ARM implementation + caches these in the exception header (UCB). To avoid + rewriting everything we make the virtual IP register point at + the UCB. */ + _Unwind_SetGR((_Unwind_Context *)&saved_vrs, 12, (_Unwind_Ptr) ucbp); + + /* Call trace function. */ + if ((*trace) ((_Unwind_Context *) &saved_vrs, trace_argument) + != _URC_NO_REASON) + { + code = _URC_FAILURE; + break; + } + + /* Call the pr to decide what to do. */ + code = ((personality_routine) UCB_PR_ADDR (ucbp)) + (_US_VIRTUAL_UNWIND_FRAME | _US_FORCE_UNWIND, + ucbp, (void *) &saved_vrs); + } + while (code != _URC_END_OF_STACK + && code != _URC_FAILURE); + + finish: + restore_non_core_regs (&saved_vrs); + return code; +} + + /* Common implementation for ARM ABI defined personality routines. ID is the index of the personality routine, other arguments are as defined by __aeabi_unwind_cpp_pr{0,1,2}. 
*/ @@ -1014,3 +1074,19 @@ _Unwind_GetTextRelBase (_Unwind_Context *context __attribute__ ((unused))) { abort (); } + +#ifdef __FreeBSD__ +/* FreeBSD expects these to be functions */ +_Unwind_Ptr +_Unwind_GetIP (struct _Unwind_Context *context) +{ + return _Unwind_GetGR (context, 15) & ~(_Unwind_Word)1; +} + +_Unwind_Ptr +_Unwind_GetIPInfo (struct _Unwind_Context *context, int *ip_before_insn) +{ + *ip_before_insn = 0; + return _Unwind_GetGR (context, 15) & ~(_Unwind_Word)1; +} +#endif diff --git a/contrib/gcc/config/arm/unwind-arm.h b/contrib/gcc/config/arm/unwind-arm.h index 0811f2c..8a5c5ce 100644 --- a/contrib/gcc/config/arm/unwind-arm.h +++ b/contrib/gcc/config/arm/unwind-arm.h @@ -205,6 +205,13 @@ extern "C" { _Unwind_Control_Block *, struct _Unwind_Context *, void *); _Unwind_Reason_Code _Unwind_ForcedUnwind (_Unwind_Control_Block *, _Unwind_Stop_Fn, void *); + /* @@@ Use unwind data to perform a stack backtrace. The trace callback + is called for every stack frame in the call chain, but no cleanup + actions are performed. */ + typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn) (_Unwind_Context *, void *); + _Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, + void*); + _Unwind_Word _Unwind_GetCFA (struct _Unwind_Context *); void _Unwind_Complete(_Unwind_Control_Block *ucbp); void _Unwind_DeleteException (_Unwind_Exception *); @@ -246,12 +253,17 @@ extern "C" { return val; } +#ifndef __FreeBSD__ /* Return the address of the instruction, not the actual IP value. 
*/ #define _Unwind_GetIP(context) \ (_Unwind_GetGR (context, 15) & ~(_Unwind_Word)1) #define _Unwind_GetIPInfo(context, ip_before_insn) \ (*ip_before_insn = 0, _Unwind_GetGR (context, 15) & ~(_Unwind_Word)1) +#else + _Unwind_Ptr _Unwind_GetIP (struct _Unwind_Context *); + _Unwind_Ptr _Unwind_GetIPInfo (struct _Unwind_Context *, int *); +#endif static inline void _Unwind_SetGR (_Unwind_Context *context, int regno, _Unwind_Word val) diff --git a/contrib/gcc/doc/extend.texi b/contrib/gcc/doc/extend.texi index d7a1494..d27af10 100644 --- a/contrib/gcc/doc/extend.texi +++ b/contrib/gcc/doc/extend.texi @@ -81,6 +81,7 @@ extensions, accepted by GCC in C89 mode and in C++. * Pragmas:: Pragmas accepted by GCC. * Unnamed Fields:: Unnamed struct/union fields within structs/unions. * Thread-Local:: Per-thread variables. +* Binary constants:: Binary constants using the @samp{0b} prefix. @end menu @node Statement Exprs @@ -10424,6 +10425,28 @@ Non-@code{static} members shall not be @code{__thread}. @end quotation @end itemize +@node Binary constants +@section Binary constants using the @samp{0b} prefix +@cindex Binary constants using the @samp{0b} prefix + +Integer constants can be written as binary constants, consisting of a +sequence of @samp{0} and @samp{1} digits, prefixed by @samp{0b} or +@samp{0B}. This is particularly useful in environments that operate a +lot on the bit-level (like microcontrollers). + +The following statements are identical: + +@smallexample +i = 42; +i = 0x2a; +i = 052; +i = 0b101010; +@end smallexample + +The type of these constants follows the same rules as for octal or +hexadecimal integer constants, so suffixes like @samp{L} or @samp{UL} +can be applied. 
+ @node C++ Extensions @chapter Extensions to the C++ Language @cindex extensions, C++ language diff --git a/contrib/gcclibs/libcpp/expr.c b/contrib/gcclibs/libcpp/expr.c index bf8baaf..24fcb1b8 100644 --- a/contrib/gcclibs/libcpp/expr.c +++ b/contrib/gcclibs/libcpp/expr.c @@ -188,6 +188,11 @@ cpp_classify_number (cpp_reader *pfile, const cpp_token *token) radix = 16; str++; } + else if ((*str == 'b' || *str == 'B') && (str[1] == '0' || str[1] == '1')) + { + radix = 2; + str++; + } } /* Now scan for a well-formed integer or float. */ @@ -226,10 +231,22 @@ cpp_classify_number (cpp_reader *pfile, const cpp_token *token) radix = 10; if (max_digit >= radix) - SYNTAX_ERROR2 ("invalid digit \"%c\" in octal constant", '0' + max_digit); + { + if (radix == 2) + SYNTAX_ERROR2 ("invalid digit \"%c\" in binary constant", '0' + max_digit); + else + SYNTAX_ERROR2 ("invalid digit \"%c\" in octal constant", '0' + max_digit); + } if (float_flag != NOT_FLOAT) { + if (radix == 2) + { + cpp_error (pfile, CPP_DL_ERROR, + "invalid prefix \"0b\" for floating constant"); + return CPP_N_INVALID; + } + if (radix == 16 && CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, c99)) cpp_error (pfile, CPP_DL_PEDWARN, "use of C99 hexadecimal floating constant"); @@ -321,11 +338,16 @@ cpp_classify_number (cpp_reader *pfile, const cpp_token *token) if ((result & CPP_N_IMAGINARY) && CPP_PEDANTIC (pfile)) cpp_error (pfile, CPP_DL_PEDWARN, "imaginary constants are a GCC extension"); + if (radix == 2 && CPP_PEDANTIC (pfile)) + cpp_error (pfile, CPP_DL_PEDWARN, + "binary constants are a GCC extension"); if (radix == 10) result |= CPP_N_DECIMAL; else if (radix == 16) result |= CPP_N_HEX; + else if (radix == 2) + result |= CPP_N_BINARY; else result |= CPP_N_OCTAL; @@ -376,6 +398,11 @@ cpp_interpret_integer (cpp_reader *pfile, const cpp_token *token, base = 16; p += 2; } + else if ((type & CPP_N_RADIX) == CPP_N_BINARY) + { + base = 2; + p += 2; + } /* We can add a digit to numbers strictly less than this without 
needing the precision and slowness of double integers. */ @@ -431,12 +458,25 @@ static cpp_num append_digit (cpp_num num, int digit, int base, size_t precision) { cpp_num result; - unsigned int shift = 3 + (base == 16); + unsigned int shift; bool overflow; cpp_num_part add_high, add_low; - /* Multiply by 8 or 16. Catching this overflow here means we don't + /* Multiply by 2, 8 or 16. Catching this overflow here means we don't need to worry about add_high overflowing. */ + switch (base) + { + case 2: + shift = 1; + break; + + case 16: + shift = 4; + break; + + default: + shift = 3; + } overflow = !!(num.high >> (PART_PRECISION - shift)); result.high = num.high << shift; result.low = num.low << shift; diff --git a/contrib/gcclibs/libcpp/include/cpplib.h b/contrib/gcclibs/libcpp/include/cpplib.h index 851a2e3..7fd73b8 100644 --- a/contrib/gcclibs/libcpp/include/cpplib.h +++ b/contrib/gcclibs/libcpp/include/cpplib.h @@ -745,6 +745,7 @@ struct cpp_num #define CPP_N_DECIMAL 0x0100 #define CPP_N_HEX 0x0200 #define CPP_N_OCTAL 0x0400 +#define CPP_N_BINARY 0x0800 #define CPP_N_UNSIGNED 0x1000 /* Properties. 
*/ #define CPP_N_IMAGINARY 0x2000 diff --git a/lib/libc/gen/errlst.c b/lib/libc/gen/errlst.c index 7b4fd62..f8fe968 100644 --- a/lib/libc/gen/errlst.c +++ b/lib/libc/gen/errlst.c @@ -34,6 +34,7 @@ static char sccsid[] = "@(#)errlst.c 8.2 (Berkeley) 11/16/93"; __FBSDID("$FreeBSD$"); #include <stdio.h> +#include "errlst.h" const char *const sys_errlist[] = { "No error: 0", /* 0 - ENOERROR */ @@ -156,3 +157,8 @@ const char *const sys_errlist[] = { "Previous owner died", /* 96 - EOWNERDEAD */ }; const int sys_nerr = sizeof(sys_errlist) / sizeof(sys_errlist[0]); + +#ifdef PIC +__strong_reference(sys_errlist, __hidden_sys_errlist); +__strong_reference(sys_nerr, __hidden_sys_nerr); +#endif diff --git a/lib/libc/include/errlst.h b/lib/libc/include/errlst.h new file mode 100644 index 0000000..4e9e29f --- /dev/null +++ b/lib/libc/include/errlst.h @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2013 Jilles Tjoelker + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __ERRLST_H__ +#define __ERRLST_H__ + +#include <sys/cdefs.h> + +#ifdef PIC +/* If the main executable imports these, do not use its copy from libc.so. */ +extern const char *const __hidden_sys_errlist[] __hidden; +extern const int __hidden_sys_nerr __hidden; +#else +#define __hidden_sys_errlist sys_errlist +#define __hidden_sys_nerr sys_nerr +#endif + +#endif /* __ERRLST_H__ */ diff --git a/lib/libc/stdio/xprintf_errno.c b/lib/libc/stdio/xprintf_errno.c index 0c2be46..3c831d1 100644 --- a/lib/libc/stdio/xprintf_errno.c +++ b/lib/libc/stdio/xprintf_errno.c @@ -34,6 +34,7 @@ #include <vis.h> #include <assert.h> #include <sys/time.h> +#include "errlst.h" #include "printf.h" int @@ -54,7 +55,7 @@ __printf_render_errno(struct __printf_io *io, const struct printf_info *pi __unu ret = 0; error = *((const int *)arg[0]); - if (error >= 0 && error < sys_nerr) { + if (error >= 0 && error < __hidden_sys_nerr) { p = strerror(error); return (__printf_out(io, pi, p, strlen(p))); } diff --git a/lib/libc/string/strerror.c b/lib/libc/string/strerror.c index e11b351..1d7a385 100644 --- a/lib/libc/string/strerror.c +++ b/lib/libc/string/strerror.c @@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$"); #include <string.h> #include <stdio.h> +#include "errlst.h" + #define UPREFIX "Unknown error" /* @@ -87,7 +89,7 @@ strerror_r(int errnum, char *strerrbuf, size_t buflen) catd = catopen("libc", NL_CAT_LOCALE); #endif - if (errnum < 0 || errnum >= 
sys_nerr) { + if (errnum < 0 || errnum >= __hidden_sys_nerr) { errstr(errnum, #if defined(NLS) catgets(catd, 1, 0xffff, UPREFIX), @@ -99,9 +101,9 @@ strerror_r(int errnum, char *strerrbuf, size_t buflen) } else { if (strlcpy(strerrbuf, #if defined(NLS) - catgets(catd, 1, errnum, sys_errlist[errnum]), + catgets(catd, 1, errnum, __hidden_sys_errlist[errnum]), #else - sys_errlist[errnum], + __hidden_sys_errlist[errnum], #endif buflen) >= buflen) retval = ERANGE; diff --git a/lib/libelf/libelf_data.c b/lib/libelf/libelf_data.c index 3fbb067..17808ef 100644 --- a/lib/libelf/libelf_data.c +++ b/lib/libelf/libelf_data.c @@ -84,13 +84,21 @@ _libelf_xlate_shtype(uint32_t sht) case SHT_SUNW_dof: return (ELF_T_BYTE); #endif + case SHT_ARM_PREEMPTMAP: + /* FALLTHROUGH */ + case SHT_ARM_ATTRIBUTES: + /* FALLTHROUGH */ + case SHT_ARM_DEBUGOVERLAY: + /* FALLTHROUGH */ + case SHT_ARM_OVERLAYSECTION: + /* FALLTHROUGH */ case SHT_MIPS_DWARF: /* FALLTHROUGH */ case SHT_MIPS_REGINFO: /* FALLTHROUGH */ case SHT_MIPS_OPTIONS: /* FALLTHROUGH */ - case SHT_AMD64_UNWIND: /* == SHT_IA_64_UNWIND */ + case SHT_AMD64_UNWIND: /* == SHT_IA_64_UNWIND == SHT_ARM_EXIDX */ return (ELF_T_BYTE); default: return (-1); diff --git a/share/examples/scsi_target/scsi_target.c b/share/examples/scsi_target/scsi_target.c index 1a7a061..0609ce1 100644 --- a/share/examples/scsi_target/scsi_target.c +++ b/share/examples/scsi_target/scsi_target.c @@ -365,7 +365,7 @@ init_ccbs() for (i = 0; i < MAX_INITIATORS; i++) { struct ccb_accept_tio *atio; struct atio_descr *a_descr; - struct ccb_immed_notify *inot; + struct ccb_immediate_notify *inot; atio = (struct ccb_accept_tio *)malloc(sizeof(*atio)); if (atio == NULL) { @@ -382,7 +382,7 @@ init_ccbs() atio->ccb_h.targ_descr = a_descr; send_ccb((union ccb *)atio, /*priority*/1); - inot = (struct ccb_immed_notify *)malloc(sizeof(*inot)); + inot = (struct ccb_immediate_notify *)malloc(sizeof(*inot)); if (inot == NULL) { warn("malloc INOT"); return (-1); @@ -593,7 +593,7 @@ 
handle_read() oo += run_queue(c_descr->atio); break; } - case XPT_IMMED_NOTIFY: + case XPT_IMMEDIATE_NOTIFY: /* INOTs are handled with priority */ TAILQ_INSERT_HEAD(&work_queue, &ccb->ccb_h, periph_links.tqe); @@ -903,7 +903,7 @@ free_ccb(union ccb *ccb) case XPT_ACCEPT_TARGET_IO: free(ccb->ccb_h.targ_descr); /* FALLTHROUGH */ - case XPT_IMMED_NOTIFY: + case XPT_IMMEDIATE_NOTIFY: default: free(ccb); break; diff --git a/share/man/man4/vtnet.4 b/share/man/man4/vtnet.4 index 8d4d202..c7b2189 100644 --- a/share/man/man4/vtnet.4 +++ b/share/man/man4/vtnet.4 @@ -69,14 +69,30 @@ prompt before booting the kernel or stored in .Xr loader.conf 5 . .Bl -tag -width "xxxxxx" .It Va hw.vtnet.csum_disable +.It Va hw.vtnet. Ns Ar X Ns Va .csum_disable This tunable disables receive and send checksum offload. The default value is 0. .It Va hw.vtnet.tso_disable +.It Va hw.vtnet. Ns Ar X Ns Va .tso_disable This tunable disables TSO. The default value is 0. .It Va hw.vtnet.lro_disable +.It Va hw.vtnet. Ns Ar X Ns Va .lro_disable This tunable disables LRO. The default value is 0. +.It Va hw.vtnet.mq_disable +.It Va hw.vtnet. Ns Ar X Ns Va .mq_disable +This tunable disables multiqueue. +The default value is 0. +.It Va hw.vtnet.mq_max_pairs +.It Va hw.vtnet. Ns Ar X Ns Va .mq_max_pairs +This tunable sets the maximum number of transmit and receive queue pairs. +Multiple queues are only supported when the Multiqueue feature is negotiated. +This driver supports a maximum of 8 queue pairs. +The number of queue pairs used is the lesser of the maximum supported by the +driver and the hypervisor, the number of CPUs present in the guest, and this +tunable if not zero. +The default value is 0. 
.El .Sh SEE ALSO .Xr arp 4 , diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e868cf5..79ec5ed 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -160,11 +160,11 @@ IDTVEC(xen_intr_upcall) SUPERALIGN_TEXT global_invltlb: - movl %cr4,%eax - andl $~0x80,%eax - movl %eax,%cr4 - orl $0x80,%eax - movl %eax,%cr4 + movq %cr4,%rax + andq $~0x80,%rax /* PGE */ + movq %rax,%cr4 + orq $0x80,%rax + movq %rax,%cr4 invltlb_ret_clear_pm_save: movq smp_tlb_pmap,%rdx testq %rdx,%rdx diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index a134e10..d905961 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -762,7 +762,6 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* Initialize the PAT MSR. */ pmap_init_pat(); -#ifdef SMP /* Initialize TLB Context Id. */ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { @@ -773,8 +772,10 @@ pmap_bootstrap(vm_paddr_t *firstaddr) invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; kernel_pmap->pm_pcid = 0; - } else +#ifndef SMP + pmap_pcid_enabled = 0; #endif + } else pmap_pcid_enabled = 0; } diff --git a/sys/cam/ctl/scsi_ctl.c b/sys/cam/ctl/scsi_ctl.c index 97200ca..45acdba 100644 --- a/sys/cam/ctl/scsi_ctl.c +++ b/sys/cam/ctl/scsi_ctl.c @@ -961,23 +961,23 @@ ctlfestart(struct cam_periph *periph, union ccb *start_ccb) /* * Valid combinations: - * - CAM_SEND_STATUS, SCATTER_VALID = 0, dxfer_len = 0, + * - CAM_SEND_STATUS, CAM_DATA_SG = 0, dxfer_len = 0, * sglist_cnt = 0 - * - CAM_SEND_STATUS = 0, SCATTER_VALID = 0, dxfer_len != 0, + * - CAM_SEND_STATUS = 0, CAM_DATA_SG = 0, dxfer_len != 0, * sglist_cnt = 0 - * - CAM_SEND_STATUS = 0, SCATTER_VALID, dxfer_len != 0, + * - CAM_SEND_STATUS = 0, CAM_DATA_SG, dxfer_len != 0, * sglist_cnt != 0 */ #ifdef CTLFEDEBUG if (((flags & CAM_SEND_STATUS) - && (((flags & CAM_SCATTER_VALID) != 0) + && (((flags & CAM_DATA_SG) != 0) || (dxfer_len != 
0) || (csio->sglist_cnt != 0))) || (((flags & CAM_SEND_STATUS) == 0) && (dxfer_len == 0)) - || ((flags & CAM_SCATTER_VALID) + || ((flags & CAM_DATA_SG) && (csio->sglist_cnt == 0)) - || (((flags & CAM_SCATTER_VALID) == 0) + || (((flags & CAM_DATA_SG) == 0) && (csio->sglist_cnt != 0))) { printf("%s: tag %04x cdb %02x flags %#x dxfer_len " "%d sg %u\n", __func__, atio->tag_id, diff --git a/sys/cam/scsi/scsi_enc.c b/sys/cam/scsi/scsi_enc.c index bb7a1a0..775d8f6 100644 --- a/sys/cam/scsi/scsi_enc.c +++ b/sys/cam/scsi/scsi_enc.c @@ -56,6 +56,8 @@ __FBSDID("$FreeBSD$"); #include <cam/scsi/scsi_enc.h> #include <cam/scsi/scsi_enc_internal.h> +#include <opt_ses.h> + MALLOC_DEFINE(M_SCSIENC, "SCSI ENC", "SCSI ENC buffers"); /* Enclosure type independent driver */ @@ -719,12 +721,12 @@ enc_type(struct ccb_getdev *cgd) return (ENC_NONE); } -#ifdef ENC_ENABLE_PASSTHROUGH +#ifdef SES_ENABLE_PASSTHROUGH if ((iqd[6] & 0x40) && (iqd[2] & 0x7) >= 2) { /* * PassThrough Device. */ - return (ENC_ENC_PASSTHROUGH); + return (ENC_SES_PASSTHROUGH); } #endif diff --git a/sys/cam/scsi/scsi_targ_bh.c b/sys/cam/scsi/scsi_targ_bh.c index 92a7ac8..bcf4eea 100644 --- a/sys/cam/scsi/scsi_targ_bh.c +++ b/sys/cam/scsi/scsi_targ_bh.c @@ -283,16 +283,13 @@ targbhenlun(struct cam_periph *periph) xpt_setup_ccb(&atio->ccb_h, periph->path, CAM_PRIORITY_NORMAL); atio->ccb_h.func_code = XPT_ACCEPT_TARGET_IO; atio->ccb_h.cbfcnp = targbhdone; - xpt_action((union ccb *)atio); - status = atio->ccb_h.status; - if (status != CAM_REQ_INPROG) { - targbhfreedescr(atio->ccb_h.ccb_descr); - free(atio, M_SCSIBH); - break; - } ((struct targbh_cmd_desc*)atio->ccb_h.ccb_descr)->atio_link = softc->accept_tio_list; softc->accept_tio_list = atio; + xpt_action((union ccb *)atio); + status = atio->ccb_h.status; + if (status != CAM_REQ_INPROG) + break; } if (i == 0) { @@ -308,10 +305,10 @@ targbhenlun(struct cam_periph *periph) * so the SIM can tell us of asynchronous target mode events. 
*/ for (i = 0; i < MAX_ACCEPT; i++) { - struct ccb_immed_notify *inot; + struct ccb_immediate_notify *inot; - inot = (struct ccb_immed_notify*)malloc(sizeof(*inot), M_SCSIBH, - M_NOWAIT); + inot = (struct ccb_immediate_notify*)malloc(sizeof(*inot), + M_SCSIBH, M_NOWAIT); if (inot == NULL) { status = CAM_RESRC_UNAVAIL; @@ -319,16 +316,14 @@ targbhenlun(struct cam_periph *periph) } xpt_setup_ccb(&inot->ccb_h, periph->path, CAM_PRIORITY_NORMAL); - inot->ccb_h.func_code = XPT_IMMED_NOTIFY; + inot->ccb_h.func_code = XPT_IMMEDIATE_NOTIFY; inot->ccb_h.cbfcnp = targbhdone; + SLIST_INSERT_HEAD(&softc->immed_notify_slist, &inot->ccb_h, + periph_links.sle); xpt_action((union ccb *)inot); status = inot->ccb_h.status; - if (status != CAM_REQ_INPROG) { - free(inot, M_SCSIBH); + if (status != CAM_REQ_INPROG) break; - } - SLIST_INSERT_HEAD(&softc->immed_notify_slist, &inot->ccb_h, - periph_links.sle); } if (i == 0) { @@ -413,7 +408,9 @@ targbhctor(struct cam_periph *periph, void *arg) periph->softc = softc; softc->init_level++; - return (targbhenlun(periph)); + if (targbhenlun(periph) != CAM_REQ_CMP) + cam_periph_invalidate(periph); + return (CAM_REQ_CMP); } static void @@ -715,7 +712,7 @@ targbhdone(struct cam_periph *periph, union ccb *done_ccb) } break; } - case XPT_IMMED_NOTIFY: + case XPT_IMMEDIATE_NOTIFY: { int frozen; diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c index 78e96fb..4b4ad78 100644 --- a/sys/cam/scsi/scsi_target.c +++ b/sys/cam/scsi/scsi_target.c @@ -551,6 +551,7 @@ targwrite(struct cdev *dev, struct uio *uio, int ioflag) switch (func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: + case XPT_IMMEDIATE_NOTIFY: cam_periph_lock(softc->periph); ccb = targgetccb(softc, func_code, priority); descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr; @@ -781,6 +782,7 @@ targdone(struct cam_periph *periph, union ccb *done_ccb) switch (done_ccb->ccb_h.func_code) { /* All FC_*_QUEUED CCBs go back to userland */ case XPT_IMMED_NOTIFY: + case 
XPT_IMMEDIATE_NOTIFY: case XPT_ACCEPT_TARGET_IO: case XPT_CONT_TARGET_IO: TAILQ_INSERT_TAIL(&softc->user_ccb_queue, &done_ccb->ccb_h, @@ -961,6 +963,7 @@ targfreeccb(struct targ_softc *softc, union ccb *ccb) switch (ccb->ccb_h.func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: + case XPT_IMMEDIATE_NOTIFY: CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb)); free(ccb, M_TARG); break; @@ -1131,6 +1134,9 @@ targccblen(xpt_opcode func_code) case XPT_IMMED_NOTIFY: len = sizeof(struct ccb_immed_notify); break; + case XPT_IMMEDIATE_NOTIFY: + len = sizeof(struct ccb_immediate_notify); + break; case XPT_REL_SIMQ: len = sizeof(struct ccb_relsim); break; diff --git a/sys/cddl/dev/dtrace/powerpc/dtrace_isa.c b/sys/cddl/dev/dtrace/powerpc/dtrace_isa.c index 3793adf..9582c97 100644 --- a/sys/cddl/dev/dtrace/powerpc/dtrace_isa.c +++ b/sys/cddl/dev/dtrace/powerpc/dtrace_isa.c @@ -349,50 +349,84 @@ zero: uint64_t dtrace_getarg(int arg, int aframes) { - return (0); -} - -#ifdef notyet -{ - int depth = 0; - register_t sp; - vm_offset_t callpc; - pc_t caller = (pc_t) solaris_cpu[curcpu].cpu_dtrace_caller; - - if (intrpc != 0) - pcstack[depth++] = (pc_t) intrpc; - - aframes++; - - sp = dtrace_getfp(); + uintptr_t val; + uintptr_t *fp = (uintptr_t *)dtrace_getfp(); + uintptr_t *stack; + int i; - while (depth < pcstack_limit) { - if (!INKERNEL((long) frame)) - break; + /* + * A total of 8 arguments are passed via registers; any argument with + * index of 7 or lower is therefore in a register. + */ + int inreg = 7; - callpc = *(void **)(sp + RETURN_OFFSET); + for (i = 1; i <= aframes; i++) { + fp = (uintptr_t *)*fp; - if (!INKERNEL(callpc)) - break; + /* + * On ppc32 AIM, and booke, trapexit() is the immediately following + * label. On ppc64 AIM trapexit() follows a nop. 
+ */ + if (((long)(fp[1]) == (long)trapexit) || + (((long)(fp[1]) + 4 == (long)trapexit))) { + /* + * In the case of powerpc, we will use the pointer to the regs + * structure that was pushed when we took the trap. To get this + * structure, we must increment beyond the frame structure. If the + * argument that we're seeking is passed on the stack, we'll pull + * the true stack pointer out of the saved registers and decrement + * our argument by the number of arguments passed in registers; if + * the argument we're seeking is passed in regsiters, we can just + * load it directly. + */ +#ifdef __powerpc64__ + struct reg *rp = (struct reg *)((uintptr_t)fp[0] + 48); +#else + struct reg *rp = (struct reg *)((uintptr_t)fp[0] + 8); +#endif - if (aframes > 0) { - aframes--; - if ((aframes == 0) && (caller != 0)) { - pcstack[depth++] = caller; + if (arg <= inreg) { + stack = &rp->fixreg[3]; + } else { + stack = (uintptr_t *)(rp->fixreg[1]); + arg -= inreg; } - } - else { - pcstack[depth++] = callpc; + goto load; } - sp = *(void **)sp; } - for (; depth < pcstack_limit; depth++) { - pcstack[depth] = 0; + /* + * We know that we did not come through a trap to get into + * dtrace_probe() -- the provider simply called dtrace_probe() + * directly. As this is the case, we need to shift the argument + * that we're looking for: the probe ID is the first argument to + * dtrace_probe(), so the argument n will actually be found where + * one would expect to find argument (n + 1). + */ + arg++; + + if (arg <= inreg) { + /* + * This shouldn't happen. If the argument is passed in a + * register then it should have been, well, passed in a + * register... 
+ */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); } + + arg -= (inreg + 1); + stack = fp + 2; + +load: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[arg]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); + return (0); } -#endif int dtrace_getstackdepth(int aframes) diff --git a/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c b/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c index e6f1ec0..d22f207 100644 --- a/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c @@ -51,6 +51,8 @@ extern int dtrace_in_probe; extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); +extern void dtrace_getnanotime(struct timespec *tsp); + int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); @@ -63,13 +65,13 @@ typedef struct dtrace_invop_hdlr { dtrace_invop_hdlr_t *dtrace_invop_hdlr; int -dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) +dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t arg0) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) - if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0) + if ((rval = hdlr->dtih_func(addr, stack, arg0)) != 0) return (rval); return (0); @@ -134,7 +136,7 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) CPU_SETOF(cpu, &cpus); smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_no_rendevous_barrier, arg); } static void @@ -145,9 +147,82 @@ dtrace_sync_func(void) void dtrace_sync(void) { - dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); + dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } +static int64_t tgt_cpu_tsc; +static int64_t hst_cpu_tsc; +static int64_t timebase_skew[MAXCPU]; +static uint64_t nsec_scale; + +/* See below for the explanation of this macro. 
*/ +/* This is taken from the amd64 dtrace_subr, to provide a synchronized timer + * between multiple processors in dtrace. Since PowerPC Timebases can be much + * lower than x86, the scale shift is 26 instead of 28, allowing for a 15.63MHz + * timebase. + */ +#define SCALE_SHIFT 26 + +static void +dtrace_gethrtime_init_cpu(void *arg) +{ + uintptr_t cpu = (uintptr_t) arg; + + if (cpu == curcpu) + tgt_cpu_tsc = mftb(); + else + hst_cpu_tsc = mftb(); +} + +static void +dtrace_gethrtime_init(void *arg) +{ + struct pcpu *pc; + uint64_t tb_f; + cpuset_t map; + int i; + + tb_f = cpu_tickrate(); + + /* + * The following line checks that nsec_scale calculated below + * doesn't overflow 32-bit unsigned integer, so that it can multiply + * another 32-bit integer without overflowing 64-bit. + * Thus minimum supported Timebase frequency is 15.63MHz. + */ + KASSERT(tb_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("Timebase frequency is too low")); + + /* + * We scale up NANOSEC/tb_f ratio to preserve as much precision + * as possible. + * 2^26 factor was chosen quite arbitrarily from practical + * considerations: + * - it supports TSC frequencies as low as 15.63MHz (see above); + */ + nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tb_f; + + /* The current CPU is the reference one. 
*/ + sched_pin(); + timebase_skew[curcpu] = 0; + CPU_FOREACH(i) { + if (i == curcpu) + continue; + + pc = pcpu_find(i); + CPU_SETOF(PCPU_GET(cpuid), &map); + CPU_SET(pc->pc_cpuid, &map); + + smp_rendezvous_cpus(map, NULL, + dtrace_gethrtime_init_cpu, + smp_no_rendevous_barrier, (void *)(uintptr_t) i); + + timebase_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; + } + sched_unpin(); +} + +SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); + /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have @@ -158,12 +233,21 @@ dtrace_sync(void) uint64_t dtrace_gethrtime() { - struct timespec curtime; - - nanouptime(&curtime); - - return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); + uint64_t timebase; + uint32_t lo; + uint32_t hi; + /* + * We split timebase value into lower and higher 32-bit halves and separately + * scale them with nsec_scale, then we scale them down by 2^28 + * (see nsec_scale calculations) taking into account 32-bit shift of + * the higher half and finally add. + */ + timebase = mftb() - timebase_skew[curcpu]; + lo = timebase; + hi = timebase >> 32; + return (((lo * nsec_scale) >> SCALE_SHIFT) + + ((hi * nsec_scale) << (32 - SCALE_SHIFT))); } uint64_t @@ -171,12 +255,12 @@ dtrace_gethrestime(void) { struct timespec curtime; - getnanotime(&curtime); + dtrace_getnanotime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } -/* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */ +/* Function to handle DTrace traps during probes. See powerpc/powerpc/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { @@ -196,34 +280,34 @@ dtrace_trap(struct trapframe *frame, u_int type) * All the rest will be handled in the usual way. */ switch (type) { - /* Page fault. */ - case EXC_DSI: - case EXC_DSE: - /* Flag a bad address. 
*/ - cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; - cpu_core[curcpu].cpuc_dtrace_illval = frame->cpu.aim.dar; - - /* - * Offset the instruction pointer to the instruction - * following the one causing the fault. - */ - frame->srr0 += sizeof(int); - return (1); - case EXC_ISI: - case EXC_ISE: - /* Flag a bad address. */ - cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; - cpu_core[curcpu].cpuc_dtrace_illval = frame->srr0; - - /* - * Offset the instruction pointer to the instruction - * following the one causing the fault. - */ - frame->srr0 += sizeof(int); - return (1); - default: - /* Handle all other traps in the usual way. */ - break; + /* Page fault. */ + case EXC_DSI: + case EXC_DSE: + /* Flag a bad address. */ + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; + cpu_core[curcpu].cpuc_dtrace_illval = frame->cpu.aim.dar; + + /* + * Offset the instruction pointer to the instruction + * following the one causing the fault. + */ + frame->srr0 += sizeof(int); + return (1); + case EXC_ISI: + case EXC_ISE: + /* Flag a bad address. */ + cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; + cpu_core[curcpu].cpuc_dtrace_illval = frame->srr0; + + /* + * Offset the instruction pointer to the instruction + * following the one causing the fault. + */ + frame->srr0 += sizeof(int); + return (1); + default: + /* Handle all other traps in the usual way. 
*/ + break; } } @@ -237,28 +321,29 @@ dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, - (uintptr_t)epid, - (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); + (uintptr_t)epid, + (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { switch (dtrace_invop(frame->srr0, (uintptr_t *)frame, frame->fixreg[3])) { - case DTRACE_INVOP_JUMP: - break; - case DTRACE_INVOP_BCTR: - frame->srr0 = frame->ctr; - break; - case DTRACE_INVOP_BLR: - frame->srr0 = frame->lr; - break; - case DTRACE_INVOP_MFLR_R0: - frame->fixreg[0] = frame->lr ; - break; - default: - return (-1); - break; + case DTRACE_INVOP_JUMP: + break; + case DTRACE_INVOP_BCTR: + frame->srr0 = frame->ctr; + break; + case DTRACE_INVOP_BLR: + frame->srr0 = frame->lr; + break; + case DTRACE_INVOP_MFLR_R0: + frame->fixreg[0] = frame->lr; + frame->srr0 = frame->srr0 + 4; + break; + default: + return (-1); + break; } return (0); diff --git a/sys/cddl/dev/fbt/fbt_powerpc.c b/sys/cddl/dev/fbt/fbt_powerpc.c index bee3dc7..cdaa06a 100644 --- a/sys/cddl/dev/fbt/fbt_powerpc.c +++ b/sys/cddl/dev/fbt/fbt_powerpc.c @@ -57,6 +57,7 @@ #include <sys/sysproto.h> #include <sys/uio.h> #include <sys/unistd.h> +#include <machine/md_var.h> #include <machine/stdarg.h> #include <sys/dtrace.h> @@ -172,7 +173,11 @@ fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval) tmp = fbt->fbtp_savedval & FBT_BR_MASK; /* Sign extend. */ if (tmp & 0x02000000) - tmp |= 0xFC000000; +#ifdef __powerpc64__ + tmp |= 0xfffffffffc000000ULL; +#else + tmp |= 0xfc000000UL; +#endif frame->srr0 += tmp; } cpu->cpu_dtrace_caller = 0; @@ -193,9 +198,12 @@ fbt_provide_module_function(linker_file_t lf, int symindx, const char *name = symval->name; fbt_probe_t *fbt, *retfbt; int j; - int size; u_int32_t *instr, *limit; + /* PowerPC64 uses '.' prefixes on symbol names, ignore it. 
*/ + if (name[0] == '.') + name++; + if (strncmp(name, "dtrace_", 7) == 0 && strncmp(name, "dtrace_safe_", 12) != 0) { /* @@ -210,8 +218,6 @@ fbt_provide_module_function(linker_file_t lf, int symindx, if (name[0] == '_' && name[1] == '_') return (0); - size = symval->size; - instr = (u_int32_t *) symval->value; limit = (u_int32_t *) symval->value + symval->size; @@ -219,7 +225,7 @@ fbt_provide_module_function(linker_file_t lf, int symindx, if (*instr == FBT_MFLR_R0) break; - if (*instr != FBT_MFLR_R0); + if (*instr != FBT_MFLR_R0) return (0); fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO); @@ -264,9 +270,6 @@ again: } } - if (*instr == FBT_MFLR_R0) - return (0); - if (*instr != FBT_MTLR_R0) { instr++; goto again; @@ -291,7 +294,7 @@ again: if (retfbt == NULL) { fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, 3, fbt); + name, FBT_RETURN, 5, fbt); } else { retfbt->fbtp_next = fbt; fbt->fbtp_id = retfbt->fbtp_id; @@ -317,7 +320,7 @@ again: lf->fbt_nentries++; - instr += size; + instr += 4; goto again; } @@ -434,6 +437,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) for (; fbt != NULL; fbt = fbt->fbtp_next) { *fbt->fbtp_patchpoint = fbt->fbtp_patchval; + __syncicache(fbt->fbtp_patchpoint, 4); } } @@ -449,8 +453,10 @@ fbt_disable(void *arg, dtrace_id_t id, void *parg) if ((ctl->loadcnt != fbt->fbtp_loadcnt)) return; - for (; fbt != NULL; fbt = fbt->fbtp_next) + for (; fbt != NULL; fbt = fbt->fbtp_next) { *fbt->fbtp_patchpoint = fbt->fbtp_savedval; + __syncicache(fbt->fbtp_patchpoint, 4); + } } static void @@ -464,8 +470,10 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg) if ((ctl->loadcnt != fbt->fbtp_loadcnt)) return; - for (; fbt != NULL; fbt = fbt->fbtp_next) + for (; fbt != NULL; fbt = fbt->fbtp_next) { *fbt->fbtp_patchpoint = fbt->fbtp_savedval; + __syncicache(fbt->fbtp_patchpoint, 4); + } } static void @@ -479,15 +487,16 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg) if ((ctl->loadcnt != fbt->fbtp_loadcnt)) 
return; - for (; fbt != NULL; fbt = fbt->fbtp_next) + for (; fbt != NULL; fbt = fbt->fbtp_next) { *fbt->fbtp_patchpoint = fbt->fbtp_patchval; + __syncicache(fbt->fbtp_patchpoint, 4); + } } static int fbt_ctfoff_init(modctl_t *lf, linker_ctf_t *lc) { const Elf_Sym *symp = lc->symtab;; - const char *name; const ctf_header_t *hp = (const ctf_header_t *) lc->ctftab; const uint8_t *ctfdata = lc->ctftab + sizeof(ctf_header_t); int i; @@ -519,11 +528,6 @@ fbt_ctfoff_init(modctl_t *lf, linker_ctf_t *lc) continue; } - if (symp->st_name < lc->strcnt) - name = lc->strtab + symp->st_name; - else - name = "(?)"; - switch (ELF_ST_TYPE(symp->st_info)) { case STT_OBJECT: if (objtoff >= hp->cth_funcoff || @@ -690,6 +694,8 @@ fbt_typoff_init(linker_ctf_t *lc) pop[kind]++; } + /* account for a sentinel value below */ + ctf_typemax++; *lc->typlenp = ctf_typemax; if ((xp = malloc(sizeof(uint32_t) * ctf_typemax, M_LINKER, M_ZERO | M_WAITOK)) == NULL) @@ -1171,6 +1177,11 @@ fbt_getargdesc(void *arg __unused, dtrace_id_t id __unused, void *parg, dtrace_a uint32_t offset; ushort_t info, kind, n; + if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) { + (void) strcpy(desc->dtargd_native, "int"); + return; + } + desc->dtargd_ndx = DTRACE_ARGNONE; /* Get a pointer to the CTF data and it's length. */ @@ -1221,12 +1232,19 @@ fbt_getargdesc(void *arg __unused, dtrace_id_t id __unused, void *parg, dtrace_a return; } - /* Check if the requested argument doesn't exist. */ - if (ndx >= n) - return; + if (fbt->fbtp_roffset != 0) { + /* Only return type is available for args[1] in return probe. */ + if (ndx > 1) + return; + ASSERT(ndx == 1); + } else { + /* Check if the requested argument doesn't exist. */ + if (ndx >= n) + return; - /* Skip the return type and arguments up to the one requested. */ - dp += ndx + 1; + /* Skip the return type and arguments up to the one requested. 
*/ + dp += ndx + 1; + } if (fbt_type_name(&lc, *dp, desc->dtargd_native, sizeof(desc->dtargd_native)) > 0) desc->dtargd_ndx = ndx; @@ -1234,6 +1252,15 @@ fbt_getargdesc(void *arg __unused, dtrace_id_t id __unused, void *parg, dtrace_a return; } +static int +fbt_linker_file_cb(linker_file_t lf, void *arg) +{ + + fbt_provide_module(arg, lf); + + return (0); +} + static void fbt_load(void *dummy) { @@ -1257,6 +1284,9 @@ fbt_load(void *dummy) if (dtrace_register("fbt", &fbt_attr, DTRACE_PRIV_USER, NULL, &fbt_pops, NULL, &fbt_id) != 0) return; + + /* Create probes for the kernel and already-loaded modules. */ + linker_file_foreach(fbt_linker_file_cb, NULL); } diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c index 89604d1..f757394 100644 --- a/sys/dev/virtio/network/if_vtnet.c +++ b/sys/dev/virtio/network/if_vtnet.c @@ -29,10 +29,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#ifdef HAVE_KERNEL_OPTION_HEADERS -#include "opt_device_polling.h" -#endif - #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -46,6 +42,9 @@ __FBSDID("$FreeBSD$"); #include <sys/sglist.h> #include <sys/lock.h> #include <sys/mutex.h> +#include <sys/taskqueue.h> +#include <sys/smp.h> +#include <machine/smp.h> #include <vm/uma.h> @@ -63,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip6.h> +#include <netinet6/ip6_var.h> #include <netinet/udp.h> #include <netinet/tcp.h> #include <netinet/sctp.h> @@ -79,6 +79,9 @@ __FBSDID("$FreeBSD$"); #include "virtio_if.h" +#include "opt_inet.h" +#include "opt_inet6.h" + static int vtnet_modevent(module_t, int, void *); static int vtnet_probe(device_t); @@ -87,82 +90,139 @@ static int vtnet_detach(device_t); static int vtnet_suspend(device_t); static int vtnet_resume(device_t); static int vtnet_shutdown(device_t); +static int vtnet_attach_completed(device_t); static int vtnet_config_change(device_t); static void vtnet_negotiate_features(struct 
vtnet_softc *); +static void vtnet_setup_features(struct vtnet_softc *); +static int vtnet_init_rxq(struct vtnet_softc *, int); +static int vtnet_init_txq(struct vtnet_softc *, int); +static int vtnet_alloc_rxtx_queues(struct vtnet_softc *); +static void vtnet_free_rxtx_queues(struct vtnet_softc *); +static int vtnet_alloc_rx_filters(struct vtnet_softc *); +static void vtnet_free_rx_filters(struct vtnet_softc *); static int vtnet_alloc_virtqueues(struct vtnet_softc *); -static void vtnet_get_hwaddr(struct vtnet_softc *); -static void vtnet_set_hwaddr(struct vtnet_softc *); -static int vtnet_is_link_up(struct vtnet_softc *); -static void vtnet_update_link_status(struct vtnet_softc *); -static void vtnet_watchdog(struct vtnet_softc *); +static int vtnet_setup_interface(struct vtnet_softc *); static int vtnet_change_mtu(struct vtnet_softc *, int); static int vtnet_ioctl(struct ifnet *, u_long, caddr_t); -static int vtnet_init_rx_vq(struct vtnet_softc *); -static void vtnet_free_rx_mbufs(struct vtnet_softc *); -static void vtnet_free_tx_mbufs(struct vtnet_softc *); -static void vtnet_free_ctrl_vq(struct vtnet_softc *); - -#ifdef DEVICE_POLLING -static poll_handler_t vtnet_poll; -#endif - -static struct mbuf * vtnet_alloc_rxbuf(struct vtnet_softc *, int, - struct mbuf **); -static int vtnet_replace_rxbuf(struct vtnet_softc *, +static int vtnet_rxq_populate(struct vtnet_rxq *); +static void vtnet_rxq_free_mbufs(struct vtnet_rxq *); +static struct mbuf * + vtnet_rx_alloc_buf(struct vtnet_softc *, int , struct mbuf **); +static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *, struct mbuf *, int); -static int vtnet_newbuf(struct vtnet_softc *); -static void vtnet_discard_merged_rxbuf(struct vtnet_softc *, int); -static void vtnet_discard_rxbuf(struct vtnet_softc *, struct mbuf *); -static int vtnet_enqueue_rxbuf(struct vtnet_softc *, struct mbuf *); -static void vtnet_vlan_tag_remove(struct mbuf *); -static int vtnet_rx_csum(struct vtnet_softc *, struct mbuf *, 
+static int vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int); +static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *); +static int vtnet_rxq_new_buf(struct vtnet_rxq *); +static int vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *, + struct virtio_net_hdr *); +static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int); +static void vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *); +static int vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int); +static void vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); -static int vtnet_rxeof_merged(struct vtnet_softc *, struct mbuf *, int); -static int vtnet_rxeof(struct vtnet_softc *, int, int *); +static int vtnet_rxq_eof(struct vtnet_rxq *); static void vtnet_rx_vq_intr(void *); +static void vtnet_rxq_tq_intr(void *, int); -static void vtnet_txeof(struct vtnet_softc *); -static struct mbuf * vtnet_tx_offload(struct vtnet_softc *, struct mbuf *, +static void vtnet_txq_free_mbufs(struct vtnet_txq *); +static int vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *, + int *, int *, int *); +static int vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int, + int, struct virtio_net_hdr *); +static struct mbuf * + vtnet_txq_offload(struct vtnet_txq *, struct mbuf *, struct virtio_net_hdr *); -static int vtnet_enqueue_txbuf(struct vtnet_softc *, struct mbuf **, +static int vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **, struct vtnet_tx_header *); -static int vtnet_encap(struct vtnet_softc *, struct mbuf **); -static void vtnet_start_locked(struct ifnet *); +static int vtnet_txq_encap(struct vtnet_txq *, struct mbuf **); +#ifdef VTNET_LEGACY_TX +static void vtnet_start_locked(struct vtnet_txq *, struct ifnet *); static void vtnet_start(struct ifnet *); -static void vtnet_tick(void *); +#else +static int vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *); +static int vtnet_txq_mq_start(struct ifnet *, struct mbuf *); +static 
void vtnet_txq_tq_deferred(void *, int); +#endif +static void vtnet_txq_tq_intr(void *, int); +static void vtnet_txq_eof(struct vtnet_txq *); static void vtnet_tx_vq_intr(void *); +static void vtnet_tx_start_all(struct vtnet_softc *); + +#ifndef VTNET_LEGACY_TX +static void vtnet_qflush(struct ifnet *); +#endif + +static int vtnet_watchdog(struct vtnet_txq *); +static void vtnet_rxq_accum_stats(struct vtnet_rxq *, + struct vtnet_rxq_stats *); +static void vtnet_txq_accum_stats(struct vtnet_txq *, + struct vtnet_txq_stats *); +static void vtnet_accumulate_stats(struct vtnet_softc *); +static void vtnet_tick(void *); +static void vtnet_start_taskqueues(struct vtnet_softc *); +static void vtnet_free_taskqueues(struct vtnet_softc *); +static void vtnet_drain_taskqueues(struct vtnet_softc *); + +static void vtnet_drain_rxtx_queues(struct vtnet_softc *); +static void vtnet_stop_rendezvous(struct vtnet_softc *); static void vtnet_stop(struct vtnet_softc *); +static int vtnet_virtio_reinit(struct vtnet_softc *); +static void vtnet_init_rx_filters(struct vtnet_softc *); +static int vtnet_init_rx_queues(struct vtnet_softc *); +static int vtnet_init_tx_queues(struct vtnet_softc *); +static int vtnet_init_rxtx_queues(struct vtnet_softc *); +static void vtnet_set_active_vq_pairs(struct vtnet_softc *); static int vtnet_reinit(struct vtnet_softc *); static void vtnet_init_locked(struct vtnet_softc *); static void vtnet_init(void *); +static void vtnet_free_ctrl_vq(struct vtnet_softc *); static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *, struct sglist *, int, int); - -static void vtnet_rx_filter(struct vtnet_softc *sc); +static int vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *); +static int vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t); static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int); static int vtnet_set_promisc(struct vtnet_softc *, int); static int vtnet_set_allmulti(struct vtnet_softc *, int); +static void vtnet_attach_disable_promisc(struct 
vtnet_softc *); +static void vtnet_rx_filter(struct vtnet_softc *); static void vtnet_rx_filter_mac(struct vtnet_softc *); - static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_rx_filter_vlan(struct vtnet_softc *); -static void vtnet_set_vlan_filter(struct vtnet_softc *, int, uint16_t); +static void vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_register_vlan(void *, struct ifnet *, uint16_t); static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t); +static int vtnet_is_link_up(struct vtnet_softc *); +static void vtnet_update_link_status(struct vtnet_softc *); static int vtnet_ifmedia_upd(struct ifnet *); static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *); +static void vtnet_get_hwaddr(struct vtnet_softc *); +static void vtnet_set_hwaddr(struct vtnet_softc *); +static void vtnet_vlan_tag_remove(struct mbuf *); -static void vtnet_add_statistics(struct vtnet_softc *); - -static int vtnet_enable_rx_intr(struct vtnet_softc *); -static int vtnet_enable_tx_intr(struct vtnet_softc *); -static void vtnet_disable_rx_intr(struct vtnet_softc *); -static void vtnet_disable_tx_intr(struct vtnet_softc *); +static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *, + struct sysctl_oid_list *, struct vtnet_rxq *); +static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *, + struct sysctl_oid_list *, struct vtnet_txq *); +static void vtnet_setup_queue_sysctl(struct vtnet_softc *); +static void vtnet_setup_sysctl(struct vtnet_softc *); + +static int vtnet_rxq_enable_intr(struct vtnet_rxq *); +static void vtnet_rxq_disable_intr(struct vtnet_rxq *); +static int vtnet_txq_enable_intr(struct vtnet_txq *); +static void vtnet_txq_disable_intr(struct vtnet_txq *); +static void vtnet_enable_rx_interrupts(struct vtnet_softc *); +static void vtnet_enable_tx_interrupts(struct vtnet_softc *); +static void vtnet_enable_interrupts(struct vtnet_softc *); +static void 
vtnet_disable_rx_interrupts(struct vtnet_softc *); +static void vtnet_disable_tx_interrupts(struct vtnet_softc *); +static void vtnet_disable_interrupts(struct vtnet_softc *); + +static int vtnet_tunable_int(struct vtnet_softc *, const char *, int); /* Tunables. */ static int vtnet_csum_disable = 0; @@ -171,16 +231,25 @@ static int vtnet_tso_disable = 0; TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable); static int vtnet_lro_disable = 0; TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable); +static int vtnet_mq_disable = 0; +TUNABLE_INT("hw.vtnet.mq_disable", &vtnet_mq_disable); +static int vtnet_mq_max_pairs = 0; +TUNABLE_INT("hw.vtnet.mq_max_pairs", &vtnet_mq_max_pairs); +static int vtnet_rx_process_limit = 512; +TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit); /* - * Reducing the number of transmit completed interrupts can - * improve performance. To do so, the define below keeps the - * Tx vq interrupt disabled and adds calls to vtnet_txeof() - * in the start and watchdog paths. The price to pay for this - * is the m_free'ing of transmitted mbufs may be delayed until - * the watchdog fires. + * Reducing the number of transmit completed interrupts can improve + * performance. To do so, the define below keeps the Tx vq interrupt + * disabled and adds calls to vtnet_txeof() in the start and watchdog + * paths. The price to pay for this is the m_free'ing of transmitted + * mbufs may be delayed until the watchdog fires. + * + * BMV: Reintroduce this later as a run-time option, if it makes + * sense after the EVENT_IDX feature is supported. 
+ * + * #define VTNET_TX_INTR_MODERATION */ -#define VTNET_TX_INTR_MODERATION static uma_zone_t vtnet_tx_header_zone; @@ -203,21 +272,25 @@ static struct virtio_feature_desc vtnet_feature_desc[] = { { VIRTIO_NET_F_CTRL_RX, "RxMode" }, { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" }, { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" }, + { VIRTIO_NET_F_GUEST_ANNOUNCE, "GuestAnnounce" }, + { VIRTIO_NET_F_MQ, "Multiqueue" }, + { VIRTIO_NET_F_CTRL_MAC_ADDR, "SetMacAddress" }, { 0, NULL } }; static device_method_t vtnet_methods[] = { /* Device methods. */ - DEVMETHOD(device_probe, vtnet_probe), - DEVMETHOD(device_attach, vtnet_attach), - DEVMETHOD(device_detach, vtnet_detach), - DEVMETHOD(device_suspend, vtnet_suspend), - DEVMETHOD(device_resume, vtnet_resume), - DEVMETHOD(device_shutdown, vtnet_shutdown), + DEVMETHOD(device_probe, vtnet_probe), + DEVMETHOD(device_attach, vtnet_attach), + DEVMETHOD(device_detach, vtnet_detach), + DEVMETHOD(device_suspend, vtnet_suspend), + DEVMETHOD(device_resume, vtnet_resume), + DEVMETHOD(device_shutdown, vtnet_shutdown), /* VirtIO methods. */ - DEVMETHOD(virtio_config_change, vtnet_config_change), + DEVMETHOD(virtio_attach_completed, vtnet_attach_completed), + DEVMETHOD(virtio_config_change, vtnet_config_change), DEVMETHOD_END }; @@ -282,56 +355,31 @@ static int vtnet_attach(device_t dev) { struct vtnet_softc *sc; - struct ifnet *ifp; - int tx_size, error; + int error; sc = device_get_softc(dev); sc->vtnet_dev = dev; - VTNET_LOCK_INIT(sc); - callout_init_mtx(&sc->vtnet_tick_ch, VTNET_MTX(sc), 0); - - ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, - vtnet_ifmedia_sts); - ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); - ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); - - vtnet_add_statistics(sc); - + /* Register our feature descriptions. 
*/ virtio_set_feature_desc(dev, vtnet_feature_desc); - vtnet_negotiate_features(sc); - - if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { - sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; - sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); - } else - sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); - - sc->vtnet_rx_mbuf_size = MCLBYTES; - sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc); - - if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { - sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; - if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) { - sc->vtnet_mac_filter = malloc( - sizeof(struct vtnet_mac_filter), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (sc->vtnet_mac_filter == NULL) { - device_printf(dev, - "cannot allocate mac filter table\n"); - error = ENOMEM; - goto fail; - } + VTNET_CORE_LOCK_INIT(sc); + callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0); - sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; - } + vtnet_setup_sysctl(sc); + vtnet_setup_features(sc); - if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) - sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; + error = vtnet_alloc_rx_filters(sc); + if (error) { + device_printf(dev, "cannot allocate Rx filters\n"); + goto fail; } - vtnet_get_hwaddr(sc); + error = vtnet_alloc_rxtx_queues(sc); + if (error) { + device_printf(dev, "cannot allocate queues\n"); + goto fail; + } error = vtnet_alloc_virtqueues(sc); if (error) { @@ -339,111 +387,21 @@ vtnet_attach(device_t dev) goto fail; } - ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) { - device_printf(dev, "cannot allocate ifnet structure\n"); - error = ENOSPC; + error = vtnet_setup_interface(sc); + if (error) { + device_printf(dev, "cannot setup interface\n"); goto fail; } - ifp->if_softc = sc; - if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_init = vtnet_init; - ifp->if_start = vtnet_start; - ifp->if_ioctl = vtnet_ioctl; - - sc->vtnet_rx_size = 
virtqueue_size(sc->vtnet_rx_vq); - sc->vtnet_rx_process_limit = sc->vtnet_rx_size; - - tx_size = virtqueue_size(sc->vtnet_tx_vq); - sc->vtnet_tx_size = tx_size; - IFQ_SET_MAXLEN(&ifp->if_snd, tx_size - 1); - ifp->if_snd.ifq_drv_maxlen = tx_size - 1; - IFQ_SET_READY(&ifp->if_snd); - - ether_ifattach(ifp, sc->vtnet_hwaddr); - - if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) - ifp->if_capabilities |= IFCAP_LINKSTATE; - - /* Tell the upper layer(s) we support long frames. */ - ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); - ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; - - if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { - ifp->if_capabilities |= IFCAP_TXCSUM; - - if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) - ifp->if_capabilities |= IFCAP_TSO4; - if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) - ifp->if_capabilities |= IFCAP_TSO6; - if (ifp->if_capabilities & IFCAP_TSO) - ifp->if_capabilities |= IFCAP_VLAN_HWTSO; - - if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) - sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; - } - - if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { - ifp->if_capabilities |= IFCAP_RXCSUM; - - if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || - virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) - ifp->if_capabilities |= IFCAP_LRO; - } - - if (ifp->if_capabilities & IFCAP_HWCSUM) { - /* - * VirtIO does not support VLAN tagging, but we can fake - * it by inserting and removing the 802.1Q header during - * transmit and receive. We are then able to do checksum - * offloading of VLAN frames. - */ - ifp->if_capabilities |= - IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; - } - - ifp->if_capenable = ifp->if_capabilities; - - /* - * Capabilities after here are not enabled by default. 
- */ - - if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { - ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; - - sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, - vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); - sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, - vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); - } - -#ifdef DEVICE_POLLING - ifp->if_capabilities |= IFCAP_POLLING; -#endif - error = virtio_setup_intr(dev, INTR_TYPE_NET); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); - ether_ifdetach(ifp); + /* BMV: This will crash if during boot! */ + ether_ifdetach(sc->vtnet_ifp); goto fail; } - /* - * Device defaults to promiscuous mode for backwards - * compatibility. Turn it off if possible. - */ - if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { - VTNET_LOCK(sc); - if (vtnet_set_promisc(sc, 0) != 0) { - ifp->if_flags |= IFF_PROMISC; - device_printf(dev, - "cannot disable promiscuous mode\n"); - } - VTNET_UNLOCK(sc); - } else - ifp->if_flags |= IFF_PROMISC; + vtnet_start_taskqueues(sc); fail: if (error) @@ -461,24 +419,19 @@ vtnet_detach(device_t dev) sc = device_get_softc(dev); ifp = sc->vtnet_ifp; - KASSERT(mtx_initialized(VTNET_MTX(sc)), - ("vtnet mutex not initialized")); - -#ifdef DEVICE_POLLING - if (ifp != NULL && ifp->if_capenable & IFCAP_POLLING) - ether_poll_deregister(ifp); -#endif - if (device_is_attached(dev)) { - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); vtnet_stop(sc); - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); callout_drain(&sc->vtnet_tick_ch); + vtnet_drain_taskqueues(sc); ether_ifdetach(ifp); } + vtnet_free_taskqueues(sc); + if (sc->vtnet_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); sc->vtnet_vlan_attach = NULL; @@ -488,25 +441,20 @@ vtnet_detach(device_t dev) sc->vtnet_vlan_detach = NULL; } - if (sc->vtnet_mac_filter != NULL) { - free(sc->vtnet_mac_filter, M_DEVBUF); - sc->vtnet_mac_filter = NULL; - } + ifmedia_removeall(&sc->vtnet_media); if (ifp != NULL) { 
if_free(ifp); sc->vtnet_ifp = NULL; } - if (sc->vtnet_rx_vq != NULL) - vtnet_free_rx_mbufs(sc); - if (sc->vtnet_tx_vq != NULL) - vtnet_free_tx_mbufs(sc); + vtnet_free_rxtx_queues(sc); + vtnet_free_rx_filters(sc); + if (sc->vtnet_ctrl_vq != NULL) vtnet_free_ctrl_vq(sc); - ifmedia_removeall(&sc->vtnet_media); - VTNET_LOCK_DESTROY(sc); + VTNET_CORE_LOCK_DESTROY(sc); return (0); } @@ -518,10 +466,10 @@ vtnet_suspend(device_t dev) sc = device_get_softc(dev); - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); vtnet_stop(sc); sc->vtnet_flags |= VTNET_FLAG_SUSPENDED; - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); return (0); } @@ -535,11 +483,11 @@ vtnet_resume(device_t dev) sc = device_get_softc(dev); ifp = sc->vtnet_ifp; - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) vtnet_init_locked(sc); sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); return (0); } @@ -556,15 +504,26 @@ vtnet_shutdown(device_t dev) } static int +vtnet_attach_completed(device_t dev) +{ + + vtnet_attach_disable_promisc(device_get_softc(dev)); + + return (0); +} + +static int vtnet_config_change(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); vtnet_update_link_status(sc); - VTNET_UNLOCK(sc); + if (sc->vtnet_link_active != 0) + vtnet_tx_start_all(sc); + VTNET_CORE_UNLOCK(sc); return (0); } @@ -578,188 +537,491 @@ vtnet_negotiate_features(struct vtnet_softc *sc) dev = sc->vtnet_dev; mask = 0; - if (vtnet_csum_disable) + /* + * TSO and LRO are only available when their corresponding checksum + * offload feature is also negotiated. 
+ */ + if (vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable)) { mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; + mask |= VTNET_TSO_FEATURES | VTNET_LRO_FEATURES; + } + if (vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable)) + mask |= VTNET_TSO_FEATURES; + if (vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable)) + mask |= VTNET_LRO_FEATURES; + if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable)) + mask |= VIRTIO_NET_F_MQ; +#ifdef VTNET_LEGACY_TX + mask |= VIRTIO_NET_F_MQ; +#endif + + features = VTNET_FEATURES & ~mask; + sc->vtnet_features = virtio_negotiate_features(dev, features); + + if (virtio_with_feature(dev, VTNET_LRO_FEATURES) == 0) + return; + if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) + return; /* - * TSO and LRO are only available when their corresponding - * checksum offload feature is also negotiated. + * LRO without mergeable buffers requires special care. This is not + * ideal because every receive buffer must be large enough to hold + * the maximum TCP packet, the Ethernet header, and the header. This + * requires up to 34 descriptors with MCLBYTES clusters. If we do + * not have indirect descriptors, LRO is disabled since the virtqueue + * will not contain very many receive buffers. 
*/ + if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0) { + device_printf(dev, + "LRO disabled due to both mergeable buffers and indirect " + "descriptors not negotiated\n"); - if (vtnet_csum_disable || vtnet_tso_disable) - mask |= VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | - VIRTIO_NET_F_HOST_ECN; + features &= ~VTNET_LRO_FEATURES; + sc->vtnet_features = virtio_negotiate_features(dev, features); + } else + sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; +} - if (vtnet_csum_disable || vtnet_lro_disable) - mask |= VTNET_LRO_FEATURES; +static void +vtnet_setup_features(struct vtnet_softc *sc) +{ + device_t dev; + int max_pairs, max; - features = VTNET_FEATURES & ~mask; -#ifdef VTNET_TX_INTR_MODERATION - features |= VIRTIO_F_NOTIFY_ON_EMPTY; -#endif - sc->vtnet_features = virtio_negotiate_features(dev, features); + dev = sc->vtnet_dev; + + vtnet_negotiate_features(sc); + + if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { + /* This feature should always be negotiated. */ + sc->vtnet_flags |= VTNET_FLAG_MAC; + } + + if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { + sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; + sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else + sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); + + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { + sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; - if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0 && - virtio_with_feature(dev, VTNET_LRO_FEATURES)) { + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) + sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) + sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR)) + sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC; + } + + if (virtio_with_feature(dev, VIRTIO_NET_F_MQ) && + sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { + max_pairs = virtio_read_dev_config_2(dev, + offsetof(struct virtio_net_config, max_virtqueue_pairs)); + if (max_pairs < 
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || + max_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) + max_pairs = 1; + } else + max_pairs = 1; + + if (max_pairs > 1) { /* - * LRO without mergeable buffers requires special care. This - * is not ideal because every receive buffer must be large - * enough to hold the maximum TCP packet, the Ethernet header, - * and the vtnet_rx_header. This requires up to 34 descriptors - * when using MCLBYTES clusters. If we do not have indirect - * descriptors, LRO is disabled since the virtqueue will not - * be able to contain very many receive buffers. + * Limit the maximum number of queue pairs to the number of + * CPUs or the configured maximum. The actual number of + * queues that get used may be less. */ - if (virtio_with_feature(dev, - VIRTIO_RING_F_INDIRECT_DESC) == 0) { - device_printf(dev, - "LRO disabled due to lack of both mergeable " - "buffers and indirect descriptors\n"); - - sc->vtnet_features = virtio_negotiate_features(dev, - features & ~VTNET_LRO_FEATURES); - } else - sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; + max = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs); + if (max > 0 && max_pairs > max) + max_pairs = max; + if (max_pairs > mp_ncpus) + max_pairs = mp_ncpus; + if (max_pairs > VTNET_MAX_QUEUE_PAIRS) + max_pairs = VTNET_MAX_QUEUE_PAIRS; + if (max_pairs > 1) + sc->vtnet_flags |= VTNET_FLAG_MULTIQ; } + + sc->vtnet_max_vq_pairs = max_pairs; } static int -vtnet_alloc_virtqueues(struct vtnet_softc *sc) +vtnet_init_rxq(struct vtnet_softc *sc, int id) { - device_t dev; - struct vq_alloc_info vq_info[3]; - int nvqs, rxsegs; + struct vtnet_rxq *rxq; - dev = sc->vtnet_dev; - nvqs = 2; + rxq = &sc->vtnet_rxqs[id]; - /* - * Indirect descriptors are not needed for the Rx - * virtqueue when mergeable buffers are negotiated. - * The header is placed inline with the data, not - * in a separate descriptor, and mbuf clusters are - * always physically contiguous. 
- */ - if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { - rxsegs = sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG ? - VTNET_MAX_RX_SEGS : VTNET_MIN_RX_SEGS; - } else - rxsegs = 0; + snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d", + device_get_nameunit(sc->vtnet_dev), id); + mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF); - VQ_ALLOC_INFO_INIT(&vq_info[0], rxsegs, - vtnet_rx_vq_intr, sc, &sc->vtnet_rx_vq, - "%s receive", device_get_nameunit(dev)); + rxq->vtnrx_sc = sc; + rxq->vtnrx_id = id; - VQ_ALLOC_INFO_INIT(&vq_info[1], VTNET_MAX_TX_SEGS, - vtnet_tx_vq_intr, sc, &sc->vtnet_tx_vq, - "%s transmit", device_get_nameunit(dev)); + TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq); + rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT, + taskqueue_thread_enqueue, &rxq->vtnrx_tq); - if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { - nvqs++; + return (rxq->vtnrx_tq == NULL ? ENOMEM : 0); +} + +static int +vtnet_init_txq(struct vtnet_softc *sc, int id) +{ + struct vtnet_txq *txq; + + txq = &sc->vtnet_txqs[id]; + + snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d", + device_get_nameunit(sc->vtnet_dev), id); + mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF); + + txq->vtntx_sc = sc; + txq->vtntx_id = id; + +#ifndef VTNET_LEGACY_TX + txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF, + M_NOWAIT, &txq->vtntx_mtx); + if (txq->vtntx_br == NULL) + return (ENOMEM); + + TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq); +#endif + TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq); + txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT, + taskqueue_thread_enqueue, &txq->vtntx_tq); + if (txq->vtntx_tq == NULL) + return (ENOMEM); + + return (0); +} - VQ_ALLOC_INFO_INIT(&vq_info[2], 0, NULL, NULL, - &sc->vtnet_ctrl_vq, "%s control", - device_get_nameunit(dev)); +static int +vtnet_alloc_rxtx_queues(struct vtnet_softc *sc) +{ + int i, npairs, error; + + npairs = sc->vtnet_max_vq_pairs; + + 
sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF, + M_NOWAIT | M_ZERO); + sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF, + M_NOWAIT | M_ZERO); + if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL) + return (ENOMEM); + + for (i = 0; i < npairs; i++) { + error = vtnet_init_rxq(sc, i); + if (error) + return (error); + error = vtnet_init_txq(sc, i); + if (error) + return (error); } - return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info)); + vtnet_setup_queue_sysctl(sc); + + return (0); } static void -vtnet_get_hwaddr(struct vtnet_softc *sc) +vtnet_destroy_rxq(struct vtnet_rxq *rxq) { - device_t dev; - dev = sc->vtnet_dev; + rxq->vtnrx_sc = NULL; + rxq->vtnrx_id = -1; - if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { - virtio_read_device_config(dev, - offsetof(struct virtio_net_config, mac), - sc->vtnet_hwaddr, ETHER_ADDR_LEN); - } else { - /* Generate random locally administered unicast address. */ - sc->vtnet_hwaddr[0] = 0xB2; - arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); + if (mtx_initialized(&rxq->vtnrx_mtx) != 0) + mtx_destroy(&rxq->vtnrx_mtx); +} - vtnet_set_hwaddr(sc); +static void +vtnet_destroy_txq(struct vtnet_txq *txq) +{ + + txq->vtntx_sc = NULL; + txq->vtntx_id = -1; + +#ifndef VTNET_LEGACY_TX + if (txq->vtntx_br != NULL) { + buf_ring_free(txq->vtntx_br, M_DEVBUF); + txq->vtntx_br = NULL; } +#endif + + if (mtx_initialized(&txq->vtntx_mtx) != 0) + mtx_destroy(&txq->vtntx_mtx); } static void -vtnet_set_hwaddr(struct vtnet_softc *sc) +vtnet_free_rxtx_queues(struct vtnet_softc *sc) { - device_t dev; + int i; - dev = sc->vtnet_dev; + if (sc->vtnet_rxqs != NULL) { + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) + vtnet_destroy_rxq(&sc->vtnet_rxqs[i]); + free(sc->vtnet_rxqs, M_DEVBUF); + sc->vtnet_rxqs = NULL; + } - virtio_write_device_config(dev, - offsetof(struct virtio_net_config, mac), - sc->vtnet_hwaddr, ETHER_ADDR_LEN); + if (sc->vtnet_txqs != NULL) { + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) + 
vtnet_destroy_txq(&sc->vtnet_txqs[i]); + free(sc->vtnet_txqs, M_DEVBUF); + sc->vtnet_txqs = NULL; + } } static int -vtnet_is_link_up(struct vtnet_softc *sc) +vtnet_alloc_rx_filters(struct vtnet_softc *sc) +{ + + if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { + sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->vtnet_mac_filter == NULL) + return (ENOMEM); + } + + if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { + sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) * + VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->vtnet_vlan_filter == NULL) + return (ENOMEM); + } + + return (0); +} + +static void +vtnet_free_rx_filters(struct vtnet_softc *sc) +{ + + if (sc->vtnet_mac_filter != NULL) { + free(sc->vtnet_mac_filter, M_DEVBUF); + sc->vtnet_mac_filter = NULL; + } + + if (sc->vtnet_vlan_filter != NULL) { + free(sc->vtnet_vlan_filter, M_DEVBUF); + sc->vtnet_vlan_filter = NULL; + } +} + +static int +vtnet_alloc_virtqueues(struct vtnet_softc *sc) { device_t dev; - struct ifnet *ifp; - uint16_t status; + struct vq_alloc_info *info; + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i, idx, flags, nvqs, rxsegs, error; dev = sc->vtnet_dev; - ifp = sc->vtnet_ifp; + flags = 0; + + /* + * Indirect descriptors are not needed for the Rx virtqueue when + * mergeable buffers are negotiated. The header is placed inline + * with the data, not in a separate descriptor, and mbuf clusters + * are always physically contiguous. 
+ */ + if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) + rxsegs = 0; + else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) + rxsegs = VTNET_MAX_RX_SEGS; + else + rxsegs = VTNET_MIN_RX_SEGS; - VTNET_LOCK_ASSERT(sc); + nvqs = sc->vtnet_max_vq_pairs * 2; + if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) + nvqs++; - if ((ifp->if_capenable & IFCAP_LINKSTATE) == 0) - return (1); + info = malloc(sizeof(struct vq_alloc_info) * nvqs , M_TEMP, M_NOWAIT); + if (info == NULL) + return (ENOMEM); - status = virtio_read_dev_config_2(dev, - offsetof(struct virtio_net_config, status)); + for (i = 0, idx = 0; i < sc->vtnet_max_vq_pairs; i++, idx+=2) { + rxq = &sc->vtnet_rxqs[i]; + VQ_ALLOC_INFO_INIT(&info[idx], rxsegs, + vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq, + "%s-%d rx", device_get_nameunit(dev), rxq->vtnrx_id); - return ((status & VIRTIO_NET_S_LINK_UP) != 0); + txq = &sc->vtnet_txqs[i]; + VQ_ALLOC_INFO_INIT(&info[idx+1], VTNET_MAX_TX_SEGS, + vtnet_tx_vq_intr, txq, &txq->vtntx_vq, + "%s-%d tx", device_get_nameunit(dev), txq->vtntx_id); + } + + if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { + VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL, + &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev)); + } + + /* + * Enable interrupt binding if this is multiqueue. This only matters + * when per-vq MSIX is available. 
+ */ + if (sc->vtnet_flags & VTNET_FLAG_MULTIQ) + flags |= 0; + + error = virtio_alloc_virtqueues(dev, flags, nvqs, info); + free(info, M_TEMP); + + return (error); } -static void -vtnet_update_link_status(struct vtnet_softc *sc) +static int +vtnet_setup_interface(struct vtnet_softc *sc) { + device_t dev; struct ifnet *ifp; - int link; + int limit; - ifp = sc->vtnet_ifp; + dev = sc->vtnet_dev; - link = vtnet_is_link_up(sc); + ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + device_printf(dev, "cannot allocate ifnet structure\n"); + return (ENOSPC); + } - if (link && ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) { - sc->vtnet_flags |= VTNET_FLAG_LINK; - if_link_state_change(ifp, LINK_STATE_UP); - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - vtnet_start_locked(ifp); - } else if (!link && (sc->vtnet_flags & VTNET_FLAG_LINK)) { - sc->vtnet_flags &= ~VTNET_FLAG_LINK; - if_link_state_change(ifp, LINK_STATE_DOWN); + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + if_initbaudrate(ifp, IF_Gbps(10)); /* Approx. */ + ifp->if_softc = sc; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = vtnet_init; + ifp->if_ioctl = vtnet_ioctl; + +#ifndef VTNET_LEGACY_TX + ifp->if_transmit = vtnet_txq_mq_start; + ifp->if_qflush = vtnet_qflush; +#else + struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq; + ifp->if_start = vtnet_start; + IFQ_SET_MAXLEN(&ifp->if_snd, virtqueue_size(vq) - 1); + ifp->if_snd.ifq_drv_maxlen = virtqueue_size(vq) - 1; + IFQ_SET_READY(&ifp->if_snd); +#endif + + ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, + vtnet_ifmedia_sts); + ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); + ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); + + /* Read (or generate) the MAC address for the adapter. 
*/ + vtnet_get_hwaddr(sc); + + ether_ifattach(ifp, sc->vtnet_hwaddr); + + if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) + ifp->if_capabilities |= IFCAP_LINKSTATE; + + /* Tell the upper layer(s) we support long frames. */ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; + + if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { + ifp->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; + + if (virtio_with_feature(dev, VIRTIO_NET_F_GSO)) { + ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; + sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; + } else { + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) + ifp->if_capabilities |= IFCAP_TSO4; + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) + ifp->if_capabilities |= IFCAP_TSO6; + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) + sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; + } + + if (ifp->if_capabilities & IFCAP_TSO) + ifp->if_capabilities |= IFCAP_VLAN_HWTSO; + } + + if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) + ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; + + if (ifp->if_capabilities & IFCAP_HWCSUM) { + /* + * VirtIO does not support VLAN tagging, but we can fake + * it by inserting and removing the 802.1Q header during + * transmit and receive. We are then able to do checksum + * offloading of VLAN frames. + */ + ifp->if_capabilities |= + IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; + } + + ifp->if_capenable = ifp->if_capabilities; + + /* + * Capabilities after here are not enabled by default. 
+ */ + + if (ifp->if_capabilities & IFCAP_RXCSUM) { + if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || + virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) + ifp->if_capabilities |= IFCAP_LRO; + } + + if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { + ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; + + sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); + sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); } + + limit = vtnet_tunable_int(sc, "rx_process_limit", + vtnet_rx_process_limit); + if (limit < 0) + limit = INT_MAX; + sc->vtnet_rx_process_limit = limit; + + return (0); } -static void -vtnet_watchdog(struct vtnet_softc *sc) +static int +vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) { struct ifnet *ifp; + int frame_size, clsize; ifp = sc->vtnet_ifp; -#ifdef VTNET_TX_INTR_MODERATION - vtnet_txeof(sc); -#endif + if (new_mtu < ETHERMIN || new_mtu > VTNET_MAX_MTU) + return (EINVAL); - if (sc->vtnet_watchdog_timer == 0 || --sc->vtnet_watchdog_timer) - return; + frame_size = sc->vtnet_hdr_size + sizeof(struct ether_vlan_header) + + new_mtu; - if_printf(ifp, "watchdog timeout -- resetting\n"); -#ifdef VTNET_DEBUG - virtqueue_dump(sc->vtnet_tx_vq); -#endif - ifp->if_oerrors++; - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - vtnet_init_locked(sc); + /* + * Based on the new MTU (and hence frame size) determine which + * cluster size is most appropriate for the receive queues. + */ + if (frame_size <= MCLBYTES) { + clsize = MCLBYTES; + } else if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + /* Avoid going past 9K jumbos. 
*/ + if (frame_size > MJUM9BYTES) + return (EINVAL); + clsize = MJUM9BYTES; + } else + clsize = MJUMPAGESIZE; + + ifp->if_mtu = new_mtu; + sc->vtnet_rx_new_clsize = clsize; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + vtnet_init_locked(sc); + } + + return (0); } static int @@ -771,22 +1033,19 @@ vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) sc = ifp->if_softc; ifr = (struct ifreq *) data; - reinit = 0; error = 0; switch (cmd) { case SIOCSIFMTU: - if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VTNET_MAX_MTU) - error = EINVAL; - else if (ifp->if_mtu != ifr->ifr_mtu) { - VTNET_LOCK(sc); + if (ifp->if_mtu != ifr->ifr_mtu) { + VTNET_CORE_LOCK(sc); error = vtnet_change_mtu(sc, ifr->ifr_mtu); - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); } break; case SIOCSIFFLAGS: - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); if ((ifp->if_flags & IFF_UP) == 0) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_stop(sc); @@ -803,16 +1062,17 @@ vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) if (error == 0) sc->vtnet_if_flags = ifp->if_flags; - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: - VTNET_LOCK(sc); - if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) + if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) + break; + VTNET_CORE_LOCK(sc); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_rx_filter_mac(sc); - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: @@ -821,68 +1081,36 @@ vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; case SIOCSIFCAP: + VTNET_CORE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; -#ifdef DEVICE_POLLING - if (mask & IFCAP_POLLING) { - if (ifr->ifr_reqcap & IFCAP_POLLING) { - error = ether_poll_register(vtnet_poll, ifp); - if (error) - break; - - VTNET_LOCK(sc); - vtnet_disable_rx_intr(sc); - vtnet_disable_tx_intr(sc); - ifp->if_capenable |= IFCAP_POLLING; - VTNET_UNLOCK(sc); - } else { - error 
= ether_poll_deregister(ifp); - - /* Enable interrupts even in error case. */ - VTNET_LOCK(sc); - vtnet_enable_tx_intr(sc); - vtnet_enable_rx_intr(sc); - ifp->if_capenable &= ~IFCAP_POLLING; - VTNET_UNLOCK(sc); - } - } -#endif - VTNET_LOCK(sc); - - if (mask & IFCAP_TXCSUM) { + if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; - if (ifp->if_capenable & IFCAP_TXCSUM) - ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; - else - ifp->if_hwassist &= ~VTNET_CSUM_OFFLOAD; - } - - if (mask & IFCAP_TSO4) { + if (mask & IFCAP_TXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; + if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; - if (ifp->if_capenable & IFCAP_TSO4) - ifp->if_hwassist |= CSUM_TSO; - else - ifp->if_hwassist &= ~CSUM_TSO; - } - - if (mask & IFCAP_RXCSUM) { - ifp->if_capenable ^= IFCAP_RXCSUM; - reinit = 1; - } + if (mask & IFCAP_TSO6) + ifp->if_capenable ^= IFCAP_TSO6; - if (mask & IFCAP_LRO) { - ifp->if_capenable ^= IFCAP_LRO; + if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | + IFCAP_VLAN_HWFILTER)) { + /* These Rx features require us to renegotiate. 
*/ reinit = 1; - } - if (mask & IFCAP_VLAN_HWFILTER) { - ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; - reinit = 1; - } + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= IFCAP_RXCSUM; + if (mask & IFCAP_RXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + if (mask & IFCAP_LRO) + ifp->if_capenable ^= IFCAP_LRO; + if (mask & IFCAP_VLAN_HWFILTER) + ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; + } else + reinit = 0; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; - if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; @@ -890,9 +1118,10 @@ vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } + + VTNET_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); - VTNET_UNLOCK(sc); break; default: @@ -900,80 +1129,32 @@ vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } - VTNET_LOCK_ASSERT_NOTOWNED(sc); + VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } static int -vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) -{ - struct ifnet *ifp; - int new_frame_size, clsize; - - ifp = sc->vtnet_ifp; - - if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { - new_frame_size = sizeof(struct vtnet_rx_header) + - sizeof(struct ether_vlan_header) + new_mtu; - - if (new_frame_size > MJUM9BYTES) - return (EINVAL); - - if (new_frame_size <= MCLBYTES) - clsize = MCLBYTES; - else - clsize = MJUM9BYTES; - } else { - new_frame_size = sizeof(struct virtio_net_hdr_mrg_rxbuf) + - sizeof(struct ether_vlan_header) + new_mtu; - - if (new_frame_size <= MCLBYTES) - clsize = MCLBYTES; - else - clsize = MJUMPAGESIZE; - } - - sc->vtnet_rx_mbuf_size = clsize; - sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc); - KASSERT(sc->vtnet_rx_mbuf_count < VTNET_MAX_RX_SEGS, - ("too many rx mbufs: %d", sc->vtnet_rx_mbuf_count)); - - ifp->if_mtu = new_mtu; - - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - vtnet_init_locked(sc); - } - - return (0); -} - -static 
int -vtnet_init_rx_vq(struct vtnet_softc *sc) +vtnet_rxq_populate(struct vtnet_rxq *rxq) { struct virtqueue *vq; int nbufs, error; - vq = sc->vtnet_rx_vq; - nbufs = 0; + vq = rxq->vtnrx_vq; error = ENOSPC; - while (!virtqueue_full(vq)) { - if ((error = vtnet_newbuf(sc)) != 0) + for (nbufs = 0; !virtqueue_full(vq); nbufs++) { + error = vtnet_rxq_new_buf(rxq); + if (error) break; - nbufs++; } if (nbufs > 0) { virtqueue_notify(vq); - /* * EMSGSIZE signifies the virtqueue did not have enough * entries available to hold the last mbuf. This is not - * an error. We should not get ENOSPC since we check if - * the virtqueue is full before attempting to add a - * buffer. + * an error. */ if (error == EMSGSIZE) error = 0; @@ -983,86 +1164,32 @@ vtnet_init_rx_vq(struct vtnet_softc *sc) } static void -vtnet_free_rx_mbufs(struct vtnet_softc *sc) +vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq) { struct virtqueue *vq; struct mbuf *m; int last; - vq = sc->vtnet_rx_vq; + vq = rxq->vtnrx_vq; last = 0; while ((m = virtqueue_drain(vq, &last)) != NULL) m_freem(m); - KASSERT(virtqueue_empty(vq), ("mbufs remaining in Rx Vq")); -} - -static void -vtnet_free_tx_mbufs(struct vtnet_softc *sc) -{ - struct virtqueue *vq; - struct vtnet_tx_header *txhdr; - int last; - - vq = sc->vtnet_tx_vq; - last = 0; - - while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { - m_freem(txhdr->vth_mbuf); - uma_zfree(vtnet_tx_header_zone, txhdr); - } - - KASSERT(virtqueue_empty(vq), ("mbufs remaining in Tx Vq")); -} - -static void -vtnet_free_ctrl_vq(struct vtnet_softc *sc) -{ - - /* - * The control virtqueue is only polled, therefore - * it should already be empty. 
- */ - KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq), - ("Ctrl Vq not empty")); -} - -#ifdef DEVICE_POLLING -static int -vtnet_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) -{ - struct vtnet_softc *sc; - int rx_done; - - sc = ifp->if_softc; - rx_done = 0; - - VTNET_LOCK(sc); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - if (cmd == POLL_AND_CHECK_STATUS) - vtnet_update_link_status(sc); - - if (virtqueue_nused(sc->vtnet_rx_vq) > 0) - vtnet_rxeof(sc, count, &rx_done); - - vtnet_txeof(sc); - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - vtnet_start_locked(ifp); - } - VTNET_UNLOCK(sc); - - return (rx_done); + KASSERT(virtqueue_empty(vq), + ("%s: mbufs remaining in rx queue %p", __func__, rxq)); } -#endif /* DEVICE_POLLING */ static struct mbuf * -vtnet_alloc_rxbuf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) +vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) { struct mbuf *m_head, *m_tail, *m; int i, clsize; - clsize = sc->vtnet_rx_mbuf_size; + clsize = sc->vtnet_rx_clsize; + + KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, + ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs)); m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize); if (m_head == NULL) @@ -1071,19 +1198,15 @@ vtnet_alloc_rxbuf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) m_head->m_len = clsize; m_tail = m_head; - if (nbufs > 1) { - KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, - ("chained Rx mbuf requested without LRO_NOMRG")); - - for (i = 1; i < nbufs; i++) { - m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize); - if (m == NULL) - goto fail; + /* Allocate the rest of the chain. 
*/ + for (i = 1; i < nbufs; i++) { + m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize); + if (m == NULL) + goto fail; - m->m_len = clsize; - m_tail->m_next = m; - m_tail = m; - } + m->m_len = clsize; + m_tail->m_next = m; + m_tail = m; } if (m_tailp != NULL) @@ -1098,43 +1221,48 @@ fail: return (NULL); } +/* + * Slow path for when LRO without mergeable buffers is negotiated. + */ static int -vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0) +vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *rxq, struct mbuf *m0, + int len0) { + struct vtnet_softc *sc; struct mbuf *m, *m_prev; struct mbuf *m_new, *m_tail; int len, clsize, nreplace, error; - m = m0; - m_prev = NULL; - len = len0; + sc = rxq->vtnrx_sc; + clsize = sc->vtnet_rx_clsize; + m_prev = NULL; m_tail = NULL; - clsize = sc->vtnet_rx_mbuf_size; nreplace = 0; - KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || - m->m_next == NULL, ("chained Rx mbuf without LRO_NOMRG")); + m = m0; + len = len0; /* - * Since LRO_NOMRG mbuf chains are so large, we want to avoid - * allocating an entire chain for each received frame. When - * the received frame's length is less than that of the chain, - * the unused mbufs are reassigned to the new chain. + * Since these mbuf chains are so large, we avoid allocating an + * entire replacement chain if possible. When the received frame + * did not consume the entire chain, the unused mbufs are moved + * to the replacement chain. */ while (len > 0) { /* - * Something is seriously wrong if we received - * a frame larger than the mbuf chain. Drop it. + * Something is seriously wrong if we received a frame + * larger than the chain. Drop it. */ if (m == NULL) { sc->vtnet_stats.rx_frame_too_large++; return (EMSGSIZE); } + /* We always allocate the same cluster size. 
*/ KASSERT(m->m_len == clsize, - ("mbuf length not expected cluster size: %d", - m->m_len)); + ("%s: mbuf size %d is not the cluster size %d", + __func__, m->m_len, clsize)); m->m_len = MIN(m->m_len, len); len -= m->m_len; @@ -1144,27 +1272,26 @@ vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0) nreplace++; } - KASSERT(m_prev != NULL, ("m_prev == NULL")); - KASSERT(nreplace <= sc->vtnet_rx_mbuf_count, - ("too many replacement mbufs: %d/%d", nreplace, - sc->vtnet_rx_mbuf_count)); + KASSERT(nreplace <= sc->vtnet_rx_nmbufs, + ("%s: too many replacement mbufs %d max %d", __func__, nreplace, + sc->vtnet_rx_nmbufs)); - m_new = vtnet_alloc_rxbuf(sc, nreplace, &m_tail); + m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail); if (m_new == NULL) { m_prev->m_len = clsize; return (ENOBUFS); } /* - * Move unused mbufs, if any, from the original chain - * onto the end of the new chain. + * Move any unused mbufs from the received chain onto the end + * of the new chain. */ if (m_prev->m_next != NULL) { m_tail->m_next = m_prev->m_next; m_prev->m_next = NULL; } - error = vtnet_enqueue_rxbuf(sc, m_new); + error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * BAD! We could not enqueue the replacement mbuf chain. 
We @@ -1189,343 +1316,321 @@ vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0) } static int -vtnet_newbuf(struct vtnet_softc *sc) +vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len) { - struct mbuf *m; + struct vtnet_softc *sc; + struct mbuf *m_new; int error; - m = vtnet_alloc_rxbuf(sc, sc->vtnet_rx_mbuf_count, NULL); - if (m == NULL) - return (ENOBUFS); + sc = rxq->vtnrx_sc; - error = vtnet_enqueue_rxbuf(sc, m); - if (error) - m_freem(m); + KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, + ("%s: chained mbuf without LRO_NOMRG", __func__)); - return (error); -} - -static void -vtnet_discard_merged_rxbuf(struct vtnet_softc *sc, int nbufs) -{ - struct virtqueue *vq; - struct mbuf *m; - - vq = sc->vtnet_rx_vq; + if (m->m_next == NULL) { + /* Fast-path for the common case of just one mbuf. */ + if (m->m_len < len) + return (EINVAL); - while (--nbufs > 0) { - if ((m = virtqueue_dequeue(vq, NULL)) == NULL) - break; - vtnet_discard_rxbuf(sc, m); - } -} + m_new = vtnet_rx_alloc_buf(sc, 1, NULL); + if (m_new == NULL) + return (ENOBUFS); -static void -vtnet_discard_rxbuf(struct vtnet_softc *sc, struct mbuf *m) -{ - int error; + error = vtnet_rxq_enqueue_buf(rxq, m_new); + if (error) { + /* + * The new mbuf is suppose to be an identical + * copy of the one just dequeued so this is an + * unexpected error. + */ + m_freem(m_new); + sc->vtnet_stats.rx_enq_replacement_failed++; + } else + m->m_len = len; + } else + error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len); - /* - * Requeue the discarded mbuf. This should always be - * successful since it was just dequeued. 
- */ - error = vtnet_enqueue_rxbuf(sc, m); - KASSERT(error == 0, ("cannot requeue discarded mbuf")); + return (error); } static int -vtnet_enqueue_rxbuf(struct vtnet_softc *sc, struct mbuf *m) +vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m) { struct sglist sg; struct sglist_seg segs[VTNET_MAX_RX_SEGS]; + struct vtnet_softc *sc; struct vtnet_rx_header *rxhdr; - struct virtio_net_hdr *hdr; uint8_t *mdata; int offset, error; - VTNET_LOCK_ASSERT(sc); - KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || - m->m_next == NULL, ("chained Rx mbuf without LRO_NOMRG")); - - sglist_init(&sg, VTNET_MAX_RX_SEGS, segs); - + sc = rxq->vtnrx_sc; mdata = mtod(m, uint8_t *); - offset = 0; + VTNET_RXQ_LOCK_ASSERT(rxq); + KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, + ("%s: chained mbuf without LRO_NOMRG", __func__)); + KASSERT(m->m_len == sc->vtnet_rx_clsize, + ("%s: unexpected cluster size %d/%d", __func__, m->m_len, + sc->vtnet_rx_clsize)); + + sglist_init(&sg, VTNET_MAX_RX_SEGS, segs); if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr)); rxhdr = (struct vtnet_rx_header *) mdata; - hdr = &rxhdr->vrh_hdr; - offset += sizeof(struct vtnet_rx_header); - - error = sglist_append(&sg, hdr, sc->vtnet_hdr_size); - KASSERT(error == 0, ("cannot add header to sglist")); - } - - error = sglist_append(&sg, mdata + offset, m->m_len - offset); - if (error) - return (error); + sglist_append(&sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size); + offset = sizeof(struct vtnet_rx_header); + } else + offset = 0; + sglist_append(&sg, mdata + offset, m->m_len - offset); if (m->m_next != NULL) { error = sglist_append_mbuf(&sg, m->m_next); - if (error) - return (error); + MPASS(error == 0); } - return (virtqueue_enqueue(sc->vtnet_rx_vq, m, &sg, 0, sg.sg_nseg)); + error = virtqueue_enqueue(rxq->vtnrx_vq, m, &sg, 0, sg.sg_nseg); + + return (error); } -static void -vtnet_vlan_tag_remove(struct mbuf *m) +static int 
+vtnet_rxq_new_buf(struct vtnet_rxq *rxq) { - struct ether_vlan_header *evl; + struct vtnet_softc *sc; + struct mbuf *m; + int error; - evl = mtod(m, struct ether_vlan_header *); + sc = rxq->vtnrx_sc; - m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); - m->m_flags |= M_VLANTAG; + m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL); + if (m == NULL) + return (ENOBUFS); - /* Strip the 802.1Q header. */ - bcopy((char *) evl, (char *) evl + ETHER_VLAN_ENCAP_LEN, - ETHER_HDR_LEN - ETHER_TYPE_LEN); - m_adj(m, ETHER_VLAN_ENCAP_LEN); + error = vtnet_rxq_enqueue_buf(rxq, m); + if (error) + m_freem(m); + + return (error); } -#ifdef notyet +/* + * Use the checksum offset in the VirtIO header to set the + * correct CSUM_* flags. + */ static int -vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m, - struct virtio_net_hdr *hdr) +vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m, + uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { - struct ether_header *eh; - struct ether_vlan_header *evh; - struct ip *ip; - struct ip6_hdr *ip6; - struct udphdr *udp; - int ip_offset, csum_start, csum_offset, hlen; - uint16_t eth_type; - uint8_t ip_proto; - - /* - * Convert the VirtIO checksum interface to FreeBSD's interface. - * The host only provides us with the offset at which to start - * checksumming, and the offset from that to place the completed - * checksum. While this maps well with how Linux does checksums, - * for FreeBSD, we must parse the received packet in order to set - * the appropriate CSUM_* flags. - */ - - /* - * Every mbuf added to the receive virtqueue is always at least - * MCLBYTES big, so assume something is amiss if the first mbuf - * does not contain both the Ethernet and protocol headers. 
- */ - ip_offset = sizeof(struct ether_header); - if (m->m_len < ip_offset) - return (1); + struct vtnet_softc *sc; +#if defined(INET) || defined(INET6) + int offset = hdr->csum_start + hdr->csum_offset; +#endif - eh = mtod(m, struct ether_header *); - eth_type = ntohs(eh->ether_type); - if (eth_type == ETHERTYPE_VLAN) { - ip_offset = sizeof(struct ether_vlan_header); - if (m->m_len < ip_offset) - return (1); - evh = mtod(m, struct ether_vlan_header *); - eth_type = ntohs(evh->evl_proto); - } + sc = rxq->vtnrx_sc; + /* Only do a basic sanity check on the offset. */ switch (eth_type) { +#if defined(INET) case ETHERTYPE_IP: - if (m->m_len < ip_offset + sizeof(struct ip)) - return (1); - - ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset); - /* Sanity check the IP header. */ - if (ip->ip_v != IPVERSION) - return (1); - hlen = ip->ip_hl << 2; - if (hlen < sizeof(struct ip)) - return (1); - if (ntohs(ip->ip_len) < hlen) + if (__predict_false(offset < ip_start + sizeof(struct ip))) return (1); - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) - return (1); - - ip_proto = ip->ip_p; - csum_start = ip_offset + hlen; break; - +#endif +#if defined(INET6) case ETHERTYPE_IPV6: - if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) + if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) return (1); - - /* - * XXX FreeBSD does not handle any IPv6 checksum offloading - * at the moment. - */ - - ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset); - /* XXX Assume no extension headers are present. */ - ip_proto = ip6->ip6_nxt; - csum_start = ip_offset + sizeof(struct ip6_hdr); break; - +#endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } - /* Assume checksum begins right after the IP header. 
*/ - if (hdr->csum_start != csum_start) { - sc->vtnet_stats.rx_csum_bad_start++; - return (1); - } - - switch (ip_proto) { - case IPPROTO_TCP: - csum_offset = offsetof(struct tcphdr, th_sum); - break; - - case IPPROTO_UDP: - csum_offset = offsetof(struct udphdr, uh_sum); + /* + * Use the offset to determine the appropriate CSUM_* flags. This is + * a bit dirty, but we can get by with it since the checksum offsets + * happen to be different. We assume the host host does not do IPv4 + * header checksum offloading. + */ + switch (hdr->csum_offset) { + case offsetof(struct udphdr, uh_sum): + case offsetof(struct tcphdr, th_sum): + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; break; - - case IPPROTO_SCTP: - csum_offset = offsetof(struct sctphdr, checksum); + case offsetof(struct sctphdr, checksum): + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; - default: - sc->vtnet_stats.rx_csum_bad_ipproto++; - return (1); - } - - if (hdr->csum_offset != csum_offset) { sc->vtnet_stats.rx_csum_bad_offset++; return (1); } - /* - * The IP header checksum is almost certainly valid but I'm - * uncertain if that is guaranteed. 
- * - * m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; - */ + return (0); +} - switch (ip_proto) { - case IPPROTO_UDP: - if (m->m_len < csum_start + sizeof(struct udphdr)) - return (1); +static int +vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m, + uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) +{ + struct vtnet_softc *sc; + int offset, proto; - udp = (struct udphdr *)(mtod(m, uint8_t *) + csum_start); - if (udp->uh_sum == 0) - return (0); + sc = rxq->vtnrx_sc; - /* FALLTHROUGH */ + switch (eth_type) { +#if defined(INET) + case ETHERTYPE_IP: { + struct ip *ip; + if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) + return (1); + ip = (struct ip *)(m->m_data + ip_start); + proto = ip->ip_p; + offset = ip_start + (ip->ip_hl << 2); + break; + } +#endif +#if defined(INET6) + case ETHERTYPE_IPV6: + if (__predict_false(m->m_len < ip_start + + sizeof(struct ip6_hdr))) + return (1); + offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); + if (__predict_false(offset < 0)) + return (1); + break; +#endif + default: + sc->vtnet_stats.rx_csum_bad_ethtype++; + return (1); + } + switch (proto) { case IPPROTO_TCP: + if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) + return (1); + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + case IPPROTO_UDP: + if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) + return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; - case IPPROTO_SCTP: + if (__predict_false(m->m_len < offset + sizeof(struct sctphdr))) + return (1); m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; + default: + /* + * For the remaining protocols, FreeBSD does not support + * checksum offloading, so the checksum will be recomputed. 
+ */ +#if 0 + if_printf(sc->vtnet_ifp, "cksum offload of unsupported " + "protocol eth_type=%#x proto=%d csum_start=%d + "csum_offset=%d\n", __func__, eth_type, proto, + hdr->csum_start, hdr->csum_offset); +#endif + break; } - sc->vtnet_stats.rx_csum_offloaded++; - return (0); } -#endif /* - * Alternative method of doing receive checksum offloading. Rather - * than parsing the received frame down to the IP header, use the - * csum_offset to determine which CSUM_* flags are appropriate. We - * can get by with doing this only because the checksum offsets are - * unique for the things we care about. + * Set the appropriate CSUM_* flags. Unfortunately, the information + * provided is not directly useful to us. The VirtIO header gives the + * offset of the checksum, which is all Linux needs, but this is not + * how FreeBSD does things. We are forced to peek inside the packet + * a bit. + * + * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD + * could accept the offsets and let the stack figure it out. */ static int -vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m, +vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct ether_header *eh; struct ether_vlan_header *evh; - struct udphdr *udp; - int csum_len; uint16_t eth_type; - - csum_len = hdr->csum_start + hdr->csum_offset; - - if (csum_len < sizeof(struct ether_header) + sizeof(struct ip)) - return (1); - if (m->m_len < csum_len) - return (1); + int offset, error; eh = mtod(m, struct ether_header *); eth_type = ntohs(eh->ether_type); if (eth_type == ETHERTYPE_VLAN) { + /* BMV: We should handle nested VLAN tags too. */ evh = mtod(m, struct ether_vlan_header *); eth_type = ntohs(evh->evl_proto); - } - - if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) { - sc->vtnet_stats.rx_csum_bad_ethtype++; - return (1); - } - - /* Use the offset to determine the appropriate CSUM_* flags. 
*/ - switch (hdr->csum_offset) { - case offsetof(struct udphdr, uh_sum): - if (m->m_len < hdr->csum_start + sizeof(struct udphdr)) - return (1); - udp = (struct udphdr *)(mtod(m, uint8_t *) + hdr->csum_start); - if (udp->uh_sum == 0) - return (0); + offset = sizeof(struct ether_vlan_header); + } else + offset = sizeof(struct ether_header); - /* FALLTHROUGH */ + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr); + else + error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr); - case offsetof(struct tcphdr, th_sum): - m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xFFFF; - break; + return (error); +} - case offsetof(struct sctphdr, checksum): - m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; - break; +static void +vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs) +{ + struct mbuf *m; - default: - sc->vtnet_stats.rx_csum_bad_offset++; - return (1); + while (--nbufs > 0) { + m = virtqueue_dequeue(rxq->vtnrx_vq, NULL); + if (m == NULL) + break; + vtnet_rxq_discard_buf(rxq, m); } +} - sc->vtnet_stats.rx_csum_offloaded++; +static void +vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m) +{ + int error; - return (0); + /* + * Requeue the discarded mbuf. This should always be successful + * since it was just dequeued. 
+ */ + error = vtnet_rxq_enqueue_buf(rxq, m); + KASSERT(error == 0, + ("%s: cannot requeue discarded mbuf %d", __func__, error)); } static int -vtnet_rxeof_merged(struct vtnet_softc *sc, struct mbuf *m_head, int nbufs) +vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs) { + struct vtnet_softc *sc; struct ifnet *ifp; struct virtqueue *vq; struct mbuf *m, *m_tail; int len; + sc = rxq->vtnrx_sc; + vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; - vq = sc->vtnet_rx_vq; m_tail = m_head; while (--nbufs > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) { - ifp->if_ierrors++; + rxq->vtnrx_stats.vrxs_ierrors++; goto fail; } - if (vtnet_newbuf(sc) != 0) { - ifp->if_iqdrops++; - vtnet_discard_rxbuf(sc, m); + if (vtnet_rxq_new_buf(rxq) != 0) { + rxq->vtnrx_stats.vrxs_iqdrops++; + vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) - vtnet_discard_merged_rxbuf(sc, nbufs); + vtnet_rxq_discard_merged_bufs(rxq, nbufs); goto fail; } @@ -1549,35 +1654,83 @@ fail: return (1); } +static void +vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m, + struct virtio_net_hdr *hdr) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + struct ether_header *eh; + + sc = rxq->vtnrx_sc; + ifp = sc->vtnet_ifp; + + if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { + eh = mtod(m, struct ether_header *); + if (eh->ether_type == htons(ETHERTYPE_VLAN)) { + vtnet_vlan_tag_remove(m); + /* + * With the 802.1Q header removed, update the + * checksum starting location accordingly. + */ + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + hdr->csum_start -= ETHER_VLAN_ENCAP_LEN; + } + } + + m->m_pkthdr.flowid = rxq->vtnrx_id; + m->m_flags |= M_FLOWID; + + /* + * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum + * distinction that Linux does. Need to reevaluate if performing + * offloading for the NEEDS_CSUM case is really appropriate. 
+ */ + if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | + VIRTIO_NET_HDR_F_DATA_VALID)) { + if (vtnet_rxq_csum(rxq, m, hdr) == 0) + rxq->vtnrx_stats.vrxs_csum++; + else + rxq->vtnrx_stats.vrxs_csum_failed++; + } + + rxq->vtnrx_stats.vrxs_ipackets++; + rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len; + + /* VTNET_RXQ_UNLOCK(rxq); */ + (*ifp->if_input)(ifp, m); + /* VTNET_RXQ_LOCK(rxq); */ +} + static int -vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp) +vtnet_rxq_eof(struct vtnet_rxq *rxq) { - struct virtio_net_hdr lhdr; + struct virtio_net_hdr lhdr, *hdr; + struct vtnet_softc *sc; struct ifnet *ifp; struct virtqueue *vq; struct mbuf *m; - struct ether_header *eh; - struct virtio_net_hdr *hdr; struct virtio_net_hdr_mrg_rxbuf *mhdr; - int len, deq, nbufs, adjsz, rx_npkts; + int len, deq, nbufs, adjsz, count; + sc = rxq->vtnrx_sc; + vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; - vq = sc->vtnet_rx_vq; hdr = &lhdr; deq = 0; - rx_npkts = 0; + count = sc->vtnet_rx_process_limit; - VTNET_LOCK_ASSERT(sc); + VTNET_RXQ_LOCK_ASSERT(rxq); - while (--count >= 0) { + while (count-- > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) break; deq++; if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { - ifp->if_ierrors++; - vtnet_discard_rxbuf(sc, m); + rxq->vtnrx_stats.vrxs_ierrors++; + vtnet_rxq_discard_buf(rxq, m); continue; } @@ -1585,8 +1738,8 @@ vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp) nbufs = 1; adjsz = sizeof(struct vtnet_rx_header); /* - * Account for our pad between the header and - * the actual start of the frame. + * Account for our pad inserted between the header + * and the actual start of the frame. 
*/ len += VTNET_RX_HEADER_PAD; } else { @@ -1595,11 +1748,11 @@ vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp) adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); } - if (vtnet_replace_rxbuf(sc, m, len) != 0) { - ifp->if_iqdrops++; - vtnet_discard_rxbuf(sc, m); + if (vtnet_rxq_replace_buf(rxq, m, len) != 0) { + rxq->vtnrx_stats.vrxs_iqdrops++; + vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) - vtnet_discard_merged_rxbuf(sc, nbufs); + vtnet_rxq_discard_merged_bufs(rxq, nbufs); continue; } @@ -1608,263 +1761,297 @@ vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp) m->m_pkthdr.csum_flags = 0; if (nbufs > 1) { - if (vtnet_rxeof_merged(sc, m, nbufs) != 0) + /* Dequeue the rest of chain. */ + if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0) continue; } - ifp->if_ipackets++; - /* * Save copy of header before we strip it. For both mergeable - * and non-mergeable, the VirtIO header is placed first in the - * mbuf's data. We no longer need num_buffers, so always use a - * virtio_net_hdr. + * and non-mergeable, the header is at the beginning of the + * mbuf data. We no longer need num_buffers, so always use a + * regular header. + * + * BMV: Is this memcpy() expensive? We know the mbuf data is + * still valid even after the m_adj(). */ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr)); m_adj(m, adjsz); - if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { - eh = mtod(m, struct ether_header *); - if (eh->ether_type == htons(ETHERTYPE_VLAN)) { - vtnet_vlan_tag_remove(m); - - /* - * With the 802.1Q header removed, update the - * checksum starting location accordingly. 
- */ - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) - hdr->csum_start -= - ETHER_VLAN_ENCAP_LEN; - } - } - - if (ifp->if_capenable & IFCAP_RXCSUM && - hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (vtnet_rx_csum(sc, m, hdr) != 0) - sc->vtnet_stats.rx_csum_failed++; - } - - VTNET_UNLOCK(sc); - rx_npkts++; - (*ifp->if_input)(ifp, m); - VTNET_LOCK(sc); - - /* - * The interface may have been stopped while we were - * passing the packet up the network stack. - */ - if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) - break; + vtnet_rxq_input(rxq, m, hdr); } if (deq > 0) virtqueue_notify(vq); - if (rx_npktsp != NULL) - *rx_npktsp = rx_npkts; - return (count > 0 ? 0 : EAGAIN); } static void -vtnet_rx_vq_intr(void *xsc) +vtnet_rx_vq_intr(void *xrxq) { struct vtnet_softc *sc; + struct vtnet_rxq *rxq; struct ifnet *ifp; - int more; + int tries, more; - sc = xsc; + rxq = xrxq; + sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; + tries = 0; + + if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) { + /* + * Ignore this interrupt. Either this is a spurious interrupt + * or multiqueue without per-VQ MSIX so every queue needs to + * be polled (a brain dead configuration we could try harder + * to avoid). + */ + vtnet_rxq_disable_intr(rxq); + return; + } again: - VTNET_LOCK(sc); + VTNET_RXQ_LOCK(rxq); -#ifdef DEVICE_POLLING - if (ifp->if_capenable & IFCAP_POLLING) { - VTNET_UNLOCK(sc); + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + VTNET_RXQ_UNLOCK(rxq); return; } -#endif + + more = vtnet_rxq_eof(rxq); + if (more || vtnet_rxq_enable_intr(rxq) != 0) { + if (!more) + vtnet_rxq_disable_intr(rxq); + /* + * This is an occasional condition or race (when !more), + * so retry a few times before scheduling the taskqueue. 
+ */ + rxq->vtnrx_stats.vrxs_rescheduled++; + VTNET_RXQ_UNLOCK(rxq); + if (tries++ < VTNET_INTR_DISABLE_RETRIES) + goto again; + taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); + } else + VTNET_RXQ_UNLOCK(rxq); +} + +static void +vtnet_rxq_tq_intr(void *xrxq, int pending) +{ + struct vtnet_softc *sc; + struct vtnet_rxq *rxq; + struct ifnet *ifp; + int more; + + rxq = xrxq; + sc = rxq->vtnrx_sc; + ifp = sc->vtnet_ifp; + + VTNET_RXQ_LOCK(rxq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { - vtnet_enable_rx_intr(sc); - VTNET_UNLOCK(sc); + VTNET_RXQ_UNLOCK(rxq); return; } - more = vtnet_rxeof(sc, sc->vtnet_rx_process_limit, NULL); - if (more || vtnet_enable_rx_intr(sc) != 0) { + more = vtnet_rxq_eof(rxq); + if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) - vtnet_disable_rx_intr(sc); - sc->vtnet_stats.rx_task_rescheduled++; - VTNET_UNLOCK(sc); - goto again; + vtnet_rxq_disable_intr(rxq); + rxq->vtnrx_stats.vrxs_rescheduled++; + taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } - VTNET_UNLOCK(sc); + VTNET_RXQ_UNLOCK(rxq); } static void -vtnet_txeof(struct vtnet_softc *sc) +vtnet_txq_free_mbufs(struct vtnet_txq *txq) { struct virtqueue *vq; - struct ifnet *ifp; struct vtnet_tx_header *txhdr; - int deq; - - vq = sc->vtnet_tx_vq; - ifp = sc->vtnet_ifp; - deq = 0; + int last; - VTNET_LOCK_ASSERT(sc); + vq = txq->vtntx_vq; + last = 0; - while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { - deq++; - ifp->if_opackets++; + while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { m_freem(txhdr->vth_mbuf); uma_zfree(vtnet_tx_header_zone, txhdr); } - if (deq > 0) { - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - if (virtqueue_empty(vq)) - sc->vtnet_watchdog_timer = 0; - } + KASSERT(virtqueue_empty(vq), + ("%s: mbufs remaining in tx queue %p", __func__, txq)); } -static struct mbuf * -vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m, - struct virtio_net_hdr *hdr) +/* + * BMV: Much of this can go away once we finally have offsets in + * the mbuf 
packet header. Bug andre@. + */ +static int +vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, + int *etype, int *proto, int *start) { - struct ifnet *ifp; - struct ether_header *eh; + struct vtnet_softc *sc; struct ether_vlan_header *evh; - struct ip *ip; - struct ip6_hdr *ip6; - struct tcphdr *tcp; - int ip_offset; - uint16_t eth_type, csum_start; - uint8_t ip_proto, gso_type; + int offset; - ifp = sc->vtnet_ifp; + sc = txq->vtntx_sc; - ip_offset = sizeof(struct ether_header); - if (m->m_len < ip_offset) { - if ((m = m_pullup(m, ip_offset)) == NULL) - return (NULL); + evh = mtod(m, struct ether_vlan_header *); + if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + /* BMV: We should handle nested VLAN tags too. */ + *etype = ntohs(evh->evl_proto); + offset = sizeof(struct ether_vlan_header); + } else { + *etype = ntohs(evh->evl_encap_proto); + offset = sizeof(struct ether_header); + } + + switch (*etype) { +#if defined(INET) + case ETHERTYPE_IP: { + struct ip *ip, iphdr; + if (__predict_false(m->m_len < offset + sizeof(struct ip))) { + m_copydata(m, offset, sizeof(struct ip), + (caddr_t) &iphdr); + ip = &iphdr; + } else + ip = (struct ip *)(m->m_data + offset); + *proto = ip->ip_p; + *start = offset + (ip->ip_hl << 2); + break; } - - eh = mtod(m, struct ether_header *); - eth_type = ntohs(eh->ether_type); - if (eth_type == ETHERTYPE_VLAN) { - ip_offset = sizeof(struct ether_vlan_header); - if (m->m_len < ip_offset) { - if ((m = m_pullup(m, ip_offset)) == NULL) - return (NULL); - } - evh = mtod(m, struct ether_vlan_header *); - eth_type = ntohs(evh->evl_proto); +#endif +#if defined(INET6) + case ETHERTYPE_IPV6: + *proto = -1; + *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); + /* Assert the network stack sent us a valid packet. 
*/ + KASSERT(*start > offset, + ("%s: mbuf %p start %d offset %d proto %d", __func__, m, + *start, offset, *proto)); + break; +#endif + default: + sc->vtnet_stats.tx_csum_bad_ethtype++; + return (EINVAL); } - switch (eth_type) { - case ETHERTYPE_IP: - if (m->m_len < ip_offset + sizeof(struct ip)) { - m = m_pullup(m, ip_offset + sizeof(struct ip)); - if (m == NULL) - return (NULL); - } + return (0); +} - ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset); - ip_proto = ip->ip_p; - csum_start = ip_offset + (ip->ip_hl << 2); - gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - break; +static int +vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type, + int offset, struct virtio_net_hdr *hdr) +{ + static struct timeval lastecn; + static int curecn; + struct vtnet_softc *sc; + struct tcphdr *tcp, tcphdr; - case ETHERTYPE_IPV6: - if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) { - m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr)); - if (m == NULL) - return (NULL); - } + sc = txq->vtntx_sc; + + if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { + m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); + tcp = &tcphdr; + } else + tcp = (struct tcphdr *)(m->m_data + offset); + + hdr->hdr_len = offset + (tcp->th_off << 2); + hdr->gso_size = m->m_pkthdr.tso_segsz; + hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : + VIRTIO_NET_HDR_GSO_TCPV6; - ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset); + if (tcp->th_flags & TH_CWR) { /* - * XXX Assume no extension headers are present. Presently, - * this will always be true in the case of TSO, and FreeBSD - * does not perform checksum offloading of IPv6 yet. + * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, + * ECN support is not on a per-interface basis, but globally via + * the net.inet.tcp.ecn.enable sysctl knob. The default is off. 
*/ - ip_proto = ip6->ip6_nxt; - csum_start = ip_offset + sizeof(struct ip6_hdr); - gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - break; - - default: - return (m); + if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { + if (ppsratecheck(&lastecn, &curecn, 1)) + if_printf(sc->vtnet_ifp, + "TSO with ECN not negotiated with host\n"); + return (ENOTSUP); + } + hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } - if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) { - hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; - hdr->csum_start = csum_start; - hdr->csum_offset = m->m_pkthdr.csum_data; + txq->vtntx_stats.vtxs_tso++; - sc->vtnet_stats.tx_csum_offloaded++; - } + return (0); +} - if (m->m_pkthdr.csum_flags & CSUM_TSO) { - if (ip_proto != IPPROTO_TCP) - return (m); +static struct mbuf * +vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m, + struct virtio_net_hdr *hdr) +{ + struct vtnet_softc *sc; + int flags, etype, csum_start, proto, error; - if (m->m_len < csum_start + sizeof(struct tcphdr)) { - m = m_pullup(m, csum_start + sizeof(struct tcphdr)); - if (m == NULL) - return (NULL); - } + sc = txq->vtntx_sc; + flags = m->m_pkthdr.csum_flags; - tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start); - hdr->gso_type = gso_type; - hdr->hdr_len = csum_start + (tcp->th_off << 2); - hdr->gso_size = m->m_pkthdr.tso_segsz; + error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start); + if (error) + goto drop; - if (tcp->th_flags & TH_CWR) { - /* - * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN. - * ECN support is only configurable globally with the - * net.inet.tcp.ecn.enable sysctl knob. - */ - if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { - if_printf(ifp, "TSO with ECN not supported " - "by host\n"); - m_freem(m); - return (NULL); - } + if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) || + (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) { + /* + * We could compare the IP protocol vs the CSUM_ flag too, + * but that really should not be necessary. 
+ */ + hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; + hdr->csum_start = csum_start; + hdr->csum_offset = m->m_pkthdr.csum_data; + txq->vtntx_stats.vtxs_csum++; + } - hdr->flags |= VIRTIO_NET_HDR_GSO_ECN; + if (flags & CSUM_TSO) { + if (__predict_false(proto != IPPROTO_TCP)) { + /* Likely failed to correctly parse the mbuf. */ + sc->vtnet_stats.tx_tso_not_tcp++; + goto drop; } - sc->vtnet_stats.tx_tso_offloaded++; + KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, + ("%s: mbuf %p TSO without checksum offload", __func__, m)); + + error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr); + if (error) + goto drop; } return (m); + +drop: + m_freem(m); + return (NULL); } static int -vtnet_enqueue_txbuf(struct vtnet_softc *sc, struct mbuf **m_head, +vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head, struct vtnet_tx_header *txhdr) { struct sglist sg; struct sglist_seg segs[VTNET_MAX_TX_SEGS]; + struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m; int collapsed, error; - vq = sc->vtnet_tx_vq; + vq = txq->vtntx_vq; + sc = txq->vtntx_sc; m = *m_head; collapsed = 0; sglist_init(&sg, VTNET_MAX_TX_SEGS, segs); error = sglist_append(&sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); KASSERT(error == 0 && sg.sg_nseg == 1, - ("%s: cannot add header to sglist error %d", __func__, error)); + ("%s: error %d adding header to sglist", __func__, error)); again: error = sglist_append_mbuf(&sg, m); @@ -1878,12 +2065,14 @@ again: *m_head = m; collapsed = 1; + txq->vtntx_stats.vtxs_collapsed++; goto again; } txhdr->vth_mbuf = m; + error = virtqueue_enqueue(vq, txhdr, &sg, sg.sg_nseg, 0); - return (virtqueue_enqueue(vq, txhdr, &sg, sg.sg_nseg, 0)); + return (error); fail: m_freem(*m_head); @@ -1893,28 +2082,29 @@ fail: } static int -vtnet_encap(struct vtnet_softc *sc, struct mbuf **m_head) +vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head) { + struct vtnet_softc *sc; struct vtnet_tx_header *txhdr; struct virtio_net_hdr *hdr; struct mbuf *m; int error; + sc = 
txq->vtntx_sc; m = *m_head; M_ASSERTPKTHDR(m); txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO); if (txhdr == NULL) { - *m_head = NULL; m_freem(m); + *m_head = NULL; return (ENOMEM); } /* - * Always use the non-mergeable header to simplify things. When - * the mergeable feature is negotiated, the num_buffers field - * must be set to zero. We use vtnet_hdr_size later to enqueue - * the correct header size to the host. + * Always use the non-mergeable header, regardless if the feature + * was negotiated. For transmit, num_buffers is always zero. The + * vtnet_hdr_size is used to enqueue the correct header size. */ hdr = &txhdr->vth_uhdr.hdr; @@ -1927,72 +2117,55 @@ vtnet_encap(struct vtnet_softc *sc, struct mbuf **m_head) m->m_flags &= ~M_VLANTAG; } - if (m->m_pkthdr.csum_flags != 0) { - m = vtnet_tx_offload(sc, m, hdr); + if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) { + m = vtnet_txq_offload(txq, m, hdr); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } } - error = vtnet_enqueue_txbuf(sc, m_head, txhdr); + error = vtnet_txq_enqueue_buf(txq, m_head, txhdr); + if (error == 0) + return (0); + fail: - if (error) - uma_zfree(vtnet_tx_header_zone, txhdr); + uma_zfree(vtnet_tx_header_zone, txhdr); return (error); } -static void -vtnet_start(struct ifnet *ifp) -{ - struct vtnet_softc *sc; - - sc = ifp->if_softc; - - VTNET_LOCK(sc); - vtnet_start_locked(ifp); - VTNET_UNLOCK(sc); -} +#ifdef VTNET_LEGACY_TX static void -vtnet_start_locked(struct ifnet *ifp) +vtnet_start_locked(struct vtnet_txq *txq, struct ifnet *ifp) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m0; int enq; - sc = ifp->if_softc; - vq = sc->vtnet_tx_vq; + sc = txq->vtntx_sc; + vq = txq->vtntx_vq; enq = 0; - VTNET_LOCK_ASSERT(sc); + VTNET_TXQ_LOCK_ASSERT(txq); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING || ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + sc->vtnet_link_active == 
0) return; -#ifdef VTNET_TX_INTR_MODERATION - if (virtqueue_nused(vq) >= sc->vtnet_tx_size / 2) - vtnet_txeof(sc); -#endif - while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { - if (virtqueue_full(vq)) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; + if (virtqueue_full(vq)) break; - } IFQ_DRV_DEQUEUE(&ifp->if_snd, m0); if (m0 == NULL) break; - if (vtnet_encap(sc, &m0) != 0) { - if (m0 == NULL) - break; - IFQ_DRV_PREPEND(&ifp->if_snd, m0); - ifp->if_drv_flags |= IFF_DRV_OACTIVE; + if (vtnet_txq_encap(txq, &m0) != 0) { + if (m0 != NULL) + IFQ_DRV_PREPEND(&ifp->if_snd, m0); break; } @@ -2002,65 +2175,529 @@ vtnet_start_locked(struct ifnet *ifp) if (enq > 0) { virtqueue_notify(vq); - sc->vtnet_watchdog_timer = VTNET_WATCHDOG_TIMEOUT; + txq->vtntx_watchdog = VTNET_TX_TIMEOUT; } } static void -vtnet_tick(void *xsc) +vtnet_start(struct ifnet *ifp) { struct vtnet_softc *sc; + struct vtnet_txq *txq; - sc = xsc; + sc = ifp->if_softc; + txq = &sc->vtnet_txqs[0]; - VTNET_LOCK_ASSERT(sc); -#ifdef VTNET_DEBUG - virtqueue_dump(sc->vtnet_rx_vq); - virtqueue_dump(sc->vtnet_tx_vq); -#endif + VTNET_TXQ_LOCK(txq); + vtnet_start_locked(txq, ifp); + VTNET_TXQ_UNLOCK(txq); +} - vtnet_watchdog(sc); - callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); +#else /* !VTNET_LEGACY_TX */ + +static int +vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m) +{ + struct vtnet_softc *sc; + struct virtqueue *vq; + struct buf_ring *br; + struct ifnet *ifp; + int enq, error; + + sc = txq->vtntx_sc; + vq = txq->vtntx_vq; + br = txq->vtntx_br; + ifp = sc->vtnet_ifp; + enq = 0; + error = 0; + + VTNET_TXQ_LOCK_ASSERT(txq); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + sc->vtnet_link_active == 0) { + if (m != NULL) + error = drbr_enqueue(ifp, br, m); + return (error); + } + + if (m != NULL) { + error = drbr_enqueue(ifp, br, m); + if (error) + return (error); + } + + while ((m = drbr_peek(ifp, br)) != NULL) { + error = vtnet_txq_encap(txq, &m); + if (error) { + if (m != NULL) + drbr_putback(ifp, 
br, m); + else + drbr_advance(ifp, br); + break; + } + drbr_advance(ifp, br); + + enq++; + ETHER_BPF_MTAP(ifp, m); + } + + if (enq > 0) { + virtqueue_notify(vq); + txq->vtntx_watchdog = VTNET_TX_TIMEOUT; + } + + return (error); +} + +static int +vtnet_txq_mq_start(struct ifnet *ifp, struct mbuf *m) +{ + struct vtnet_softc *sc; + struct vtnet_txq *txq; + int i, npairs, error; + + sc = ifp->if_softc; + npairs = sc->vtnet_act_vq_pairs; + + if (m->m_flags & M_FLOWID) + i = m->m_pkthdr.flowid % npairs; + else + i = curcpu % npairs; + + txq = &sc->vtnet_txqs[i]; + + if (VTNET_TXQ_TRYLOCK(txq) != 0) { + error = vtnet_txq_mq_start_locked(txq, m); + VTNET_TXQ_UNLOCK(txq); + } else { + error = drbr_enqueue(ifp, txq->vtntx_br, m); + taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask); + } + + return (error); } static void -vtnet_tx_vq_intr(void *xsc) +vtnet_txq_tq_deferred(void *xtxq, int pending) { struct vtnet_softc *sc; + struct vtnet_txq *txq; + + txq = xtxq; + sc = txq->vtntx_sc; + + VTNET_TXQ_LOCK(txq); + if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br)) + vtnet_txq_mq_start_locked(txq, NULL); + VTNET_TXQ_UNLOCK(txq); +} + +#endif /* VTNET_LEGACY_TX */ + +static void +vtnet_txq_tq_intr(void *xtxq, int pending) +{ + struct vtnet_softc *sc; + struct vtnet_txq *txq; struct ifnet *ifp; - sc = xsc; + txq = xtxq; + sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; -again: - VTNET_LOCK(sc); + VTNET_TXQ_LOCK(txq); -#ifdef DEVICE_POLLING - if (ifp->if_capenable & IFCAP_POLLING) { - VTNET_UNLOCK(sc); + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + VTNET_TXQ_UNLOCK(txq); return; } + + vtnet_txq_eof(txq); + +#ifdef VTNET_LEGACY_TX + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + vtnet_start_locked(txq, ifp); +#else + if (!drbr_empty(ifp, txq->vtntx_br)) + vtnet_txq_mq_start_locked(txq, NULL); #endif + if (vtnet_txq_enable_intr(txq) != 0) { + vtnet_txq_disable_intr(txq); + txq->vtntx_stats.vtxs_rescheduled++; + taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); + } + + 
VTNET_TXQ_UNLOCK(txq); +} + +static void +vtnet_txq_eof(struct vtnet_txq *txq) +{ + struct virtqueue *vq; + struct vtnet_tx_header *txhdr; + struct mbuf *m; + + vq = txq->vtntx_vq; + VTNET_TXQ_LOCK_ASSERT(txq); + + while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { + m = txhdr->vth_mbuf; + + txq->vtntx_stats.vtxs_opackets++; + txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len; + if (m->m_flags & M_MCAST) + txq->vtntx_stats.vtxs_omcasts++; + + m_freem(m); + uma_zfree(vtnet_tx_header_zone, txhdr); + } + + if (virtqueue_empty(vq)) + txq->vtntx_watchdog = 0; +} + +static void +vtnet_tx_vq_intr(void *xtxq) +{ + struct vtnet_softc *sc; + struct vtnet_txq *txq; + struct ifnet *ifp; + int tries; + + txq = xtxq; + sc = txq->vtntx_sc; + ifp = sc->vtnet_ifp; + tries = 0; + + if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) { + /* + * Ignore this interrupt. Either this is a spurious interrupt + * or multiqueue without per-VQ MSIX so every queue needs to + * be polled (a brain dead configuration we could try harder + * to avoid). + */ + vtnet_txq_disable_intr(txq); + return; + } + +again: + VTNET_TXQ_LOCK(txq); + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { - vtnet_enable_tx_intr(sc); - VTNET_UNLOCK(sc); + VTNET_TXQ_UNLOCK(txq); return; } - vtnet_txeof(sc); + vtnet_txq_eof(txq); +#ifdef VTNET_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - vtnet_start_locked(ifp); + vtnet_start_locked(txq, ifp); +#else + if (!drbr_empty(ifp, txq->vtntx_br)) + vtnet_txq_mq_start_locked(txq, NULL); +#endif - if (vtnet_enable_tx_intr(sc) != 0) { - vtnet_disable_tx_intr(sc); - sc->vtnet_stats.tx_task_rescheduled++; - VTNET_UNLOCK(sc); - goto again; + if (vtnet_txq_enable_intr(txq) != 0) { + vtnet_txq_disable_intr(txq); + /* + * This is an occasional race, so retry a few times + * before scheduling the taskqueue. 
+ */ + VTNET_TXQ_UNLOCK(txq); + if (tries++ < VTNET_INTR_DISABLE_RETRIES) + goto again; + txq->vtntx_stats.vtxs_rescheduled++; + taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); + } else + VTNET_TXQ_UNLOCK(txq); +} + +static void +vtnet_tx_start_all(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + struct vtnet_txq *txq; + int i; + + ifp = sc->vtnet_ifp; + VTNET_CORE_LOCK_ASSERT(sc); + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { + txq = &sc->vtnet_txqs[i]; + + VTNET_TXQ_LOCK(txq); +#ifdef VTNET_LEGACY_TX + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + vtnet_start_locked(txq, ifp); +#else + if (!drbr_empty(ifp, txq->vtntx_br)) + vtnet_txq_mq_start_locked(txq, NULL); +#endif + VTNET_TXQ_UNLOCK(txq); + } +} + +#ifndef VTNET_LEGACY_TX +static void +vtnet_qflush(struct ifnet *ifp) +{ + struct vtnet_softc *sc; + struct vtnet_txq *txq; + struct mbuf *m; + int i; + + sc = ifp->if_softc; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { + txq = &sc->vtnet_txqs[i]; + + VTNET_TXQ_LOCK(txq); + while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL) + m_freem(m); + VTNET_TXQ_UNLOCK(txq); + } + + if_qflush(ifp); +} +#endif + +static int +vtnet_watchdog(struct vtnet_txq *txq) +{ + struct vtnet_softc *sc; + + sc = txq->vtntx_sc; + + VTNET_TXQ_LOCK(txq); + if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) { + VTNET_TXQ_UNLOCK(txq); + return (0); + } + VTNET_TXQ_UNLOCK(txq); + + if_printf(sc->vtnet_ifp, "watchdog timeout on queue %d\n", + txq->vtntx_id); + return (1); +} + +static void +vtnet_rxq_accum_stats(struct vtnet_rxq *rxq, struct vtnet_rxq_stats *accum) +{ + struct vtnet_rxq_stats *st; + + st = &rxq->vtnrx_stats; + + accum->vrxs_ipackets += st->vrxs_ipackets; + accum->vrxs_ibytes += st->vrxs_ibytes; + accum->vrxs_iqdrops += st->vrxs_iqdrops; + accum->vrxs_csum += st->vrxs_csum; + accum->vrxs_csum_failed += st->vrxs_csum_failed; + accum->vrxs_rescheduled += st->vrxs_rescheduled; +} + +static void +vtnet_txq_accum_stats(struct vtnet_txq *txq, struct vtnet_txq_stats 
*accum) +{ + struct vtnet_txq_stats *st; + + st = &txq->vtntx_stats; + + accum->vtxs_opackets += st->vtxs_opackets; + accum->vtxs_obytes += st->vtxs_obytes; + accum->vtxs_csum += st->vtxs_csum; + accum->vtxs_tso += st->vtxs_tso; + accum->vtxs_collapsed += st->vtxs_collapsed; + accum->vtxs_rescheduled += st->vtxs_rescheduled; +} + +static void +vtnet_accumulate_stats(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + struct vtnet_statistics *st; + struct vtnet_rxq_stats rxaccum; + struct vtnet_txq_stats txaccum; + int i; + + ifp = sc->vtnet_ifp; + st = &sc->vtnet_stats; + bzero(&rxaccum, sizeof(struct vtnet_rxq_stats)); + bzero(&txaccum, sizeof(struct vtnet_txq_stats)); + + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + vtnet_rxq_accum_stats(&sc->vtnet_rxqs[i], &rxaccum); + vtnet_txq_accum_stats(&sc->vtnet_txqs[i], &txaccum); + } + + st->rx_csum_offloaded = rxaccum.vrxs_csum; + st->rx_csum_failed = rxaccum.vrxs_csum_failed; + st->rx_task_rescheduled = rxaccum.vrxs_rescheduled; + st->tx_csum_offloaded = txaccum.vtxs_csum; + st->tx_tso_offloaded = txaccum.vtxs_tso; + st->tx_task_rescheduled = txaccum.vtxs_rescheduled; + + /* + * With the exception of if_ierrors, these ifnet statistics are + * only updated in the driver, so just set them to our accumulated + * values. if_ierrors is updated in ether_input() for malformed + * frames that we should have already discarded. 
+ */ + ifp->if_ipackets = rxaccum.vrxs_ipackets; + ifp->if_iqdrops = rxaccum.vrxs_iqdrops; + ifp->if_ierrors = rxaccum.vrxs_ierrors; + ifp->if_opackets = txaccum.vtxs_opackets; +#ifndef VTNET_LEGACY_TX + ifp->if_obytes = txaccum.vtxs_obytes; + ifp->if_omcasts = txaccum.vtxs_omcasts; +#endif +} + +static void +vtnet_tick(void *xsc) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + int i, timedout; + + sc = xsc; + ifp = sc->vtnet_ifp; + timedout = 0; + + VTNET_CORE_LOCK_ASSERT(sc); + vtnet_accumulate_stats(sc); + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) + timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]); + + if (timedout != 0) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + vtnet_init_locked(sc); + } else + callout_schedule(&sc->vtnet_tick_ch, hz); +} + +static void +vtnet_start_taskqueues(struct vtnet_softc *sc) +{ + device_t dev; + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i, error; + + dev = sc->vtnet_dev; + + /* + * Errors here are very difficult to recover from - we cannot + * easily fail because, if this is during boot, we will hang + * when freeing any successfully started taskqueues because + * the scheduler isn't up yet. + * + * Most drivers just ignore the return value - it only fails + * with ENOMEM so an error is not likely. 
+ */ + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET, + "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id); + if (error) { + device_printf(dev, "failed to start rx taskq %d\n", + rxq->vtnrx_id); + } + + txq = &sc->vtnet_txqs[i]; + error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET, + "%s txq %d", device_get_nameunit(dev), txq->vtntx_id); + if (error) { + device_printf(dev, "failed to start tx taskq %d\n", + txq->vtntx_id); + } + } +} + +static void +vtnet_free_taskqueues(struct vtnet_softc *sc) +{ + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i; + + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + if (rxq->vtnrx_tq != NULL) { + taskqueue_free(rxq->vtnrx_tq); + rxq->vtnrx_vq = NULL; + } + + txq = &sc->vtnet_txqs[i]; + if (txq->vtntx_tq != NULL) { + taskqueue_free(txq->vtntx_tq); + txq->vtntx_tq = NULL; + } } +} - VTNET_UNLOCK(sc); +static void +vtnet_drain_taskqueues(struct vtnet_softc *sc) +{ + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i; + + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + if (rxq->vtnrx_tq != NULL) + taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); + + txq = &sc->vtnet_txqs[i]; + if (txq->vtntx_tq != NULL) { + taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask); +#ifndef VTNET_LEGACY_TX + taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask); +#endif + } + } +} + +static void +vtnet_drain_rxtx_queues(struct vtnet_softc *sc) +{ + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + vtnet_rxq_free_mbufs(rxq); + + txq = &sc->vtnet_txqs[i]; + vtnet_txq_free_mbufs(txq); + } +} + +static void +vtnet_stop_rendezvous(struct vtnet_softc *sc) +{ + struct vtnet_rxq *rxq; + struct vtnet_txq *txq; + int i; + + /* + * Lock and unlock the per-queue mutex so we known the stop + * state is visible. 
Doing only the active queues should be + * sufficient, but it does not cost much extra to do all the + * queues. Note we hold the core mutex here too. + */ + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + VTNET_RXQ_LOCK(rxq); + VTNET_RXQ_UNLOCK(rxq); + + txq = &sc->vtnet_txqs[i]; + VTNET_TXQ_LOCK(txq); + VTNET_TXQ_UNLOCK(txq); + } } static void @@ -2072,46 +2709,60 @@ vtnet_stop(struct vtnet_softc *sc) dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; - VTNET_LOCK_ASSERT(sc); + VTNET_CORE_LOCK_ASSERT(sc); - sc->vtnet_watchdog_timer = 0; + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + sc->vtnet_link_active = 0; callout_stop(&sc->vtnet_tick_ch); - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - vtnet_disable_rx_intr(sc); - vtnet_disable_tx_intr(sc); + /* Only advisory. */ + vtnet_disable_interrupts(sc); /* - * Stop the host VirtIO adapter. Note this will reset the host - * adapter's state back to the pre-initialized state, so in - * order to make the device usable again, we must drive it - * through virtio_reinit() and virtio_reinit_complete(). + * Stop the host adapter. This resets it to the pre-initialized + * state. It will not generate any interrupts until after it is + * reinitialized. */ virtio_stop(dev); + vtnet_stop_rendezvous(sc); - sc->vtnet_flags &= ~VTNET_FLAG_LINK; - - vtnet_free_rx_mbufs(sc); - vtnet_free_tx_mbufs(sc); + /* Free any mbufs left in the virtqueues. */ + vtnet_drain_rxtx_queues(sc); } static int -vtnet_reinit(struct vtnet_softc *sc) +vtnet_virtio_reinit(struct vtnet_softc *sc) { + device_t dev; struct ifnet *ifp; uint64_t features; + int mask, error; + dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; features = sc->vtnet_features; + mask = 0; +#if defined(INET) + mask |= IFCAP_RXCSUM; +#endif +#if defined (INET6) + mask |= IFCAP_RXCSUM_IPV6; +#endif + /* * Re-negotiate with the host, removing any disabled receive * features. Transmit features are disabled only on our side * via if_capenable and if_hwassist. 
*/ - if (ifp->if_capabilities & IFCAP_RXCSUM) { - if ((ifp->if_capenable & IFCAP_RXCSUM) == 0) + if (ifp->if_capabilities & mask) { + /* + * We require both IPv4 and IPv6 offloading to be enabled + * in order to negotiated it: VirtIO does not distinguish + * between the two. + */ + if ((ifp->if_capenable & mask) != mask) features &= ~VIRTIO_NET_F_GUEST_CSUM; } @@ -2125,86 +2776,205 @@ vtnet_reinit(struct vtnet_softc *sc) features &= ~VIRTIO_NET_F_CTRL_VLAN; } - return (virtio_reinit(sc->vtnet_dev, features)); + error = virtio_reinit(dev, features); + if (error) + device_printf(dev, "virtio reinit error %d\n", error); + + return (error); } static void -vtnet_init_locked(struct vtnet_softc *sc) +vtnet_init_rx_filters(struct vtnet_softc *sc) { - device_t dev; struct ifnet *ifp; - int error; - dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; - VTNET_LOCK_ASSERT(sc); + if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { + /* Restore promiscuous and all-multicast modes. */ + vtnet_rx_filter(sc); + /* Restore filtered MAC addresses. */ + vtnet_rx_filter_mac(sc); + } - if (ifp->if_drv_flags & IFF_DRV_RUNNING) + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) + vtnet_rx_filter_vlan(sc); +} + +static int +vtnet_init_rx_queues(struct vtnet_softc *sc) +{ + device_t dev; + struct vtnet_rxq *rxq; + int i, clsize, error; + + dev = sc->vtnet_dev; + + /* + * Use the new cluster size if one has been set (via a MTU + * change). Otherwise, use the standard 2K clusters. + * + * BMV: It might make sense to use page sized clusters as + * the default (depending on the features negotiated). + */ + if (sc->vtnet_rx_new_clsize != 0) { + clsize = sc->vtnet_rx_new_clsize; + sc->vtnet_rx_new_clsize = 0; + } else + clsize = MCLBYTES; + + sc->vtnet_rx_clsize = clsize; + sc->vtnet_rx_nmbufs = VTNET_NEEDED_RX_MBUFS(sc, clsize); + + /* The first segment is reserved for the header. 
*/ + KASSERT(sc->vtnet_rx_nmbufs < VTNET_MAX_RX_SEGS, + ("%s: too many rx mbufs %d", __func__, sc->vtnet_rx_nmbufs)); + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { + rxq = &sc->vtnet_rxqs[i]; + + /* Hold the lock to satisfy asserts. */ + VTNET_RXQ_LOCK(rxq); + error = vtnet_rxq_populate(rxq); + VTNET_RXQ_UNLOCK(rxq); + + if (error) { + device_printf(dev, + "cannot allocate mbufs for Rx queue %d\n", i); + return (error); + } + } + + return (0); +} + +static int +vtnet_init_tx_queues(struct vtnet_softc *sc) +{ + struct vtnet_txq *txq; + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { + txq = &sc->vtnet_txqs[i]; + txq->vtntx_watchdog = 0; + } + + return (0); +} + +static int +vtnet_init_rxtx_queues(struct vtnet_softc *sc) +{ + int error; + + error = vtnet_init_rx_queues(sc); + if (error) + return (error); + + error = vtnet_init_tx_queues(sc); + if (error) + return (error); + + return (0); +} + +static void +vtnet_set_active_vq_pairs(struct vtnet_softc *sc) +{ + device_t dev; + int npairs; + + dev = sc->vtnet_dev; + + if ((sc->vtnet_flags & VTNET_FLAG_MULTIQ) == 0) { + MPASS(sc->vtnet_max_vq_pairs == 1); + sc->vtnet_act_vq_pairs = 1; return; + } - /* Stop host's adapter, cancel any pending I/O. */ - vtnet_stop(sc); + /* BMV: Just use the maximum configured for now. */ + npairs = sc->vtnet_max_vq_pairs; - /* Reinitialize the host device. */ - error = vtnet_reinit(sc); - if (error) { + if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) { device_printf(dev, - "reinitialization failed, stopping device...\n"); - vtnet_stop(sc); - return; + "cannot set active queue pairs to %d\n", npairs); + npairs = 1; } - /* Update host with assigned MAC address. */ + sc->vtnet_act_vq_pairs = npairs; +} + +static int +vtnet_reinit(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + int error; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + /* Use the current MAC address. 
*/ bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); vtnet_set_hwaddr(sc); + vtnet_set_active_vq_pairs(sc); + ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + ifp->if_hwassist |= VTNET_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; + if (ifp->if_capenable & IFCAP_TSO6) + ifp->if_hwassist |= CSUM_TSO; /* No CSUM_TSO_IPV6. */ - error = vtnet_init_rx_vq(sc); - if (error) { - device_printf(dev, - "cannot allocate mbufs for Rx virtqueue\n"); - vtnet_stop(sc); - return; - } + if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) + vtnet_init_rx_filters(sc); - if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { - if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { - /* Restore promiscuous and all-multicast modes. */ - vtnet_rx_filter(sc); + error = vtnet_init_rxtx_queues(sc); + if (error) + return (error); - /* Restore filtered MAC addresses. */ - vtnet_rx_filter_mac(sc); - } + vtnet_enable_interrupts(sc); + ifp->if_drv_flags |= IFF_DRV_RUNNING; - /* Restore VLAN filters. */ - if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) - vtnet_rx_filter_vlan(sc); - } + return (0); +} -#ifdef DEVICE_POLLING - if (ifp->if_capenable & IFCAP_POLLING) { - vtnet_disable_rx_intr(sc); - vtnet_disable_tx_intr(sc); - } else -#endif - { - vtnet_enable_rx_intr(sc); - vtnet_enable_tx_intr(sc); - } +static void +vtnet_init_locked(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_CORE_LOCK_ASSERT(sc); + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + vtnet_stop(sc); + + /* Reinitialize with the host. 
*/ + if (vtnet_virtio_reinit(sc) != 0) + goto fail; + + if (vtnet_reinit(sc) != 0) + goto fail; virtio_reinit_complete(dev); vtnet_update_link_status(sc); callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); + + return; + +fail: + vtnet_stop(sc); } static void @@ -2214,9 +2984,24 @@ vtnet_init(void *xsc) sc = xsc; - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); vtnet_init_locked(sc); - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); +} + +static void +vtnet_free_ctrl_vq(struct vtnet_softc *sc) +{ + struct virtqueue *vq; + + vq = sc->vtnet_ctrl_vq; + + /* + * The control virtqueue is only polled and therefore it should + * already be empty. + */ + KASSERT(virtqueue_empty(vq), + ("%s: ctrl vq %p not empty", __func__, vq)); } static void @@ -2224,87 +3009,117 @@ vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, struct sglist *sg, int readable, int writable) { struct virtqueue *vq; - void *c; vq = sc->vtnet_ctrl_vq; - VTNET_LOCK_ASSERT(sc); + VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ, - ("no control virtqueue")); - KASSERT(virtqueue_empty(vq), - ("control command already enqueued")); + ("%s: CTRL_VQ feature not negotiated", __func__)); + if (!virtqueue_empty(vq)) + return; if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0) return; - virtqueue_notify(vq); - /* - * Poll until the command is complete. Previously, we would - * sleep until the control virtqueue interrupt handler woke - * us up, but dropping the VTNET_MTX leads to serialization - * difficulties. - * - * Furthermore, it appears QEMU/KVM only allocates three MSIX - * vectors. Two of those vectors are needed for the Rx and Tx - * virtqueues. We do not support sharing both a Vq and config - * changed notification on the same MSIX vector. + * Poll for the response, but the command is likely already + * done when we return from the notify. 
*/ - c = virtqueue_poll(vq, NULL); - KASSERT(c == cookie, ("unexpected control command response")); + virtqueue_notify(vq); + virtqueue_poll(vq, NULL); } -static void -vtnet_rx_filter(struct vtnet_softc *sc) +static int +vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr) { - device_t dev; - struct ifnet *ifp; + struct virtio_net_ctrl_hdr hdr; + struct sglist_seg segs[3]; + struct sglist sg; + uint8_t ack; + int error; - dev = sc->vtnet_dev; - ifp = sc->vtnet_ifp; + hdr.class = VIRTIO_NET_CTRL_MAC; + hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; + ack = VIRTIO_NET_ERR; - VTNET_LOCK_ASSERT(sc); - KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, - ("CTRL_RX feature not negotiated")); + sglist_init(&sg, 3, segs); + error = 0; + error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, hwaddr, ETHER_ADDR_LEN); + error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + KASSERT(error == 0 && sg.sg_nseg == 3, + ("%s: error %d adding set MAC msg to sglist", __func__, error)); - if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0) - device_printf(dev, "cannot %s promiscuous mode\n", - ifp->if_flags & IFF_PROMISC ? "enable" : "disable"); + vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); - if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0) - device_printf(dev, "cannot %s all-multicast mode\n", - ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable"); + return (ack == VIRTIO_NET_OK ? 
0 : EIO); } static int -vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) +vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs) { - struct virtio_net_ctrl_hdr hdr; struct sglist_seg segs[3]; struct sglist sg; - uint8_t onoff, ack; + struct { + struct virtio_net_ctrl_hdr hdr; + uint8_t pad1; + struct virtio_net_ctrl_mq mq; + uint8_t pad2; + uint8_t ack; + } s; int error; - if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) - return (ENOTSUP); + s.hdr.class = VIRTIO_NET_CTRL_MQ; + s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; + s.mq.virtqueue_pairs = npairs; + s.ack = VIRTIO_NET_ERR; + sglist_init(&sg, 3, segs); error = 0; + error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq)); + error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); + KASSERT(error == 0 && sg.sg_nseg == 3, + ("%s: error %d adding MQ message to sglist", __func__, error)); - hdr.class = VIRTIO_NET_CTRL_RX; - hdr.cmd = cmd; - onoff = !!on; - ack = VIRTIO_NET_ERR; + vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); + + return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); +} + +static int +vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) +{ + struct sglist_seg segs[3]; + struct sglist sg; + struct { + struct virtio_net_ctrl_hdr hdr; + uint8_t pad1; + uint8_t onoff; + uint8_t pad2; + uint8_t ack; + } s; + int error; + + KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, + ("%s: CTRL_RX feature not negotiated", __func__)); + + s.hdr.class = VIRTIO_NET_CTRL_RX; + s.hdr.cmd = cmd; + s.onoff = !!on; + s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); - error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); - error |= sglist_append(&sg, &onoff, sizeof(uint8_t)); - error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + error = 0; + error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t)); + error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, - ("error adding Rx filter message to sglist")); + ("%s: error %d adding Rx message to sglist", __func__, error)); - vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); + vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); - return (ack == VIRTIO_NET_OK ? 0 : EIO); + return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int @@ -2321,6 +3136,48 @@ vtnet_set_allmulti(struct vtnet_softc *sc, int on) return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); } +/* + * The device defaults to promiscuous mode for backwards compatibility. + * Turn it off at attach time if possible. 
+ */ +static void +vtnet_attach_disable_promisc(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + + ifp = sc->vtnet_ifp; + + VTNET_CORE_LOCK(sc); + if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) { + ifp->if_flags |= IFF_PROMISC; + } else if (vtnet_set_promisc(sc, 0) != 0) { + ifp->if_flags |= IFF_PROMISC; + device_printf(sc->vtnet_dev, + "cannot disable default promiscuous mode\n"); + } + VTNET_CORE_UNLOCK(sc); +} + +static void +vtnet_rx_filter(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_CORE_LOCK_ASSERT(sc); + + if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0) + device_printf(dev, "cannot %s promiscuous mode\n", + ifp->if_flags & IFF_PROMISC ? "enable" : "disable"); + + if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0) + device_printf(dev, "cannot %s all-multicast mode\n", + ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable"); +} + static void vtnet_rx_filter_mac(struct vtnet_softc *sc) { @@ -2340,19 +3197,23 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) mcnt = 0; promisc = 0; allmulti = 0; - error = 0; - VTNET_LOCK_ASSERT(sc); + VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, - ("CTRL_RX feature not negotiated")); + ("%s: CTRL_RX feature not negotiated", __func__)); /* Unicast MAC addresses: */ if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; - else if (ucnt == VTNET_MAX_MAC_ENTRIES) + else if (memcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr), + sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0) + continue; + else if (ucnt == VTNET_MAX_MAC_ENTRIES) { + promisc = 1; break; + } bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr), &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN); @@ -2360,10 +3221,8 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) } if_addr_runlock(ifp); - if (ucnt >= VTNET_MAX_MAC_ENTRIES) { - promisc = 1; + if (promisc != 0) { filter->vmf_unicast.nentries = 
0; - if_printf(ifp, "more than %d MAC addresses assigned, " "falling back to promiscuous mode\n", VTNET_MAX_MAC_ENTRIES); @@ -2375,8 +3234,10 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; - else if (mcnt == VTNET_MAX_MAC_ENTRIES) + else if (mcnt == VTNET_MAX_MAC_ENTRIES) { + allmulti = 1; break; + } bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN); @@ -2384,17 +3245,15 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) } if_maddr_runlock(ifp); - if (mcnt >= VTNET_MAX_MAC_ENTRIES) { - allmulti = 1; + if (allmulti != 0) { filter->vmf_multicast.nentries = 0; - if_printf(ifp, "more than %d multicast MAC addresses " "assigned, falling back to all-multicast mode\n", VTNET_MAX_MAC_ENTRIES); } else filter->vmf_multicast.nentries = mcnt; - if (promisc && allmulti) + if (promisc != 0 && allmulti != 0) goto out; hdr.class = VIRTIO_NET_CTRL_MAC; @@ -2402,6 +3261,7 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) ack = VIRTIO_NET_ERR; sglist_init(&sg, 4, segs); + error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &filter->vmf_unicast, sizeof(uint32_t) + filter->vmf_unicast.nentries * ETHER_ADDR_LEN); @@ -2409,7 +3269,7 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) sizeof(uint32_t) + filter->vmf_multicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 4, - ("error adding MAC filtering message to sglist")); + ("%s: error %d adding MAC filter msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); @@ -2417,111 +3277,99 @@ vtnet_rx_filter_mac(struct vtnet_softc *sc) if_printf(ifp, "error setting host MAC filter table\n"); out: - if (promisc) - if (vtnet_set_promisc(sc, 1) != 0) - if_printf(ifp, "cannot enable promiscuous mode\n"); - if (allmulti) - if 
(vtnet_set_allmulti(sc, 1) != 0) - if_printf(ifp, "cannot enable all-multicast mode\n"); + if (promisc != 0 && vtnet_set_promisc(sc, 1) != 0) + if_printf(ifp, "cannot enable promiscuous mode\n"); + if (allmulti != 0 && vtnet_set_allmulti(sc, 1) != 0) + if_printf(ifp, "cannot enable all-multicast mode\n"); } static int vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { - struct virtio_net_ctrl_hdr hdr; struct sglist_seg segs[3]; struct sglist sg; - uint8_t ack; + struct { + struct virtio_net_ctrl_hdr hdr; + uint8_t pad1; + uint16_t tag; + uint8_t pad2; + uint8_t ack; + } s; int error; - hdr.class = VIRTIO_NET_CTRL_VLAN; - hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; - ack = VIRTIO_NET_ERR; - error = 0; + s.hdr.class = VIRTIO_NET_CTRL_VLAN; + s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; + s.tag = tag; + s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); - error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); - error |= sglist_append(&sg, &tag, sizeof(uint16_t)); - error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + error = 0; + error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &s.tag, sizeof(uint16_t)); + error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, - ("error adding VLAN control message to sglist")); + ("%s: error %d adding VLAN message to sglist", __func__, error)); - vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); + vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); - return (ack == VIRTIO_NET_OK ? 0 : EIO); + return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); } static void vtnet_rx_filter_vlan(struct vtnet_softc *sc) { - device_t dev; - uint32_t w, mask; + uint32_t w; uint16_t tag; - int i, nvlans, error; + int i, bit; - VTNET_LOCK_ASSERT(sc); + VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, - ("VLAN_FILTER feature not negotiated")); + ("%s: VLAN_FILTER feature not negotiated", __func__)); - dev = sc->vtnet_dev; - nvlans = sc->vtnet_nvlans; - error = 0; + /* Enable the filter for each configured VLAN. */ + for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) { + w = sc->vtnet_vlan_filter[i]; + + while ((bit = ffs(w) - 1) != -1) { + w &= ~(1 << bit); + tag = sizeof(w) * CHAR_BIT * i + bit; - /* Enable filtering for each configured VLAN. */ - for (i = 0; i < VTNET_VLAN_SHADOW_SIZE && nvlans > 0; i++) { - w = sc->vtnet_vlan_shadow[i]; - for (mask = 1, tag = i * 32; w != 0; mask <<= 1, tag++) { - if ((w & mask) != 0) { - w &= ~mask; - nvlans--; - if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) - error++; + if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) { + device_printf(sc->vtnet_dev, + "cannot enable VLAN %d filter\n", tag); } } } - - KASSERT(nvlans == 0, ("VLAN count incorrect")); - if (error) - device_printf(dev, "cannot restore VLAN filter table\n"); } static void -vtnet_set_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) +vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct ifnet *ifp; int idx, bit; - KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, - ("VLAN_FILTER feature not negotiated")); - - if ((tag == 0) || (tag > 4095)) - return; - ifp = sc->vtnet_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; - VTNET_LOCK(sc); + if (tag == 0 || tag > 4095) + return; + + VTNET_CORE_LOCK(sc); - /* Update shadow VLAN table. 
*/ - if (add) { - sc->vtnet_nvlans++; - sc->vtnet_vlan_shadow[idx] |= (1 << bit); - } else { - sc->vtnet_nvlans--; - sc->vtnet_vlan_shadow[idx] &= ~(1 << bit); - } + if (add) + sc->vtnet_vlan_filter[idx] |= (1 << bit); + else + sc->vtnet_vlan_filter[idx] &= ~(1 << bit); - if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { - if (vtnet_exec_vlan_filter(sc, add, tag) != 0) { - device_printf(sc->vtnet_dev, - "cannot %s VLAN %d %s the host filter table\n", - add ? "add" : "remove", tag, - add ? "to" : "from"); - } + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER && + vtnet_exec_vlan_filter(sc, add, tag) != 0) { + device_printf(sc->vtnet_dev, + "cannot %s VLAN %d %s the host filter table\n", + add ? "add" : "remove", tag, add ? "to" : "from"); } - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); } static void @@ -2531,7 +3379,7 @@ vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) if (ifp->if_softc != arg) return; - vtnet_set_vlan_filter(arg, 1, tag); + vtnet_update_vlan_filter(arg, 1, tag); } static void @@ -2541,7 +3389,47 @@ vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) if (ifp->if_softc != arg) return; - vtnet_set_vlan_filter(arg, 0, tag); + vtnet_update_vlan_filter(arg, 0, tag); +} + +static int +vtnet_is_link_up(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + uint16_t status; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + if ((ifp->if_capabilities & IFCAP_LINKSTATE) == 0) + status = VIRTIO_NET_S_LINK_UP; + else + status = virtio_read_dev_config_2(dev, + offsetof(struct virtio_net_config, status)); + + return ((status & VIRTIO_NET_S_LINK_UP) != 0); +} + +static void +vtnet_update_link_status(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + int link; + + ifp = sc->vtnet_ifp; + + VTNET_CORE_LOCK_ASSERT(sc); + link = vtnet_is_link_up(sc); + + /* Notify if the link status has changed. 
*/ + if (link != 0 && sc->vtnet_link_active == 0) { + sc->vtnet_link_active = 1; + if_link_state_change(ifp, LINK_STATE_UP); + } else if (link == 0 && sc->vtnet_link_active != 0) { + sc->vtnet_link_active = 0; + if_link_state_change(ifp, LINK_STATE_DOWN); + } } static int @@ -2569,112 +3457,334 @@ vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; - VTNET_LOCK(sc); + VTNET_CORE_LOCK(sc); if (vtnet_is_link_up(sc) != 0) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= VTNET_MEDIATYPE; } else ifmr->ifm_active |= IFM_NONE; - VTNET_UNLOCK(sc); + VTNET_CORE_UNLOCK(sc); } static void -vtnet_add_statistics(struct vtnet_softc *sc) +vtnet_set_hwaddr(struct vtnet_softc *sc) { device_t dev; - struct vtnet_statistics *stats; - struct sysctl_ctx_list *ctx; + + dev = sc->vtnet_dev; + + if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) { + if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0) + device_printf(dev, "unable to set MAC address\n"); + } else if (sc->vtnet_flags & VTNET_FLAG_MAC) { + virtio_write_device_config(dev, + offsetof(struct virtio_net_config, mac), + sc->vtnet_hwaddr, ETHER_ADDR_LEN); + } +} + +static void +vtnet_get_hwaddr(struct vtnet_softc *sc) +{ + device_t dev; + + dev = sc->vtnet_dev; + + if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) { + /* + * Generate a random locally administered unicast address. + * + * It would be nice to generate the same MAC address across + * reboots, but it seems all the hosts currently available + * support the MAC feature, so this isn't too important. 
+ */ + sc->vtnet_hwaddr[0] = 0xB2; + arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); + vtnet_set_hwaddr(sc); + return; + } + + virtio_read_device_config(dev, offsetof(struct virtio_net_config, mac), + sc->vtnet_hwaddr, ETHER_ADDR_LEN); +} + +static void +vtnet_vlan_tag_remove(struct mbuf *m) +{ + struct ether_vlan_header *evh; + + evh = mtod(m, struct ether_vlan_header *); + m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); + m->m_flags |= M_VLANTAG; + + /* Strip the 802.1Q header. */ + bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, + ETHER_HDR_LEN - ETHER_TYPE_LEN); + m_adj(m, ETHER_VLAN_ENCAP_LEN); +} + +static void +vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct vtnet_rxq *rxq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *list; + struct vtnet_rxq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "Receive Queue"); + list = SYSCTL_CHILDREN(node); + + stats = &rxq->vtnrx_stats; + + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, + &stats->vrxs_ipackets, "Receive packets"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, + &stats->vrxs_ibytes, "Receive bytes"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, + &stats->vrxs_iqdrops, "Receive drops"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, + &stats->vrxs_ierrors, "Receive errors"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, + &stats->vrxs_csum, "Receive checksum offloaded"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD, + &stats->vrxs_csum_failed, "Receive checksum offload failed"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, + &stats->vrxs_rescheduled, + "Receive interrupt handler rescheduled"); +} + +static void +vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, 
struct vtnet_txq *txq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *list; + struct vtnet_txq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "Transmit Queue"); + list = SYSCTL_CHILDREN(node); + + stats = &txq->vtntx_stats; + + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, + &stats->vtxs_opackets, "Transmit packets"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, + &stats->vtxs_obytes, "Transmit bytes"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, + &stats->vtxs_omcasts, "Transmit multicasts"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, + &stats->vtxs_csum, "Transmit checksum offloaded"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, + &stats->vtxs_tso, "Transmit segmentation offloaded"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "collapsed", CTLFLAG_RD, + &stats->vtxs_collapsed, "Transmit mbufs collapsed"); + SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, + &stats->vtxs_rescheduled, + "Transmit interrupt handler rescheduled"); +} + +static void +vtnet_setup_queue_sysctl(struct vtnet_softc *sc) +{ + device_t dev; + struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; + int i; dev = sc->vtnet_dev; - stats = &sc->vtnet_stats; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_failed", + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]); + vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]); + } +} + +static void +vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct vtnet_softc *sc) +{ + struct vtnet_statistics *stats; + + stats = &sc->vtnet_stats; + + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed", 
CTLFLAG_RD, &stats->mbuf_alloc_failed, "Mbuf cluster allocation failures"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_frame_too_large", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large", CTLFLAG_RD, &stats->rx_frame_too_large, "Received frame larger than the mbuf chain"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_enq_replacement_failed", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed", CTLFLAG_RD, &stats->rx_enq_replacement_failed, "Enqueuing the replacement receive mbuf failed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_mergeable_failed", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed", CTLFLAG_RD, &stats->rx_mergeable_failed, "Mergeable buffers receive failures"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", CTLFLAG_RD, &stats->rx_csum_bad_ethtype, "Received checksum offloaded buffer with unsupported " "Ethernet type"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_start", - CTLFLAG_RD, &stats->rx_csum_bad_start, - "Received checksum offloaded buffer with incorrect start offset"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", CTLFLAG_RD, &stats->rx_csum_bad_ipproto, "Received checksum offloaded buffer with incorrect IP protocol"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_offset", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset", CTLFLAG_RD, &stats->rx_csum_bad_offset, "Received checksum offloaded buffer with incorrect offset"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_failed", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto", + CTLFLAG_RD, &stats->rx_csum_bad_proto, + "Received checksum offloaded buffer with incorrect protocol"); + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed", CTLFLAG_RD, &stats->rx_csum_failed, "Received buffer checksum offload failed"); - SYSCTL_ADD_ULONG(ctx, 
child, OID_AUTO, "rx_csum_offloaded", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded", CTLFLAG_RD, &stats->rx_csum_offloaded, "Received buffer checksum offload succeeded"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_task_rescheduled", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled", CTLFLAG_RD, &stats->rx_task_rescheduled, "Times the receive interrupt task rescheduled itself"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_offloaded", - CTLFLAG_RD, &stats->tx_csum_offloaded, - "Offloaded checksum of transmitted buffer"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_offloaded", - CTLFLAG_RD, &stats->tx_tso_offloaded, - "Segmentation offload of transmitted buffer"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", CTLFLAG_RD, &stats->tx_csum_bad_ethtype, "Aborted transmit of checksum offloaded buffer with unknown " "Ethernet type"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", CTLFLAG_RD, &stats->tx_tso_bad_ethtype, "Aborted transmit of TSO buffer with unknown Ethernet type"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_task_rescheduled", + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp", + CTLFLAG_RD, &stats->tx_tso_not_tcp, + "Aborted transmit of TSO buffer with non TCP protocol"); + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded", + CTLFLAG_RD, &stats->tx_csum_offloaded, + "Offloaded checksum of transmitted buffer"); + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded", + CTLFLAG_RD, &stats->tx_tso_offloaded, + "Segmentation offload of transmitted buffer"); + SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled", CTLFLAG_RD, &stats->tx_task_rescheduled, "Times the transmit interrupt task rescheduled itself"); } +static void +vtnet_setup_sysctl(struct vtnet_softc *sc) +{ + device_t dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid 
*tree; + struct sysctl_oid_list *child; + + dev = sc->vtnet_dev; + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs", + CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0, + "Maximum number of supported virtqueue pairs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs", + CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0, + "Number of active virtqueue pairs"); + + vtnet_setup_stat_sysctl(ctx, child, sc); +} + static int -vtnet_enable_rx_intr(struct vtnet_softc *sc) +vtnet_rxq_enable_intr(struct vtnet_rxq *rxq) { - return (virtqueue_enable_intr(sc->vtnet_rx_vq)); + return (virtqueue_enable_intr(rxq->vtnrx_vq)); } static void -vtnet_disable_rx_intr(struct vtnet_softc *sc) +vtnet_rxq_disable_intr(struct vtnet_rxq *rxq) { - virtqueue_disable_intr(sc->vtnet_rx_vq); + virtqueue_disable_intr(rxq->vtnrx_vq); } static int -vtnet_enable_tx_intr(struct vtnet_softc *sc) +vtnet_txq_enable_intr(struct vtnet_txq *txq) { -#ifdef VTNET_TX_INTR_MODERATION - return (0); -#else - return (virtqueue_enable_intr(sc->vtnet_tx_vq)); -#endif + return (virtqueue_postpone_intr(txq->vtntx_vq, VQ_POSTPONE_LONG)); +} + +static void +vtnet_txq_disable_intr(struct vtnet_txq *txq) +{ + + virtqueue_disable_intr(txq->vtntx_vq); +} + +static void +vtnet_enable_rx_interrupts(struct vtnet_softc *sc) +{ + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) + vtnet_rxq_enable_intr(&sc->vtnet_rxqs[i]); } static void -vtnet_disable_tx_intr(struct vtnet_softc *sc) +vtnet_enable_tx_interrupts(struct vtnet_softc *sc) { + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) + vtnet_txq_enable_intr(&sc->vtnet_txqs[i]); +} + +static void +vtnet_enable_interrupts(struct vtnet_softc *sc) +{ + + vtnet_enable_rx_interrupts(sc); + vtnet_enable_tx_interrupts(sc); +} + +static void +vtnet_disable_rx_interrupts(struct vtnet_softc *sc) +{ + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) + 
vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]); +} + +static void +vtnet_disable_tx_interrupts(struct vtnet_softc *sc) +{ + int i; + + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) + vtnet_txq_disable_intr(&sc->vtnet_txqs[i]); +} + +static void +vtnet_disable_interrupts(struct vtnet_softc *sc) +{ + + vtnet_disable_rx_interrupts(sc); + vtnet_disable_tx_interrupts(sc); +} + +static int +vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def) +{ + char path[64]; + + snprintf(path, sizeof(path), + "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob); + TUNABLE_INT_FETCH(path, &def); - virtqueue_disable_intr(sc->vtnet_tx_vq); + return (def); } diff --git a/sys/dev/virtio/network/if_vtnetvar.h b/sys/dev/virtio/network/if_vtnetvar.h index d870436..5921103 100644 --- a/sys/dev/virtio/network/if_vtnetvar.h +++ b/sys/dev/virtio/network/if_vtnetvar.h @@ -29,83 +29,165 @@ #ifndef _IF_VTNETVAR_H #define _IF_VTNETVAR_H +struct vtnet_softc; + struct vtnet_statistics { - unsigned long mbuf_alloc_failed; - - unsigned long rx_frame_too_large; - unsigned long rx_enq_replacement_failed; - unsigned long rx_mergeable_failed; - unsigned long rx_csum_bad_ethtype; - unsigned long rx_csum_bad_start; - unsigned long rx_csum_bad_ipproto; - unsigned long rx_csum_bad_offset; - unsigned long rx_csum_failed; - unsigned long rx_csum_offloaded; - unsigned long rx_task_rescheduled; - - unsigned long tx_csum_offloaded; - unsigned long tx_tso_offloaded; - unsigned long tx_csum_bad_ethtype; - unsigned long tx_tso_bad_ethtype; - unsigned long tx_task_rescheduled; + uint64_t mbuf_alloc_failed; + + uint64_t rx_frame_too_large; + uint64_t rx_enq_replacement_failed; + uint64_t rx_mergeable_failed; + uint64_t rx_csum_bad_ethtype; + uint64_t rx_csum_bad_ipproto; + uint64_t rx_csum_bad_offset; + uint64_t rx_csum_bad_proto; + uint64_t tx_csum_bad_ethtype; + uint64_t tx_tso_bad_ethtype; + uint64_t tx_tso_not_tcp; + + /* + * These are accumulated from each Rx/Tx queue. 
+ */ + uint64_t rx_csum_failed; + uint64_t rx_csum_offloaded; + uint64_t rx_task_rescheduled; + uint64_t tx_csum_offloaded; + uint64_t tx_tso_offloaded; + uint64_t tx_task_rescheduled; +}; + +struct vtnet_rxq_stats { + uint64_t vrxs_ipackets; /* if_ipackets */ + uint64_t vrxs_ibytes; /* if_ibytes */ + uint64_t vrxs_iqdrops; /* if_iqdrops */ + uint64_t vrxs_ierrors; /* if_ierrors */ + uint64_t vrxs_csum; + uint64_t vrxs_csum_failed; + uint64_t vrxs_rescheduled; }; +struct vtnet_rxq { + struct mtx vtnrx_mtx; + struct vtnet_softc *vtnrx_sc; + struct virtqueue *vtnrx_vq; + int vtnrx_id; + int vtnrx_process_limit; + struct vtnet_rxq_stats vtnrx_stats; + struct taskqueue *vtnrx_tq; + struct task vtnrx_intrtask; + char vtnrx_name[16]; +} __aligned(CACHE_LINE_SIZE); + +#define VTNET_RXQ_LOCK(_rxq) mtx_lock(&(_rxq)->vtnrx_mtx) +#define VTNET_RXQ_UNLOCK(_rxq) mtx_unlock(&(_rxq)->vtnrx_mtx) +#define VTNET_RXQ_LOCK_ASSERT(_rxq) \ + mtx_assert(&(_rxq)->vtnrx_mtx, MA_OWNED) +#define VTNET_RXQ_LOCK_ASSERT_NOTOWNED(_rxq) \ + mtx_assert(&(_rxq)->vtnrx_mtx, MA_NOTOWNED) + +struct vtnet_txq_stats { + uint64_t vtxs_opackets; /* if_opackets */ + uint64_t vtxs_obytes; /* if_obytes */ + uint64_t vtxs_omcasts; /* if_omcasts */ + uint64_t vtxs_csum; + uint64_t vtxs_tso; + uint64_t vtxs_collapsed; + uint64_t vtxs_rescheduled; +}; + +struct vtnet_txq { + struct mtx vtntx_mtx; + struct vtnet_softc *vtntx_sc; + struct virtqueue *vtntx_vq; +#ifndef VTNET_LEGACY_TX + struct buf_ring *vtntx_br; +#endif + int vtntx_id; + int vtntx_watchdog; + struct vtnet_txq_stats vtntx_stats; + struct taskqueue *vtntx_tq; + struct task vtntx_intrtask; +#ifndef VTNET_LEGACY_TX + struct task vtntx_defrtask; +#endif + char vtntx_name[16]; +} __aligned(CACHE_LINE_SIZE); + +#define VTNET_TXQ_LOCK(_txq) mtx_lock(&(_txq)->vtntx_mtx) +#define VTNET_TXQ_TRYLOCK(_txq) mtx_trylock(&(_txq)->vtntx_mtx) +#define VTNET_TXQ_UNLOCK(_txq) mtx_unlock(&(_txq)->vtntx_mtx) +#define VTNET_TXQ_LOCK_ASSERT(_txq) \ + 
mtx_assert(&(_txq)->vtntx_mtx, MA_OWNED) +#define VTNET_TXQ_LOCK_ASSERT_NOTOWNED(_txq) \ + mtx_assert(&(_txq)->vtntx_mtx, MA_NOTOWNED) + struct vtnet_softc { device_t vtnet_dev; struct ifnet *vtnet_ifp; - struct mtx vtnet_mtx; + struct vtnet_rxq *vtnet_rxqs; + struct vtnet_txq *vtnet_txqs; uint32_t vtnet_flags; -#define VTNET_FLAG_LINK 0x0001 -#define VTNET_FLAG_SUSPENDED 0x0002 +#define VTNET_FLAG_SUSPENDED 0x0001 +#define VTNET_FLAG_MAC 0x0002 #define VTNET_FLAG_CTRL_VQ 0x0004 #define VTNET_FLAG_CTRL_RX 0x0008 -#define VTNET_FLAG_VLAN_FILTER 0x0010 -#define VTNET_FLAG_TSO_ECN 0x0020 -#define VTNET_FLAG_MRG_RXBUFS 0x0040 -#define VTNET_FLAG_LRO_NOMRG 0x0080 - - struct virtqueue *vtnet_rx_vq; - struct virtqueue *vtnet_tx_vq; - struct virtqueue *vtnet_ctrl_vq; +#define VTNET_FLAG_CTRL_MAC 0x0010 +#define VTNET_FLAG_VLAN_FILTER 0x0020 +#define VTNET_FLAG_TSO_ECN 0x0040 +#define VTNET_FLAG_MRG_RXBUFS 0x0080 +#define VTNET_FLAG_LRO_NOMRG 0x0100 +#define VTNET_FLAG_MULTIQ 0x0200 + int vtnet_link_active; int vtnet_hdr_size; - int vtnet_tx_size; - int vtnet_rx_size; int vtnet_rx_process_limit; - int vtnet_rx_mbuf_size; - int vtnet_rx_mbuf_count; + int vtnet_rx_nmbufs; + int vtnet_rx_clsize; + int vtnet_rx_new_clsize; int vtnet_if_flags; - int vtnet_watchdog_timer; - uint64_t vtnet_features; + int vtnet_act_vq_pairs; + int vtnet_max_vq_pairs; - struct vtnet_statistics vtnet_stats; + struct virtqueue *vtnet_ctrl_vq; + struct vtnet_mac_filter *vtnet_mac_filter; + uint32_t *vtnet_vlan_filter; + uint64_t vtnet_features; + struct vtnet_statistics vtnet_stats; struct callout vtnet_tick_ch; - + struct ifmedia vtnet_media; eventhandler_tag vtnet_vlan_attach; eventhandler_tag vtnet_vlan_detach; - struct ifmedia vtnet_media; - /* - * Fake media type; the host does not provide us with - * any real media information. 
- */ -#define VTNET_MEDIATYPE (IFM_ETHER | IFM_1000_T | IFM_FDX) + struct mtx vtnet_mtx; + char vtnet_mtx_name[16]; char vtnet_hwaddr[ETHER_ADDR_LEN]; +}; - struct vtnet_mac_filter *vtnet_mac_filter; - /* - * During reset, the host's VLAN filtering table is lost. The - * array below is used to restore all the VLANs configured on - * this interface after a reset. - */ -#define VTNET_VLAN_SHADOW_SIZE (4096 / 32) - int vtnet_nvlans; - uint32_t vtnet_vlan_shadow[VTNET_VLAN_SHADOW_SIZE]; +/* + * Maximum number of queue pairs we will autoconfigure to. + */ +#define VTNET_MAX_QUEUE_PAIRS 8 - char vtnet_mtx_name[16]; -}; +/* + * Additional completed entries can appear in a virtqueue before we can + * reenable interrupts. Number of times to retry before scheduling the + * taskqueue to process the completed entries. + */ +#define VTNET_INTR_DISABLE_RETRIES 4 + +/* + * Fake the media type. The host does not provide us with any real media + * information. + */ +#define VTNET_MEDIATYPE (IFM_ETHER | IFM_10G_T | IFM_FDX) + +/* + * Number of words to allocate for the VLAN shadow table. There is one + * bit for each VLAN. + */ +#define VTNET_VLAN_FILTER_NWORDS (4096 / 32) /* * When mergeable buffers are not negotiated, the vtnet_rx_header structure @@ -161,8 +243,12 @@ struct vtnet_mac_filter { */ CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE); -#define VTNET_WATCHDOG_TIMEOUT 5 +#define VTNET_TX_TIMEOUT 5 #define VTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP) +#define VTNET_CSUM_OFFLOAD_IPV6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) + +#define VTNET_CSUM_ALL_OFFLOAD \ + (VTNET_CSUM_OFFLOAD | VTNET_CSUM_OFFLOAD_IPV6 | CSUM_TSO) /* Features desired/implemented by this driver. 
*/ #define VTNET_FEATURES \ @@ -170,8 +256,10 @@ CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE); VIRTIO_NET_F_STATUS | \ VIRTIO_NET_F_CTRL_VQ | \ VIRTIO_NET_F_CTRL_RX | \ + VIRTIO_NET_F_CTRL_MAC_ADDR | \ VIRTIO_NET_F_CTRL_VLAN | \ VIRTIO_NET_F_CSUM | \ + VIRTIO_NET_F_GSO | \ VIRTIO_NET_F_HOST_TSO4 | \ VIRTIO_NET_F_HOST_TSO6 | \ VIRTIO_NET_F_HOST_ECN | \ @@ -180,9 +268,18 @@ CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE); VIRTIO_NET_F_GUEST_TSO6 | \ VIRTIO_NET_F_GUEST_ECN | \ VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_MQ | \ + VIRTIO_RING_F_EVENT_IDX | \ VIRTIO_RING_F_INDIRECT_DESC) /* + * The VIRTIO_NET_F_HOST_TSO[46] features permit us to send the host + * frames larger than 1514 bytes. + */ +#define VTNET_TSO_FEATURES (VIRTIO_NET_F_GSO | VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_ECN) + +/* * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us * frames larger than 1514 bytes. We do not yet support software LRO * via tcp_lro_rx(). @@ -209,27 +306,34 @@ CTASSERT(((VTNET_MAX_RX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_RX_SIZE); CTASSERT(((VTNET_MAX_TX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_MTU); /* + * Number of slots in the Tx bufrings. This value matches most other + * multiqueue drivers. + */ +#define VTNET_DEFAULT_BUFRING_SIZE 4096 + +/* * Determine how many mbufs are in each receive buffer. For LRO without * mergeable descriptors, we must allocate an mbuf chain large enough to * hold both the vtnet_rx_header and the maximum receivable data. */ -#define VTNET_NEEDED_RX_MBUFS(_sc) \ +#define VTNET_NEEDED_RX_MBUFS(_sc, _clsize) \ ((_sc)->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0 ? 
1 : \ howmany(sizeof(struct vtnet_rx_header) + VTNET_MAX_RX_SIZE, \ - (_sc)->vtnet_rx_mbuf_size) + (_clsize)) -#define VTNET_MTX(_sc) &(_sc)->vtnet_mtx -#define VTNET_LOCK(_sc) mtx_lock(VTNET_MTX((_sc))) -#define VTNET_UNLOCK(_sc) mtx_unlock(VTNET_MTX((_sc))) -#define VTNET_LOCK_DESTROY(_sc) mtx_destroy(VTNET_MTX((_sc))) -#define VTNET_LOCK_ASSERT(_sc) mtx_assert(VTNET_MTX((_sc)), MA_OWNED) -#define VTNET_LOCK_ASSERT_NOTOWNED(_sc) \ - mtx_assert(VTNET_MTX((_sc)), MA_NOTOWNED) +#define VTNET_CORE_MTX(_sc) &(_sc)->vtnet_mtx +#define VTNET_CORE_LOCK(_sc) mtx_lock(VTNET_CORE_MTX((_sc))) +#define VTNET_CORE_UNLOCK(_sc) mtx_unlock(VTNET_CORE_MTX((_sc))) +#define VTNET_CORE_LOCK_DESTROY(_sc) mtx_destroy(VTNET_CORE_MTX((_sc))) +#define VTNET_CORE_LOCK_ASSERT(_sc) \ + mtx_assert(VTNET_CORE_MTX((_sc)), MA_OWNED) +#define VTNET_CORE_LOCK_ASSERT_NOTOWNED(_sc) \ + mtx_assert(VTNET_CORE_MTX((_sc)), MA_NOTOWNED) -#define VTNET_LOCK_INIT(_sc) do { \ +#define VTNET_CORE_LOCK_INIT(_sc) do { \ snprintf((_sc)->vtnet_mtx_name, sizeof((_sc)->vtnet_mtx_name), \ "%s", device_get_nameunit((_sc)->vtnet_dev)); \ - mtx_init(VTNET_MTX((_sc)), (_sc)->vtnet_mtx_name, \ + mtx_init(VTNET_CORE_MTX((_sc)), (_sc)->vtnet_mtx_name, \ "VTNET Core Lock", MTX_DEF); \ } while (0) diff --git a/sys/dev/virtio/network/virtio_net.h b/sys/dev/virtio/network/virtio_net.h index 15a73cc..f4f9feb 100644 --- a/sys/dev/virtio/network/virtio_net.h +++ b/sys/dev/virtio/network/virtio_net.h @@ -50,14 +50,22 @@ #define VIRTIO_NET_F_CTRL_RX 0x40000 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 0x80000 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE 0x200000 /* Announce device on network */ +#define VIRTIO_NET_F_MQ 0x400000 /* Device supports RFS */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 0x800000 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ struct virtio_net_config { /* The 
config defining mac address (if VIRTIO_NET_F_MAC) */ - uint8_t mac[ETHER_ADDR_LEN]; + uint8_t mac[ETHER_ADDR_LEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ uint16_t status; + /* Maximum number of each of transmit and receive queues; + * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ. + * Legal values are between 1 and 0x8000. + */ + uint16_t max_virtqueue_pairs; } __packed; /* @@ -66,6 +74,7 @@ struct virtio_net_config { */ struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start,csum_offset*/ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ uint8_t flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ @@ -100,8 +109,6 @@ struct virtio_net_ctrl_hdr { uint8_t cmd; } __packed; -typedef uint8_t virtio_net_ctrl_ack; - #define VIRTIO_NET_OK 0 #define VIRTIO_NET_ERR 1 @@ -134,6 +141,10 @@ typedef uint8_t virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6-byte MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { uint32_t entries; @@ -142,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC 1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 +#define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering @@ -156,4 +168,35 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_VLAN_ADD 0 #define VIRTIO_NET_CTRL_VLAN_DEL 1 +/* + * Control link announce acknowledgement + * + * The command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that + * driver has received the notification; device would clear the + * VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives + * this command. 
+ */ +#define VIRTIO_NET_CTRL_ANNOUNCE 3 +#define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 + +/* + * Control Receive Flow Steering + * + * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET enables Receive Flow + * Steering, specifying the number of the transmit and receive queues + * that will be used. After the command is consumed and acked by the + * device, the device will not steer new packets on receive virtqueues + * other than specified nor read from transmit virtqueues other than + * specified. Accordingly, driver should not transmit new packets on + * virtqueues other than specified. + */ +struct virtio_net_ctrl_mq { + uint16_t virtqueue_pairs; +} __packed; + +#define VIRTIO_NET_CTRL_MQ 4 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 +#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 + #endif /* _VIRTIO_NET_H */ diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c index b3df3d9..dcd82ec 100644 --- a/sys/dev/virtio/pci/virtio_pci.c +++ b/sys/dev/virtio/pci/virtio_pci.c @@ -757,8 +757,10 @@ vtpci_probe_and_attach_child(struct vtpci_softc *sc) vtpci_release_child_resources(sc); /* Reset status for future attempt. 
*/ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); - } else + } else { vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); + VIRTIO_ATTACH_COMPLETED(child); + } } static int diff --git a/sys/dev/virtio/virtio_if.m b/sys/dev/virtio/virtio_if.m index 9a99d37..521f4b8 100644 --- a/sys/dev/virtio/virtio_if.m +++ b/sys/dev/virtio/virtio_if.m @@ -31,6 +31,18 @@ INTERFACE virtio; CODE { static int + virtio_default_attach_completed(device_t dev) + { + return (0); + } +}; + +METHOD int attach_completed { + device_t dev; +} DEFAULT virtio_default_attach_completed; + +CODE { + static int virtio_default_config_change(device_t dev) { return (0); diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c index a82426e..beff14c 100644 --- a/sys/dev/virtio/virtqueue.c +++ b/sys/dev/virtio/virtqueue.c @@ -127,7 +127,7 @@ static uint16_t vq_ring_enqueue_segments(struct virtqueue *, static int vq_ring_use_indirect(struct virtqueue *, int); static void vq_ring_enqueue_indirect(struct virtqueue *, void *, struct sglist *, int, int); -static int vq_ring_enable_interrupt(struct virtqueue *, uint16_t); +static int vq_ring_enable_interrupt(struct virtqueue *, uint16_t); static int vq_ring_must_notify_host(struct virtqueue *); static void vq_ring_notify_host(struct virtqueue *); static void vq_ring_free_chain(struct virtqueue *, uint16_t); @@ -440,28 +440,38 @@ virtqueue_enable_intr(struct virtqueue *vq) } int -virtqueue_postpone_intr(struct virtqueue *vq) +virtqueue_postpone_intr(struct virtqueue *vq, vq_postpone_t hint) { uint16_t ndesc, avail_idx; - /* - * Request the next interrupt be postponed until at least half - * of the available descriptors have been consumed. 
- */ avail_idx = vq->vq_ring.avail->idx; - ndesc = (uint16_t)(avail_idx - vq->vq_used_cons_idx) / 2; + ndesc = (uint16_t)(avail_idx - vq->vq_used_cons_idx); + + switch (hint) { + case VQ_POSTPONE_SHORT: + ndesc /= 4; + break; + case VQ_POSTPONE_LONG: + ndesc = (ndesc * 3) / 4; + break; + case VQ_POSTPONE_EMPTIED: + break; + } return (vq_ring_enable_interrupt(vq, ndesc)); } +/* + * Note this is only considered a hint to the host. + */ void virtqueue_disable_intr(struct virtqueue *vq) { - /* - * Note this is only considered a hint to the host. - */ - if ((vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX) == 0) + if (vq->vq_flags & VIRTQUEUE_FLAG_EVENT_IDX) { + vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - + vq->vq_nentries - 1; + } else vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; } diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h index 128a10a..0d4ed94 100644 --- a/sys/dev/virtio/virtqueue.h +++ b/sys/dev/virtio/virtqueue.h @@ -41,6 +41,16 @@ struct sglist; /* Device callback for a virtqueue interrupt. */ typedef void virtqueue_intr_t(void *); +/* + * Hint on how long the next interrupt should be postponed. This is + * only used when the EVENT_IDX feature is negotiated. + */ +typedef enum { + VQ_POSTPONE_SHORT, + VQ_POSTPONE_LONG, + VQ_POSTPONE_EMPTIED /* Until all available desc are used. */ +} vq_postpone_t; + #define VIRTQUEUE_MAX_NAME_SZ 32 /* One for each virtqueue the device wishes to allocate. */ @@ -73,7 +83,7 @@ int virtqueue_reinit(struct virtqueue *vq, uint16_t size); int virtqueue_intr_filter(struct virtqueue *vq); void virtqueue_intr(struct virtqueue *vq); int virtqueue_enable_intr(struct virtqueue *vq); -int virtqueue_postpone_intr(struct virtqueue *vq); +int virtqueue_postpone_intr(struct virtqueue *vq, vq_postpone_t hint); void virtqueue_disable_intr(struct virtqueue *vq); /* Get physical address of the virtqueue ring. 
*/ diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 90994cc..ebb6935 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -2914,11 +2914,92 @@ pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) } /* - * This function is advisory. + * Apply the given advice to the specified range of addresses within the + * given pmap. Depending on the advice, clear the referenced and/or + * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { + pd_entry_t *pde, *pdpe; + pt_entry_t *pte; + vm_offset_t va, va_next; + vm_paddr_t pa; + vm_page_t m; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + pdpe = pmap_segmap(pmap, sva); +#ifdef __mips_n64 + if (*pdpe == 0) { + va_next = (sva + NBSEG) & ~SEGMASK; + if (va_next < sva) + va_next = eva; + continue; + } +#endif + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; + + pde = pmap_pdpe_to_pde(pdpe, sva); + if (*pde == NULL) + continue; + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being write protected. + */ + if (va_next > eva) + va_next = eva; + + va = va_next; + for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + if (!pte_test(pte, PTE_MANAGED | PTE_V)) { + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + continue; + } + pa = TLBLO_PTE_TO_PA(*pte); + m = PHYS_TO_VM_PAGE(pa); + m->md.pv_flags &= ~PV_TABLE_REF; + if (pte_test(pte, PTE_D)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. 
+ */ + vm_page_dirty(m); + } else { + pte_clear(pte, PTE_D); + if (va == va_next) + va = sva; + } + } else { + /* + * Unless PTE_D is set, any TLB entries + * mapping "sva" don't allow write access, so + * they needn't be invalidated. + */ + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + } + } + if (va != va_next) + pmap_invalidate_range(pmap, va, sva); + } + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); } /* diff --git a/sys/modules/virtio/network/Makefile b/sys/modules/virtio/network/Makefile index 8463309c..f124d99 100644 --- a/sys/modules/virtio/network/Makefile +++ b/sys/modules/virtio/network/Makefile @@ -23,14 +23,29 @@ # SUCH DAMAGE. # +.include <bsd.own.mk> + .PATH: ${.CURDIR}/../../../dev/virtio/network KMOD= if_vtnet SRCS= if_vtnet.c SRCS+= virtio_bus_if.h virtio_if.h SRCS+= bus_if.h device_if.h +SRCS+= opt_inet.h opt_inet6.h MFILES= kern/bus_if.m kern/device_if.m \ dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m +.if !defined(KERNBUILDDIR) +.if ${MK_INET_SUPPORT} != "no" +opt_inet.h: + @echo "#define INET 1" > ${.TARGET} +.endif + +.if ${MK_INET6_SUPPORT} != "no" +opt_inet6.h: + @echo "#define INET6 1" > ${.TARGET} +.endif +.endif + .include <bsd.kmod.mk> diff --git a/sys/powerpc/powermac/atibl.c b/sys/powerpc/powermac/atibl.c index fff76d0..f4ac9b0 100644 --- a/sys/powerpc/powermac/atibl.c +++ b/sys/powerpc/powermac/atibl.c @@ -86,6 +86,8 @@ DRIVER_MODULE(atibl, vgapci, atibl_driver, atibl_devclass, 0, 0); static void atibl_identify(driver_t *driver, device_t parent) { + if (OF_finddevice("mac-io/backlight") == -1) + return; if (device_find_child(parent, "backlight", -1) == NULL) device_add_child(parent, "backlight", -1); } diff --git a/sys/powerpc/powermac/nvbl.c b/sys/powerpc/powermac/nvbl.c index 1f89881..033f972 100644 --- a/sys/powerpc/powermac/nvbl.c +++ b/sys/powerpc/powermac/nvbl.c @@ -82,6 +82,8 @@ DRIVER_MODULE(nvbl, vgapci, nvbl_driver, nvbl_devclass, 0, 0); static void nvbl_identify(driver_t 
*driver, device_t parent) { + if (OF_finddevice("mac-io/backlight") == -1) + return; if (device_find_child(parent, "backlight", -1) == NULL) device_add_child(parent, "backlight", -1); } diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index 6627a07..e39654e 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -780,7 +780,7 @@ finished: while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); - keg_free_slab(keg, slab, 0); + keg_free_slab(keg, slab, keg->uk_ipers); } } |