diff options
Diffstat (limited to 'sys')
335 files changed, 13087 insertions, 18513 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index c3aac33..4455cab 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -174,6 +174,22 @@ IDTVEC(xen_intr_upcall) jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rdi + call hv_vector_handler + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index c81495a..83ca548 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -81,28 +81,11 @@ __FBSDID("$FreeBSD$"); #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) -/* lock region used by kernel profiling */ -int mcount_lock; - -int mp_naps; /* # of Applications processors */ -int boot_cpu_id = -1; /* designated BSP */ - -extern struct pcpu __pcpu[]; - -/* AP uses this during bootstrap. Do not staticize. */ -char *bootSTK; -int bootAP; - -/* Free these after use */ -void *bootstacks[MAXCPU]; +extern struct pcpu __pcpu[]; /* Temporary variables for init_secondary() */ char *doublefault_stack; char *nmi_stack; -void *dpcpu; - -struct pcb stoppcbs[MAXCPU]; -struct susppcb **susppcbs; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr2; @@ -112,309 +95,16 @@ uint64_t pcid_cr3; pmap_t smp_tlb_pmap; extern int invpcid_works; -#ifdef COUNT_IPIS -/* Interrupt counts. */ -static u_long *ipi_preempt_counts[MAXCPU]; -static u_long *ipi_ast_counts[MAXCPU]; -u_long *ipi_invltlb_counts[MAXCPU]; -u_long *ipi_invlrng_counts[MAXCPU]; -u_long *ipi_invlpg_counts[MAXCPU]; -u_long *ipi_invlcache_counts[MAXCPU]; -u_long *ipi_rendezvous_counts[MAXCPU]; -static u_long *ipi_hardclock_counts[MAXCPU]; -#endif - -/* Default cpu_ops implementation. */ -struct cpu_ops cpu_ops; - extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); -extern int pmap_pcid_enabled; - /* * Local data and functions. */ -static volatile cpuset_t ipi_nmi_pending; - -/* used to hold the AP's until we are ready to release them */ -struct mtx ap_boot_mtx; - -/* Set to 1 once we're ready to let the APs out of the pen. */ -static volatile int aps_ready = 0; - -/* - * Store data from cpu_add() until later in the boot when we actually setup - * the APs. - */ -struct cpu_info { - int cpu_present:1; - int cpu_bsp:1; - int cpu_disabled:1; - int cpu_hyperthread:1; -} static cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; -int apic_cpuids[MAX_APIC_ID + 1]; - -/* Holds pending bitmap based IPIs per CPU */ -volatile u_int cpu_ipi_pending[MAXCPU]; - -static u_int boot_address; -static int cpu_logical; /* logical cpus per core */ -static int cpu_cores; /* cores per package */ - -static void assign_cpu_ids(void); -static void set_interrupt_apic_ids(void); static int start_ap(int apic_id); -static void release_aps(void *dummy); -static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ -static int hyperthreading_allowed = 1; static u_int bootMP_size; - -static void -mem_range_AP_init(void) -{ - if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) - mem_range_softc.mr_op->initAP(&mem_range_softc); -} - -static void -topo_probe_amd(void) -{ - int core_id_bits; - int id; - - /* AMD processors do not support HTT. */ - cpu_logical = 1; - - if ((amd_feature2 & AMDID2_CMP) == 0) { - cpu_cores = 1; - return; - } - - core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> - AMDID_COREID_SIZE_SHIFT; - if (core_id_bits == 0) { - cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; - return; - } - - /* Fam 10h and newer should get here. */ - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) - continue; - cpu_cores++; - } -} - -/* - * Round up to the next power of two, if necessary, and then - * take log2. - * Returns -1 if argument is zero. - */ -static __inline int -mask_width(u_int x) -{ - - return (fls(x << (1 - powerof2(x))) - 1); -} - -static void -topo_probe_0x4(void) -{ - u_int p[4]; - int pkg_id_bits; - int core_id_bits; - int max_cores; - int max_logical; - int id; - - /* Both zero and one here mean one logical processor per package. */ - max_logical = (cpu_feature & CPUID_HTT) != 0 ? - (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; - if (max_logical <= 1) - return; - - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. Further, we count number of - * logical processors that belong to the same core - * as BSP thus deducing number of threads per core. - */ - if (cpu_high >= 0x4) { - cpuid_count(0x04, 0, p); - max_cores = ((p[0] >> 26) & 0x3f) + 1; - } else - max_cores = 1; - core_id_bits = mask_width(max_logical/max_cores); - if (core_id_bits < 0) - return; - pkg_id_bits = core_id_bits + mask_width(max_cores); - - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) - continue; - cpu_cores++; - /* Check if logical CPU has the same package and core IDs. */ - if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) - cpu_logical++; - } - - KASSERT(cpu_cores >= 1 && cpu_logical >= 1, - ("topo_probe_0x4 couldn't find BSP")); - - cpu_cores /= cpu_logical; - hyperthreading_cpus = cpu_logical; -} - -static void -topo_probe_0xb(void) -{ - u_int p[4]; - int bits; - int cnt; - int i; - int logical; - int type; - int x; - - /* We only support three levels for now. */ - for (i = 0; i < 3; i++) { - cpuid_count(0x0b, i, p); - - /* Fall back if CPU leaf 11 doesn't really exist. */ - if (i == 0 && p[1] == 0) { - topo_probe_0x4(); - return; - } - - bits = p[0] & 0x1f; - logical = p[1] &= 0xffff; - type = (p[2] >> 8) & 0xff; - if (type == 0 || logical == 0) - break; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. - */ - for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || - cpu_info[x].cpu_disabled) - continue; - if (x >> bits == boot_cpu_id >> bits) - cnt++; - } - if (type == CPUID_TYPE_SMT) - cpu_logical = cnt; - else if (type == CPUID_TYPE_CORE) - cpu_cores = cnt; - } - if (cpu_logical == 0) - cpu_logical = 1; - cpu_cores /= cpu_logical; -} - -/* - * Both topology discovery code and code that consumes topology - * information assume top-down uniformity of the topology. - * That is, all physical packages must be identical and each - * core in a package must have the same number of threads. - * Topology information is queried only on BSP, on which this - * code runs and for which it can query CPUID information. - * Then topology is extrapolated on all packages using the - * uniformity assumption. - */ -static void -topo_probe(void) -{ - static int cpu_topo_probed = 0; - - if (cpu_topo_probed) - return; - - CPU_ZERO(&logical_cpus_mask); - if (mp_ncpus <= 1) - cpu_cores = cpu_logical = 1; - else if (cpu_vendor_id == CPU_VENDOR_AMD) - topo_probe_amd(); - else if (cpu_vendor_id == CPU_VENDOR_INTEL) { - /* - * See Intel(R) 64 Architecture Processor - * Topology Enumeration article for details. - * - * Note that 0x1 <= cpu_high < 4 case should be - * compatible with topo_probe_0x4() logic when - * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) - * or it should trigger the fallback otherwise. - */ - if (cpu_high >= 0xb) - topo_probe_0xb(); - else if (cpu_high >= 0x1) - topo_probe_0x4(); - } - - /* - * Fallback: assume each logical CPU is in separate - * physical package. That is, no multi-core, no SMT. - */ - if (cpu_cores == 0 || cpu_logical == 0) - cpu_cores = cpu_logical = 1; - cpu_topo_probed = 1; -} - -struct cpu_group * -cpu_topo(void) -{ - int cg_flags; - - /* - * Determine whether any threading flags are - * necessry. - */ - topo_probe(); - if (cpu_logical > 1 && hyperthreading_cpus) - cg_flags = CG_FLAG_HTT; - else if (cpu_logical > 1) - cg_flags = CG_FLAG_SMT; - else - cg_flags = 0; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); - } - /* - * No multi-core or hyper-threaded. - */ - if (cpu_logical * cpu_cores == 1) - return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_L2, cpu_cores, - CG_SHARE_L1, cpu_logical, cg_flags)); -} +static u_int boot_address; /* * Calculate usable address in base memory for AP trampoline code. @@ -433,85 +123,6 @@ mp_bootaddress(u_int basemem) return mptramp_pagetables; } -void -cpu_add(u_int apic_id, char boot_cpu) -{ - - if (apic_id > MAX_APIC_ID) { - panic("SMP: APIC ID %d too high", apic_id); - return; - } - KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", - apic_id)); - cpu_info[apic_id].cpu_present = 1; - if (boot_cpu) { - KASSERT(boot_cpu_id == -1, - ("CPU %d claims to be BSP, but CPU %d already is", apic_id, - boot_cpu_id)); - boot_cpu_id = apic_id; - cpu_info[apic_id].cpu_bsp = 1; - } - if (mp_ncpus < MAXCPU) { - mp_ncpus++; - mp_maxid = mp_ncpus - 1; - } - if (bootverbose) - printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : - "AP"); -} - -void -cpu_mp_setmaxid(void) -{ - - /* - * mp_maxid should be already set by calls to cpu_add(). - * Just sanity check its value here. - */ - if (mp_ncpus == 0) - KASSERT(mp_maxid == 0, - ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); - else if (mp_ncpus == 1) - mp_maxid = 0; - else - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, - mp_maxid, mp_ncpus)); -} - -int -cpu_mp_probe(void) -{ - - /* - * Always record BSP in CPU map so that the mbuf init code works - * correctly. - */ - CPU_SETOF(0, &all_cpus); - if (mp_ncpus == 0) { - /* - * No CPUs were found, so this must be a UP system. Setup - * the variables to represent a system with a single CPU - * with an id of 0. - */ - mp_ncpus = 1; - return (0); - } - - /* At least one CPU was found. */ - if (mp_ncpus == 1) { - /* - * One CPU was found, so this must be a UP system with - * an I/O APIC. - */ - mp_maxid = 0; - return (0); - } - - /* At least two CPUs were found. */ - return (1); -} - /* * Initialize the IPI handlers and start up the AP's. */ @@ -575,47 +186,6 @@ cpu_mp_start(void) /* - * Print various information about the SMP system hardware and setup. - */ -void -cpu_mp_announce(void) -{ - const char *hyperthread; - int i; - - printf("FreeBSD/SMP: %d package(s) x %d core(s)", - mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); - if (hyperthreading_cpus > 1) - printf(" x %d HTT threads", cpu_logical); - else if (cpu_logical > 1) - printf(" x %d SMT threads", cpu_logical); - printf("\n"); - - /* List active CPUs first. */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1; i < mp_ncpus; i++) { - if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, - cpu_apic_ids[i]); - } - - /* List disabled CPUs last. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) - continue; - if (cpu_info[i].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, - i); - } -} - -/* * AP CPU's call this to initialize themselves. */ void @@ -624,7 +194,6 @@ init_secondary(void) struct pcpu *pc; struct nmi_pcpu *np; u_int64_t msr, cr0; - u_int cpuid; int cpu, gsel_tss, x; struct region_descriptor ap_gdt; @@ -712,94 +281,7 @@ init_secondary(void) while (!aps_ready) ia32_pause(); - /* - * On real hardware, switch to x2apic mode if possible. Do it - * after aps_ready was signalled, to avoid manipulating the - * mode while BSP might still want to send some IPI to us - * (second startup IPI is ignored on modern hardware etc). - */ - lapic_xapic_mode(); - - /* Initialize the PAT MSR. */ - pmap_init_pat(); - - /* set up CPU registers and state */ - cpu_setregs(); - - /* set up SSE/NX */ - initializecpu(); - - /* set up FPU state on the AP */ - fpuinit(); - - if (cpu_ops.cpu_init) - cpu_ops.cpu_init(); - - /* A quick check from sanity claus */ - cpuid = PCPU_GET(cpuid); - if (PCPU_GET(apic_id) != lapic_id()) { - printf("SMP: cpuid = %d\n", cpuid); - printf("SMP: actual apic_id = %d\n", lapic_id()); - printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - panic("cpuid mismatch! boom!!"); - } - - /* Initialize curthread. */ - KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); - PCPU_SET(curthread, PCPU_GET(idlethread)); - - mca_init(); - - mtx_lock_spin(&ap_boot_mtx); - - /* Init local apic for irq's */ - lapic_setup(1); - - /* Set memory range attributes for this CPU to match the BSP */ - mem_range_AP_init(); - - smp_cpus++; - - CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); - printf("SMP: AP CPU #%d Launched!\n", cpuid); - - /* Determine if we are a logical CPU. */ - /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ - if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) - CPU_SET(cpuid, &logical_cpus_mask); - - if (bootverbose) - lapic_dump("AP"); - - if (smp_cpus == mp_ncpus) { - /* enable IPI's, tlb shootdown, freezes etc */ - atomic_store_rel_int(&smp_started, 1); - } - - /* - * Enable global pages TLB extension - * This also implicitly flushes the TLB - */ - - load_cr4(rcr4() | CR4_PGE); - if (pmap_pcid_enabled) - load_cr4(rcr4() | CR4_PCIDE); - load_ds(_udatasel); - load_es(_udatasel); - load_fs(_ufssel); - mtx_unlock_spin(&ap_boot_mtx); - - /* Wait until all the AP's are up. */ - while (smp_started == 0) - ia32_pause(); - - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); - - sched_throw(NULL); - - panic("scheduler returned us to %s", __func__); - /* NOTREACHED */ + init_secondary_tail(); } /******************************************************************* @@ -807,108 +289,6 @@ init_secondary(void) */ /* - * We tell the I/O APIC code about all the CPUs we want to receive - * interrupts. If we don't want certain CPUs to receive IRQs we - * can simply not tell the I/O APIC code about them in this function. - * We also do not tell it about the BSP since it tells itself about - * the BSP internally to work with UP kernels and on UP machines. - */ -static void -set_interrupt_apic_ids(void) -{ - u_int i, apic_id; - - for (i = 0; i < MAXCPU; i++) { - apic_id = cpu_apic_ids[i]; - if (apic_id == -1) - continue; - if (cpu_info[apic_id].cpu_bsp) - continue; - if (cpu_info[apic_id].cpu_disabled) - continue; - - /* Don't let hyperthreads service interrupts. */ - if (cpu_logical > 1 && - apic_id % cpu_logical != 0) - continue; - - intr_add_cpu(i); - } -} - -/* - * Assign logical CPU IDs to local APICs. - */ -static void -assign_cpu_ids(void) -{ - u_int i; - - TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", - &hyperthreading_allowed); - - /* Check for explicitly disabled CPUs. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { - cpu_info[i].cpu_hyperthread = 1; - - /* - * Don't use HT CPU if it has been disabled by a - * tunable. - */ - if (hyperthreading_allowed == 0) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { - hyperthreading_cpus = 0; - cpu_logical = 1; - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 is always assigned to the BSP. - * - * To minimize confusion for userland, we attempt to number - * CPUs such that all threads and cores in a package are - * grouped together. For now we assume that the BSP is always - * the first thread in a package and just start adding APs - * starting with the BSP's APIC ID. - */ - mp_ncpus = 1; - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - for (i = boot_cpu_id + 1; i != boot_cpu_id; - i == MAX_APIC_ID ? i = 0 : i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} - -/* * start each AP in our list */ int @@ -1026,129 +406,6 @@ start_ap(int apic_id) return 0; /* return FAILURE */ } -#ifdef COUNT_XINVLTLB_HITS -u_int xhits_gbl[MAXCPU]; -u_int xhits_pg[MAXCPU]; -u_int xhits_rng[MAXCPU]; -static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, - sizeof(xhits_gbl), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, - sizeof(xhits_pg), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, - sizeof(xhits_rng), "IU", ""); - -u_int ipi_global; -u_int ipi_page; -u_int ipi_range; -u_int ipi_range_size; -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, - &ipi_range_size, 0, ""); - -u_int ipi_masked_global; -u_int ipi_masked_page; -u_int ipi_masked_range; -u_int ipi_masked_range_size; -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, - &ipi_masked_global, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, - &ipi_masked_page, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, - &ipi_masked_range, 0, ""); -SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, - &ipi_masked_range_size, 0, ""); -#endif /* COUNT_XINVLTLB_HITS */ - -/* - * Init and startup IPI. - */ -void -ipi_startup(int apic_id, int vector) -{ - - /* - * This attempts to follow the algorithm described in the - * Intel Multiprocessor Specification v1.4 in section B.4. - * For each IPI, we allow the local APIC ~20us to deliver the - * IPI. If that times out, we panic. - */ - - /* - * first we do an INIT IPI: this INIT IPI might be run, resetting - * and running the target CPU. OR this INIT IPI might be latched (P5 - * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be - * ignored. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); - lapic_ipi_wait(100); - - /* Explicitly deassert the INIT IPI. */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, - apic_id); - - DELAY(10000); /* wait ~10mS */ - - /* - * next we do a STARTUP IPI: the previous INIT IPI might still be - * latched, (P5 bug) this 1st STARTUP would then terminate - * immediately, and the previously started INIT IPI would continue. OR - * the previous INIT IPI has already run. and this STARTUP IPI will - * run. OR the previous INIT IPI was ignored. and this STARTUP IPI - * will run. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver first STARTUP IPI to APIC %d", - apic_id); - DELAY(200); /* wait ~200uS */ - - /* - * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF - * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR - * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is - * recognized after hardware RESET or INIT IPI. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver second STARTUP IPI to APIC %d", - apic_id); - - DELAY(200); /* wait ~200uS */ -} - -/* - * Send an IPI to specified CPU handling the bitmap logic. - */ -static void -ipi_send_cpu(int cpu, u_int ipi) -{ - u_int bitmap, old_pending, new_pending; - - KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); - - if (IPI_IS_BITMAPED(ipi)) { - bitmap = 1 << ipi; - ipi = IPI_BITMAP_VECTOR; - do { - old_pending = cpu_ipi_pending[cpu]; - new_pending = old_pending | bitmap; - } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], - old_pending, new_pending)); - if (old_pending) - return; - } - lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); -} - /* * Flush the TLB on all other CPU's */ @@ -1228,26 +485,6 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, } void -smp_cache_flush(void) -{ - - if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); -} - -void -smp_invltlb(pmap_t pmap) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); -#ifdef COUNT_XINVLTLB_HITS - ipi_global++; -#endif - } -} - -void smp_invlpg(pmap_t pmap, vm_offset_t addr) { @@ -1312,210 +549,23 @@ smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, } void -ipi_bitmap_handler(struct trapframe frame) -{ - struct trapframe *oldframe; - struct thread *td; - int cpu = PCPU_GET(cpuid); - u_int ipi_bitmap; - - critical_enter(); - td = curthread; - td->td_intr_nesting_level++; - oldframe = td->td_intr_frame; - td->td_intr_frame = &frame; - ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); - if (ipi_bitmap & (1 << IPI_PREEMPT)) { -#ifdef COUNT_IPIS - (*ipi_preempt_counts[cpu])++; -#endif - sched_preempt(td); - } - if (ipi_bitmap & (1 << IPI_AST)) { -#ifdef COUNT_IPIS - (*ipi_ast_counts[cpu])++; -#endif - /* Nothing to do for AST */ - } - if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { -#ifdef COUNT_IPIS - (*ipi_hardclock_counts[cpu])++; -#endif - hardclockintr(); - } - td->td_intr_frame = oldframe; - td->td_intr_nesting_level--; - critical_exit(); -} - -/* - * send an IPI to a set of cpus. - */ -void -ipi_selected(cpuset_t cpus, u_int ipi) -{ - int cpu; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); - - while ((cpu = CPU_FFS(&cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &cpus); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); - } -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_cpu(int cpu, u_int ipi) -{ - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); - - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); -} - -/* - * send an IPI to all CPUs EXCEPT myself - */ -void -ipi_all_but_self(u_int ipi) +smp_cache_flush(void) { - cpuset_t other_cpus; - - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - if (IPI_IS_BITMAPED(ipi)) { - ipi_selected(other_cpus, ipi); - return; - } - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); - - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); + if (smp_started) + smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); } -int -ipi_nmi_handler() -{ - u_int cpuid; - - /* - * As long as there is not a simple way to know about a NMI's - * source, if the bitmask for the current CPU is present in - * the global pending bitword an IPI_STOP_HARD has been issued - * and should be handled. - */ - cpuid = PCPU_GET(cpuid); - if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) - return (1); - - CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); - cpustop_handler(); - return (0); -} - -/* - * Handle an IPI_STOP by saving our current context and spinning until we - * are resumed. - */ void -cpustop_handler(void) -{ - u_int cpu; - - cpu = PCPU_GET(cpuid); - - savectx(&stoppcbs[cpu]); - - /* Indicate that we are stopped */ - CPU_SET_ATOMIC(cpu, &stopped_cpus); - - /* Wait for restart */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - CPU_CLR_ATOMIC(cpu, &stopped_cpus); +smp_invltlb(pmap_t pmap) +{ -#ifdef DDB - amd64_db_resume_dbreg(); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; #endif - - if (cpu == 0 && cpustop_restartfunc != NULL) { - cpustop_restartfunc(); - cpustop_restartfunc = NULL; - } -} - -/* - * Handle an IPI_SUSPEND by saving our current context and spinning until we - * are resumed. - */ -void -cpususpend_handler(void) -{ - u_int cpu; - - mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); - - cpu = PCPU_GET(cpuid); - if (savectx(&susppcbs[cpu]->sp_pcb)) { - fpususpend(susppcbs[cpu]->sp_fpususpend); - wbinvd(); - CPU_SET_ATOMIC(cpu, &suspended_cpus); - } else { - fpuresume(susppcbs[cpu]->sp_fpususpend); - pmap_init_pat(); - initializecpu(); - PCPU_SET(switchtime, 0); - PCPU_SET(switchticks, ticks); - - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); } - - /* Wait for resume */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - if (cpu_ops.cpu_resume) - cpu_ops.cpu_resume(); - if (vmm_resume_p) - vmm_resume_p(); - - /* Resume MCA and local APIC */ - lapic_xapic_mode(); - mca_resume(); - lapic_setup(0); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* @@ -1678,63 +728,3 @@ invlrng_handler(void) atomic_add_int(&smp_tlb_wait, 1); } - -void -invlcache_handler(void) -{ -#ifdef COUNT_IPIS - (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); -} - -/* - * This is called once the rest of the system is up and running and we're - * ready to let the AP's out of the pen. - */ -static void -release_aps(void *dummy __unused) -{ - - if (mp_ncpus == 1) - return; - atomic_store_rel_int(&aps_ready, 1); - while (smp_started == 0) - ia32_pause(); -} -SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); - -#ifdef COUNT_IPIS -/* - * Setup interrupt counters for IPI handlers. - */ -static void -mp_ipi_intrcnt(void *dummy) -{ - char buf[64]; - int i; - - CPU_FOREACH(i) { - snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); - intrcnt_add(buf, &ipi_invltlb_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); - intrcnt_add(buf, &ipi_invlrng_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); - intrcnt_add(buf, &ipi_invlpg_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); - intrcnt_add(buf, &ipi_invlcache_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:preempt", i); - intrcnt_add(buf, &ipi_preempt_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:ast", i); - intrcnt_add(buf, &ipi_ast_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); - intrcnt_add(buf, &ipi_rendezvous_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); - intrcnt_add(buf, &ipi_hardclock_counts[i]); - } -} -SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); -#endif - diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index bdaca33..c24dd5a 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -340,7 +340,9 @@ device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enchancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 9b697f0..e0fe465 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -494,6 +494,8 @@ device virtio_balloon # VirtIO Memory Balloon device device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Microsoft Hyper-V enchancement support +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 3a4b6b3..034a693 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -35,6 +35,39 @@ extern int mp_naps; extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; +extern int bootAP; +extern void *dpcpu; +extern char *bootSTK; +extern int bootAP; +extern void *bootstacks[]; +extern volatile u_int cpu_ipi_pending[]; +extern volatile int aps_ready; +extern struct mtx ap_boot_mtx; +extern int cpu_logical; +extern int cpu_cores; +extern int pmap_pcid_enabled; +extern u_int xhits_gbl[]; +extern u_int xhits_pg[]; +extern u_int xhits_rng[]; +extern u_int ipi_global; +extern u_int ipi_page; +extern u_int ipi_range; +extern u_int ipi_range_size; +extern u_int ipi_masked_global; +extern u_int ipi_masked_page; +extern u_int ipi_masked_range; +extern u_int ipi_masked_range_size; + +extern volatile int smp_tlb_wait; + +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; + int cpu_hyperthread:1; +}; +extern struct cpu_info cpu_info[]; + #ifdef COUNT_IPIS extern u_long *ipi_invltlb_counts[MAXCPU]; extern u_long *ipi_invlrng_counts[MAXCPU]; @@ -60,9 +93,11 @@ inthand_t struct pmap; /* functions in mp_machdep.c */ +void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); +void init_secondary_tail(void); void invltlb_handler(void); void invltlb_pcid_handler(void); void invlpg_handler(void); @@ -77,6 +112,7 @@ void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); +void set_interrupt_apic_ids(void); void smp_cache_flush(void); void smp_invlpg(struct pmap *pmap, vm_offset_t addr); void smp_masked_invlpg(cpuset_t mask, struct pmap *pmap, vm_offset_t addr); @@ -87,6 +123,9 @@ void smp_masked_invlpg_range(cpuset_t mask, struct pmap *pmap, void smp_invltlb(struct pmap *pmap); void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap); int native_start_all_aps(void); +void mem_range_AP_init(void); +void topo_probe(void); +void ipi_send_cpu(int cpu, u_int ipi); #endif /* !LOCORE */ #endif /* SMP */ diff --git a/sys/amd64/include/vm.h b/sys/amd64/include/vm.h index 6573e37..22d2eca 100644 --- a/sys/amd64/include/vm.h +++ b/sys/amd64/include/vm.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 52294bd..7c617be 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -204,13 +204,12 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -cpuset_t vm_active_cpus(struct vm *vm); -cpuset_t vm_suspended_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +#ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will @@ -228,6 +227,9 @@ void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(void *rendezvous_cookie) diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h index d03d4f6..d8a6b5c 100644 --- a/sys/amd64/include/xen/xenfunc.h +++ b/sys/amd64/include/xen/xenfunc.h @@ -29,12 +29,7 @@ #ifndef _XEN_XENFUNC_H_ #define _XEN_XENFUNC_H_ -#ifdef XENHVM #include <machine/xen/xenvar.h> -#else -#include <machine/xen/xenpmap.h> -#include <machine/segments.h> -#endif #define BKPT __asm__("int3"); #define XPQ_CALL_DEPTH 5 @@ -64,10 +59,6 @@ void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line); #define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0) #endif -#ifndef XENHVM -void xen_update_descriptor(union descriptor *, union descriptor *); -#endif - extern struct mtx balloon_lock; #if 0 #define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags) diff --git a/sys/amd64/include/xen/xenpmap.h b/sys/amd64/include/xen/xenpmap.h deleted file mode 100644 index d768dad..0000000 --- a/sys/amd64/include/xen/xenpmap.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004,2005 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - -#ifndef _XEN_XENPMAP_H_ -#define _XEN_XENPMAP_H_ - -#include <machine/xen/features.h> - -void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); -void xen_pt_switch(vm_paddr_t); -void xen_set_ldt(vm_paddr_t, unsigned long); -void xen_pgdpt_pin(vm_paddr_t); -void xen_pgd_pin(vm_paddr_t); -void xen_pgd_unpin(vm_paddr_t); -void xen_pt_pin(vm_paddr_t); -void xen_pt_unpin(vm_paddr_t); -void xen_flush_queue(void); -void xen_check_queue(void); -#if 0 -void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); -#endif - -#ifdef INVARIANTS -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) -#else -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) -#endif - -#ifdef PMAP_DEBUG -#define PMAP_REF pmap_ref -#define PMAP_DEC_REF_PAGE pmap_dec_ref_page -#define PMAP_MARK_PRIV pmap_mark_privileged -#define PMAP_MARK_UNPRIV pmap_mark_unprivileged -#else -#define PMAP_MARK_PRIV(a) -#define PMAP_MARK_UNPRIV(a) -#define PMAP_REF(a, b) -#define PMAP_DEC_REF_PAGE(a) -#endif - -#define ALWAYS_SYNC 0 - -#ifdef PT_DEBUG -#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) -#else -#define PT_LOG() -#endif - -#define INVALID_P2M_ENTRY (~0UL) - -#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ - -#define SH_PD_SET_VA 1 -#define SH_PD_SET_VA_MA 2 -#define SH_PD_SET_VA_CLEAR 3 - -struct pmap; -void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); -#ifdef notyet -static vm_paddr_t -vptetomachpte(vm_paddr_t *pte) -{ - vm_offset_t offset, ppte; - vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; - int pgindex; - - ppte = (vm_offset_t)pte; - pgoffset = (ppte & PAGE_MASK); - offset = ppte - (vm_offset_t)PTmap; - pgindex = ppte >> PDRSHIFT; - - pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); - retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; - return (retval); -} -#endif -#define PT_GET(_ptp) \ - (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : (0)) - -#ifdef WRITABLE_PAGETABLES - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - PT_LOG(); \ - *(_ptp) = xpmap_ptom((_npte)); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - PT_LOG(); \ - *(_ptp) = (_npte); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - PT_LOG(); \ - *(_ptp) = 0; \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#else /* !WRITABLE_PAGETABLES */ - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - xen_queue_pt_update(vtomach(_ptp), \ - xpmap_ptom(_npte)); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - xen_queue_pt_update(vtomach(_ptp), _npte); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - xen_queue_pt_update(vtomach(_ptp), 0); \ - if (sync || ALWAYS_SYNC) \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#endif - -#define PT_SET_MA(_va, _ma) \ -do { \ - PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ - (_ma), \ - UVMF_INVLPG| UVMF_ALL) < 0); \ -} while (/*CONSTCOND*/0) - -#define PT_UPDATES_FLUSH() do { \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -static __inline vm_paddr_t -xpmap_mtop(vm_paddr_t mpa) -{ - vm_paddr_t tmp = (mpa & PG_FRAME); - - return machtophys(tmp) | (mpa & ~PG_FRAME); -} - -static __inline vm_paddr_t -xpmap_ptom(vm_paddr_t ppa) -{ - vm_paddr_t tmp = (ppa & PG_FRAME); - - return phystomach(tmp) | (ppa & ~PG_FRAME); -} - -static __inline void -set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ -#ifdef notyet - PANIC_IF(max_mapnr && pfn >= max_mapnr); -#endif - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef notyet - PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); -#endif - return; - } - xen_phys_machine[pfn] = mfn; -} - - - - -#endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h index d9dbc5d..110a351 100644 --- a/sys/amd64/include/xen/xenvar.h +++ b/sys/amd64/include/xen/xenvar.h @@ -48,68 +48,7 @@ if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__ #define TRACE_DEBUG(argflags, _f, _a...) #endif -#ifdef XENHVM - -static inline vm_paddr_t -phystomach(vm_paddr_t pa) -{ - - return (pa); -} - -static inline vm_paddr_t -machtophys(vm_paddr_t ma) -{ - - return (ma); -} - #define vtomach(va) pmap_kextract((vm_offset_t) (va)) -#define PFNTOMFN(pa) (pa) -#define MFNTOPFN(ma) (ma) - -#define set_phys_to_machine(pfn, mfn) ((void)0) -#define phys_to_machine_mapping_valid(pfn) (TRUE) -#define PT_UPDATES_FLUSH() ((void)0) - -#else - -extern xen_pfn_t *xen_phys_machine; - - -extern xen_pfn_t *xen_machine_phys; -/* Xen starts physical pages after the 4MB ISA hole - - * FreeBSD doesn't - */ - - -#undef ADD_ISA_HOLE /* XXX */ - -#ifdef ADD_ISA_HOLE -#define ISA_INDEX_OFFSET 1024 -#define ISA_PDR_OFFSET 1 -#else -#define ISA_INDEX_OFFSET 0 -#define ISA_PDR_OFFSET 0 -#endif - - -#define PFNTOMFN(i) (xen_phys_machine[(i)]) -#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) - -#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) -#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) - -#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) -#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) - -#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) - -#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) -#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) - -#endif void xpq_init(void); diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index acb3a3d..3157e21 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> -#include <sys/smp.h> #include <machine/vmm.h> #include "io/iommu.h" diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index f505ea1..7cc13ca 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -802,6 +802,7 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c index 100af4b..d3a6fe8 100644 --- a/sys/amd64/vmm/amd/svm_msr.c +++ b/sys/amd64/vmm/amd/svm_msr.c @@ -27,12 +27,17 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <sys/types.h> +#include <sys/param.h> #include <sys/errno.h> +#include <sys/systm.h> #include <machine/cpufunc.h> #include <machine/specialreg.h> +#include <machine/vmm.h> +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" #include "svm_msr.h" #ifndef MSR_AMDK8_IPM @@ -105,6 +110,14 @@ svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, int error = 0; switch (num) { + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + *result = 0; + break; case MSR_AMDK8_IPM: *result = 0; break; @@ -122,6 +135,15 @@ svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) int error = 0; switch (num) { + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ case MSR_AMDK8_IPM: /* * Ignore writes to the "Interrupt Pending Message" MSR. diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c index fb4b2c8..d860169 100644 --- a/sys/amd64/vmm/amd/vmcb.c +++ b/sys/amd64/vmm/amd/vmcb.c @@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <machine/segments.h> #include <machine/specialreg.h> diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index e517778..526b0d1 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <machine/clock.h> #include <machine/cpufunc.h> @@ -396,6 +395,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) error = 0; switch (num) { + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; case MSR_IA32_MISC_ENABLE: *val = misc_enable; break; @@ -427,6 +433,14 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = 0; switch (num) { + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ case MSR_IA32_MISC_ENABLE: changed = val ^ misc_enable; /* diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index 0df6e7c..6e94f5b 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c index 842253d..173ef1f 100644 --- a/sys/amd64/vmm/io/vatpit.c +++ b/sys/amd64/vmm/io/vatpit.c @@ -31,7 +31,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index a4c96cd..1db1c51 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -36,7 +36,6 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <dev/acpica/acpi_hpet.h> diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 411887d..e6b8b5a 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/systm.h> diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 7097248..3451e1e 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -547,6 +547,8 @@ vlapic_update_ppr(struct vlapic *vlapic) VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + static void vlapic_process_eoi(struct vlapic *vlapic) { @@ -557,11 +559,7 @@ vlapic_process_eoi(struct vlapic *vlapic) isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { @@ -570,17 +568,21 @@ vlapic_process_eoi(struct vlapic *vlapic) vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { - vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int @@ -1092,11 +1094,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) irrptr = &lapic->irr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c index 09f763f..1e7bb93 100644 --- a/sys/amd64/vmm/io/vpmtmr.c +++ b/sys/amd64/vmm/io/vpmtmr.c @@ -29,7 +29,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/systm.h> diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index ab9cabb..18ebc4b 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/lock.h> @@ -63,9 +62,12 @@ struct rtcdev { uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; - uint8_t nvram[128 - 14]; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; @@ -139,20 +141,23 @@ update_enabled(struct vrtc *vrtc) } static time_t -vrtc_curtime(struct vrtc *vrtc) +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; - time_t t; + time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); - t += delta / SBT_1S; + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; } return (t); } @@ -245,6 +250,7 @@ secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); } static int @@ -274,7 +280,7 @@ rtc_to_secs(struct vrtc *vrtc) struct timespec ts; struct rtcdev *rtc; struct vm *vm; - int error, hour, pm, year; + int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); @@ -358,10 +364,14 @@ rtc_to_secs(struct vrtc *vrtc) VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } - if (year >= 70) - ct.year = 1900 + year; - else - ct.year = 2000 + year; + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { @@ -373,13 +383,19 @@ rtc_to_secs(struct vrtc *vrtc) } return (ts.tv_sec); /* success */ fail: - return (VRTC_BROKEN_TIME); /* failure */ + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); } static int -vrtc_time_update(struct vrtc *vrtc, time_t newtime) +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; + sbintime_t oldbase; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; @@ -391,16 +407,21 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime) alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; - VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx", + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); + vrtc->base_uptime = newbase; + if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt - * processing or update 'base_uptime' in this case. + * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; @@ -446,8 +467,6 @@ vrtc_time_update(struct vrtc *vrtc, time_t newtime) if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); - vrtc->base_uptime = sbinuptime(); - return (0); } @@ -518,7 +537,7 @@ static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; - sbintime_t freqsbt; + sbintime_t freqsbt, basetime; time_t rtctime; int error; @@ -540,8 +559,8 @@ vrtc_callout_handler(void *arg) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { - rtctime = vrtc_curtime(vrtc); - error = vrtc_time_update(vrtc, rtctime); + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } @@ -606,7 +625,7 @@ static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; - sbintime_t oldfreq, newfreq; + sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error; uint8_t oldval, changed; @@ -627,19 +646,13 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { - /* - * Stop updating the RTC if the date/time - * programmed by the guest is not correct. - */ - VM_CTR0(vrtc->vm, "Invalid RTC date/time " - "programming detected"); - if (rtc_flag_broken_time) return (-1); } } else { - curtime = vrtc_curtime(vrtc); + curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); @@ -658,7 +671,7 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } - error = vrtc_time_update(vrtc, rtctime); + error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } @@ -738,7 +751,7 @@ vrtc_set_time(struct vm *vm, time_t secs) vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); - error = vrtc_time_update(vrtc, secs); + error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { @@ -755,11 +768,12 @@ time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; + sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); - t = vrtc_curtime(vrtc); + t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); @@ -777,7 +791,7 @@ vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || - offset >= sizeof(struct rtcdev)) { + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); @@ -796,6 +810,7 @@ int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; + sbintime_t basetime; time_t curtime; uint8_t *ptr; @@ -811,8 +826,8 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) /* * Update RTC date/time fields if necessary. */ - if (offset < 10) { - curtime = vrtc_curtime(vrtc); + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } @@ -852,6 +867,7 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, { struct vrtc *vrtc; struct rtcdev *rtc; + sbintime_t basetime; time_t curtime; int error, offset; @@ -869,16 +885,20 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } error = 0; - curtime = vrtc_curtime(vrtc); - vrtc_time_update(vrtc, curtime); + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); - if (in) { - /* - * Update RTC date/time fields if necessary. - */ - if (offset < 10) - secs_to_rtc(curtime, vrtc, 0); + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. + */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + if (in) { if (offset == 12) { /* * XXX @@ -922,6 +942,18 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, *((uint8_t *)rtc + offset) = *val; break; } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } } VRTC_UNLOCK(vrtc); return (error); @@ -971,7 +1003,7 @@ vrtc_init(struct vm *vm) VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; - vrtc_time_update(vrtc, curtime); + vrtc_time_update(vrtc, curtime, sbinuptime()); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 6bd5bce..bca9b98 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1293,8 +1293,12 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) else if (error != 0) panic("%s: vmm_fetch_instruction error %d", __func__, error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) - return (EFAULT); + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } /* * If the instruction length was not specified then update it now diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 0b50e92..7172365 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -72,6 +72,8 @@ enum { VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, VIE_OP_TYPE_LAST }; @@ -91,6 +93,11 @@ static const struct vie_op two_byte_opcodes[256] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -146,6 +153,16 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, @@ -803,6 +820,68 @@ done: } static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { @@ -1262,6 +1341,48 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, @@ -1302,6 +1423,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); @@ -1314,6 +1439,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; default: error = EINVAL; break; diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c index fc68a61..63044e8 100644 --- a/sys/amd64/vmm/vmm_ioport.c +++ b/sys/amd64/vmm/vmm_ioport.c @@ -28,16 +28,10 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> -#include <sys/types.h> -#include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/systm.h> -#include <vm/vm.h> - #include <machine/vmm.h> #include <machine/vmm_instruction_emul.h> -#include <x86/psl.h> #include "vatpic.h" #include "vatpit.h" diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c index 9ecf9af..4ae5fb9 100644 --- a/sys/amd64/vmm/vmm_stat.c +++ b/sys/amd64/vmm/vmm_stat.c @@ -33,7 +33,6 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/systm.h> #include <sys/malloc.h> -#include <sys/smp.h> #include <machine/vmm.h> #include "vmm_util.h" diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index c37d21c..45e08b5 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/pcpu.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <sys/sysctl.h> #include <machine/clock.h> @@ -289,9 +288,8 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, /* * Machine check handling is done in the host. - * Hide MTRR capability. */ - regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + regs[3] &= ~(CPUID_MCA | CPUID_MCE); /* * Hide the debug store capability. diff --git a/sys/arm/allwinner/std.a10 b/sys/arm/allwinner/std.a10 index 0084c41..da5d895 100644 --- a/sys/arm/allwinner/std.a10 +++ b/sys/arm/allwinner/std.a10 @@ -17,5 +17,7 @@ options KERNPHYSADDR=0x40200000 makeoptions KERNVIRTADDR=0xc0200000 options KERNVIRTADDR=0xc0200000 +options ARM_L2_PIPT + files "../allwinner/files.allwinner" files "../allwinner/files.a10" diff --git a/sys/arm/amlogic/aml8726/aml8726_mmc.c b/sys/arm/amlogic/aml8726/aml8726_mmc.c index 167d3a3..7d1ef4e 100644 --- a/sys/arm/amlogic/aml8726/aml8726_mmc.c +++ b/sys/arm/amlogic/aml8726/aml8726_mmc.c @@ -70,6 +70,7 @@ struct aml8726_mmc_softc { device_t dev; struct resource *res[2]; struct mtx mtx; + struct callout ch; uint32_t port; unsigned int ref_freq; struct aml8726_mmc_gpio pwr_en; @@ -81,7 +82,7 @@ struct aml8726_mmc_softc { struct mmc_host host; int bus_busy; struct mmc_command *cmd; - unsigned int timeout_remaining; + uint32_t stop_timeout; }; static struct resource_spec aml8726_mmc_spec[] = { @@ -92,6 +93,7 @@ static struct resource_spec aml8726_mmc_spec[] = { #define AML_MMC_LOCK(sc) mtx_lock(&(sc)->mtx) #define AML_MMC_UNLOCK(sc) mtx_unlock(&(sc)->mtx) +#define AML_MMC_LOCK_ASSERT(sc) mtx_assert(&(sc)->mtx, MA_OWNED) #define AML_MMC_LOCK_INIT(sc) \ mtx_init(&(sc)->mtx, device_get_nameunit((sc)->dev), \ "mmc", MTX_DEF) @@ -107,6 +109,10 @@ static struct resource_spec aml8726_mmc_spec[] = { #define PWR_OFF_FLAG(pol) ((pol) == 0 ? GPIO_PIN_HIGH : \ GPIO_PIN_LOW) +#define MSECS_TO_TICKS(ms) (((ms)*hz)/1000 + 1) + +static void aml8726_mmc_timeout(void *arg); + static unsigned int aml8726_mmc_clk(phandle_t node) { @@ -126,6 +132,37 @@ aml8726_mmc_clk(phandle_t node) return ((unsigned int)prop); } +static uint32_t +aml8726_mmc_freq(struct aml8726_mmc_softc *sc, uint32_t divisor) +{ + + return (sc->ref_freq / ((divisor + 1) * 2)); +} + +static uint32_t +aml8726_mmc_div(struct aml8726_mmc_softc *sc, uint32_t desired_freq) +{ + uint32_t divisor; + + divisor = sc->ref_freq / (desired_freq * 2); + + if (divisor == 0) + divisor = 1; + + divisor -= 1; + + if (aml8726_mmc_freq(sc, divisor) > desired_freq) + divisor += 1; + + if (divisor > (AML_MMC_CONFIG_CMD_CLK_DIV_MASK >> + AML_MMC_CONFIG_CMD_CLK_DIV_SHIFT)) { + divisor = AML_MMC_CONFIG_CMD_CLK_DIV_MASK >> + AML_MMC_CONFIG_CMD_CLK_DIV_SHIFT; + } + + return (divisor); +} + static void aml8726_mmc_mapmem(void *arg, bus_dma_segment_t *segs, int nseg, int error) { @@ -162,25 +199,18 @@ aml8726_mmc_power_on(struct aml8726_mmc_softc *sc) PWR_ON_FLAG(sc->pwr_en.pol))); } -static int -aml8726_mmc_restart_timer(struct aml8726_mmc_softc *sc) +static void +aml8726_mmc_soft_reset(struct aml8726_mmc_softc *sc, boolean_t enable_irq) { - uint32_t count; - uint32_t isr; - - if (sc->cmd == NULL || sc->timeout_remaining == 0) - return (0); - - count = (sc->timeout_remaining > 0x1fff) ? 0x1fff : - sc->timeout_remaining; - sc->timeout_remaining -= count; + uint32_t icr; - isr = (count << AML_MMC_IRQ_STATUS_TIMER_CNT_SHIFT) | - AML_MMC_IRQ_STATUS_TIMER_EN | AML_MMC_IRQ_STATUS_TIMEOUT_IRQ; + icr = AML_MMC_IRQ_CONFIG_SOFT_RESET; - CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, isr); + if (enable_irq == true) + icr |= AML_MMC_IRQ_CONFIG_CMD_DONE_EN; - return (1); + CSR_WRITE_4(sc, AML_MMC_IRQ_CONFIG_REG, icr); + CSR_BARRIER(sc, AML_MMC_IRQ_CONFIG_REG); } static int @@ -191,7 +221,6 @@ aml8726_mmc_start_command(struct aml8726_mmc_softc *sc, struct mmc_command *cmd) uint32_t block_size; uint32_t bus_width; uint32_t cmdr; - uint32_t cycles_per_msec; uint32_t extr; uint32_t mcfgr; uint32_t nbits_per_pkg; @@ -203,14 +232,9 @@ aml8726_mmc_start_command(struct aml8726_mmc_softc *sc, struct mmc_command *cmd) return (MMC_ERR_INVALID); /* - * Ensure the hardware state machine is in a known state, - * the command done interrupt is enabled, and previous - * IRQ status bits have been cleared. + * Ensure the hardware state machine is in a known state. */ - CSR_WRITE_4(sc, AML_MMC_IRQ_CONFIG_REG, - (AML_MMC_IRQ_CONFIG_SOFT_RESET | AML_MMC_IRQ_CONFIG_CMD_DONE_EN)); - CSR_BARRIER(sc, AML_MMC_IRQ_CONFIG_REG); - CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, AML_MMC_IRQ_STATUS_CLEAR_IRQ); + aml8726_mmc_soft_reset(sc, true); /* * Start and transmission bits are per section 4.7.2 of the: @@ -225,6 +249,13 @@ aml8726_mmc_start_command(struct aml8726_mmc_softc *sc, struct mmc_command *cmd) mcfgr = sc->port; timeout = AML_MMC_CMD_TIMEOUT; + /* + * If this is a linked command, then use the previous timeout. + */ + if (cmd == cmd->mrq->stop && sc->stop_timeout) + timeout = sc->stop_timeout; + sc->stop_timeout = 0; + if ((cmd->flags & MMC_RSP_136) != 0) { cmdr |= AML_MMC_CMD_RESP_CRC7_FROM_8; cmdr |= (133 << AML_MMC_CMD_RESP_BITS_SHIFT); @@ -291,32 +322,24 @@ aml8726_mmc_start_command(struct aml8726_mmc_softc *sc, struct mmc_command *cmd) timeout = AML_MMC_WRITE_TIMEOUT * (data->len / block_size); } + + /* + * Stop terminates a multiblock read / write and thus + * can take as long to execute as an actual read / write. + */ + if (cmd->mrq->stop != NULL) + sc->stop_timeout = timeout; } sc->cmd = cmd; cmd->error = MMC_ERR_NONE; - /* - * Round up while calculating the number of cycles which - * correspond to a millisecond. Use that to determine - * the count from the desired timeout in milliseconds. - * - * The counter has a limited range which is not sufficient - * for directly implementing worst case timeouts at high clock - * rates so a 32 bit counter is implemented in software. - * - * The documentation isn't clear on when the timer starts - * so add 48 cycles for the command and 136 cycles for the - * response (the values are from the previously mentioned - * standard). - */ if (timeout > AML_MMC_MAX_TIMEOUT) timeout = AML_MMC_MAX_TIMEOUT; - cycles_per_msec = (ios->clock + 1000 - 1) / 1000; - sc->timeout_remaining = 48 + 136 + timeout * cycles_per_msec; - aml8726_mmc_restart_timer(sc); + callout_reset(&sc->ch, MSECS_TO_TICKS(timeout), + aml8726_mmc_timeout, sc); CSR_WRITE_4(sc, AML_MMC_CMD_ARGUMENT_REG, cmd->arg); CSR_WRITE_4(sc, AML_MMC_MULT_CONFIG_REG, mcfgr); @@ -330,20 +353,96 @@ aml8726_mmc_start_command(struct aml8726_mmc_softc *sc, struct mmc_command *cmd) } static void -aml8726_mmc_intr(void *arg) +aml8726_mmc_finish_command(struct aml8726_mmc_softc *sc, int mmc_error) { - struct aml8726_mmc_softc *sc = (struct aml8726_mmc_softc *)arg; + int mmc_stop_error; struct mmc_command *cmd; struct mmc_command *stop_cmd; struct mmc_data *data; + + AML_MMC_LOCK_ASSERT(sc); + + /* Clear all interrupts since the request is no longer in flight. */ + CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, AML_MMC_IRQ_STATUS_CLEAR_IRQ); + CSR_BARRIER(sc, AML_MMC_IRQ_STATUS_REG); + + /* In some cases (e.g. finish called via timeout) this is a NOP. */ + callout_stop(&sc->ch); + + cmd = sc->cmd; + sc->cmd = NULL; + + cmd->error = mmc_error; + + data = cmd->data; + + if (data && data->len && + (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) != 0) { + if ((data->flags & MMC_DATA_READ) != 0) + bus_dmamap_sync(sc->dmatag, sc->dmamap, + BUS_DMASYNC_POSTREAD); + else + bus_dmamap_sync(sc->dmatag, sc->dmamap, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sc->dmatag, sc->dmamap); + } + + /* + * If there's a linked stop command, then start the stop command. + * In order to establish a known state attempt the stop command + * even if the original request encountered an error. + */ + + stop_cmd = (cmd->mrq->stop != cmd) ? cmd->mrq->stop : NULL; + + if (stop_cmd != NULL) { + mmc_stop_error = aml8726_mmc_start_command(sc, stop_cmd); + if (mmc_stop_error == MMC_ERR_NONE) { + AML_MMC_UNLOCK(sc); + return; + } + stop_cmd->error = mmc_stop_error; + } + + AML_MMC_UNLOCK(sc); + + /* Execute the callback after dropping the lock. */ + if (cmd->mrq) + cmd->mrq->done(cmd->mrq); +} + +static void +aml8726_mmc_timeout(void *arg) +{ + struct aml8726_mmc_softc *sc = (struct aml8726_mmc_softc *)arg; + + /* + * The command failed to complete in time so forcefully + * terminate it. + */ + aml8726_mmc_soft_reset(sc, false); + + /* + * Ensure the command has terminated before continuing on + * to things such as bus_dmamap_sync / bus_dmamap_unload. + */ + while ((CSR_READ_4(sc, AML_MMC_IRQ_STATUS_REG) & + AML_MMC_IRQ_STATUS_CMD_BUSY) != 0) + cpu_spinwait(); + + aml8726_mmc_finish_command(sc, MMC_ERR_TIMEOUT); +} + +static void +aml8726_mmc_intr(void *arg) +{ + struct aml8726_mmc_softc *sc = (struct aml8726_mmc_softc *)arg; uint32_t cmdr; - uint32_t icr; uint32_t isr; uint32_t mcfgr; uint32_t previous_byte; uint32_t resp; int mmc_error; - int mmc_stop_error; unsigned int i; AML_MMC_LOCK(sc); @@ -367,12 +466,6 @@ aml8726_mmc_intr(void *arg) if ((cmdr & AML_MMC_CMD_CMD_HAS_DATA) != 0 && (isr & AML_MMC_IRQ_STATUS_WR_CRC16_OK) == 0) mmc_error = MMC_ERR_BADCRC; - } else if ((isr & AML_MMC_IRQ_STATUS_TIMEOUT_IRQ) != 0) { - if (aml8726_mmc_restart_timer(sc) != 0) { - AML_MMC_UNLOCK(sc); - return; - } - mmc_error = MMC_ERR_TIMEOUT; } else { spurious: @@ -389,49 +482,12 @@ spurious: return; } - if ((isr & AML_MMC_IRQ_STATUS_CMD_BUSY) != 0 && - /* - * A multiblock operation may keep the hardware - * busy until stop transmission is executed. - */ - (isr & AML_MMC_IRQ_STATUS_CMD_DONE_IRQ) == 0) { - if (mmc_error == MMC_ERR_NONE) - mmc_error = MMC_ERR_FAILED; - - /* - * Issue a soft reset (while leaving the command complete - * interrupt enabled) to terminate the command. - * - * Ensure the command has terminated before continuing on - * to things such as bus_dmamap_sync / bus_dmamap_unload. - */ - - icr = AML_MMC_IRQ_CONFIG_SOFT_RESET | - AML_MMC_IRQ_CONFIG_CMD_DONE_EN; - - CSR_WRITE_4(sc, AML_MMC_IRQ_CONFIG_REG, icr); - - while ((CSR_READ_4(sc, AML_MMC_IRQ_STATUS_REG) & - AML_MMC_IRQ_STATUS_CMD_BUSY) != 0) - cpu_spinwait(); - } - - /* Clear all interrupts since the request is no longer in flight. */ - CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, AML_MMC_IRQ_STATUS_CLEAR_IRQ); - CSR_BARRIER(sc, AML_MMC_IRQ_STATUS_REG); - - cmd = sc->cmd; - sc->cmd = NULL; - - cmd->error = mmc_error; - - if ((cmd->flags & MMC_RSP_PRESENT) != 0 && - mmc_error == MMC_ERR_NONE) { + if ((cmdr & AML_MMC_CMD_RESP_BITS_MASK) != 0) { mcfgr = sc->port; mcfgr |= AML_MMC_MULT_CONFIG_RESP_READOUT_EN; CSR_WRITE_4(sc, AML_MMC_MULT_CONFIG_REG, mcfgr); - if ((cmd->flags & MMC_RSP_136) != 0) { + if ((cmdr & AML_MMC_CMD_RESP_CRC7_FROM_8) != 0) { /* * Controller supplies 135:8 instead of @@ -444,48 +500,39 @@ spurious: for (i = 0; i < 4; i++) { resp = CSR_READ_4(sc, AML_MMC_CMD_ARGUMENT_REG); - cmd->resp[3 - i] = (resp << 8) | previous_byte; + sc->cmd->resp[3 - i] = (resp << 8) | + previous_byte; previous_byte = (resp >> 24) & 0xff; } } else - cmd->resp[0] = CSR_READ_4(sc, AML_MMC_CMD_ARGUMENT_REG); + sc->cmd->resp[0] = CSR_READ_4(sc, + AML_MMC_CMD_ARGUMENT_REG); } - data = cmd->data; - - if (data && data->len && - (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) != 0) { - if ((data->flags & MMC_DATA_READ) != 0) - bus_dmamap_sync(sc->dmatag, sc->dmamap, - BUS_DMASYNC_POSTREAD); - else - bus_dmamap_sync(sc->dmatag, sc->dmamap, - BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->dmatag, sc->dmamap); - } + if ((isr & AML_MMC_IRQ_STATUS_CMD_BUSY) != 0 && + /* + * A multiblock operation may keep the hardware + * busy until stop transmission is executed. + */ + (isr & AML_MMC_IRQ_STATUS_CMD_DONE_IRQ) == 0) { + if (mmc_error == MMC_ERR_NONE) + mmc_error = MMC_ERR_FAILED; - /* - * If there's a linked stop command, then start the stop command. - * In order to establish a known state attempt the stop command - * even if the original request encountered an error. - */ + /* + * Issue a soft reset to terminate the command. + * + * Ensure the command has terminated before continuing on + * to things such as bus_dmamap_sync / bus_dmamap_unload. + */ - stop_cmd = (cmd->mrq->stop != cmd) ? cmd->mrq->stop : NULL; + aml8726_mmc_soft_reset(sc, false); - if (stop_cmd != NULL) { - mmc_stop_error = aml8726_mmc_start_command(sc, stop_cmd); - if (mmc_stop_error == MMC_ERR_NONE) { - AML_MMC_UNLOCK(sc); - return; - } - stop_cmd->error = mmc_stop_error; + while ((CSR_READ_4(sc, AML_MMC_IRQ_STATUS_REG) & + AML_MMC_IRQ_STATUS_CMD_BUSY) != 0) + cpu_spinwait(); } - AML_MMC_UNLOCK(sc); - - /* Execute the callback after dropping the lock. */ - if (cmd->mrq) - cmd->mrq->done(cmd->mrq); + aml8726_mmc_finish_command(sc, mmc_error); } static int @@ -698,8 +745,10 @@ aml8726_mmc_attach(device_t dev) goto fail; } - sc->host.f_min = 200000; - sc->host.f_max = 50000000; + callout_init_mtx(&sc->ch, &sc->mtx, CALLOUT_RETURNUNLOCKED); + + sc->host.f_min = aml8726_mmc_freq(sc, aml8726_mmc_div(sc, 200000)); + sc->host.f_max = aml8726_mmc_freq(sc, aml8726_mmc_div(sc, 50000000)); sc->host.host_ocr = sc->voltages[0] | sc->voltages[1]; sc->host.caps = MMC_CAP_4_BIT_DATA | MMC_CAP_HSPEED; @@ -756,10 +805,12 @@ aml8726_mmc_detach(device_t dev) * disable the interrupts, and clear the interrupts. */ (void)aml8726_mmc_power_off(sc); - CSR_WRITE_4(sc, AML_MMC_IRQ_CONFIG_REG, AML_MMC_IRQ_CONFIG_SOFT_RESET); - CSR_BARRIER(sc, AML_MMC_IRQ_CONFIG_REG); + aml8726_mmc_soft_reset(sc, false); CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, AML_MMC_IRQ_STATUS_CLEAR_IRQ); + /* This should be a NOP since no command was in flight. */ + callout_stop(&sc->ch); + AML_MMC_UNLOCK(sc); bus_generic_detach(dev); @@ -787,8 +838,7 @@ aml8726_mmc_shutdown(device_t dev) * disable the interrupts, and clear the interrupts. */ (void)aml8726_mmc_power_off(sc); - CSR_WRITE_4(sc, AML_MMC_IRQ_CONFIG_REG, AML_MMC_IRQ_CONFIG_SOFT_RESET); - CSR_BARRIER(sc, AML_MMC_IRQ_CONFIG_REG); + aml8726_mmc_soft_reset(sc, false); CSR_WRITE_4(sc, AML_MMC_IRQ_STATUS_REG, AML_MMC_IRQ_STATUS_CLEAR_IRQ); return (0); @@ -799,7 +849,6 @@ aml8726_mmc_update_ios(device_t bus, device_t child) { struct aml8726_mmc_softc *sc = device_get_softc(bus); struct mmc_ios *ios = &sc->host.ios; - unsigned int divisor; int error; int i; uint32_t cfgr; @@ -820,15 +869,8 @@ aml8726_mmc_update_ios(device_t bus, device_t child) return (EINVAL); } - divisor = sc->ref_freq / (ios->clock * 2) - 1; - if (divisor == 0 || divisor == -1) - divisor = 1; - if ((sc->ref_freq / ((divisor + 1) * 2)) > ios->clock) - divisor += 1; - if (divisor > 0x3ff) - divisor = 0x3ff; - - cfgr |= divisor; + cfgr |= aml8726_mmc_div(sc, ios->clock) << + AML_MMC_CONFIG_CMD_CLK_DIV_SHIFT; CSR_WRITE_4(sc, AML_MMC_CONFIG_REG, cfgr); diff --git a/sys/arm/amlogic/aml8726/aml8726_mmc.h b/sys/arm/amlogic/aml8726/aml8726_mmc.h index 0370943..64e3bae 100644 --- a/sys/arm/amlogic/aml8726/aml8726_mmc.h +++ b/sys/arm/amlogic/aml8726/aml8726_mmc.h @@ -47,20 +47,6 @@ #define AML_MMC_WRITE_TIMEOUT 500 #define AML_MMC_MAX_TIMEOUT 5000 -/* - * Internally the timeout is implemented by counting clock cycles. - * - * Since the hardware implements timeouts by counting cycles - * the minimum read / write timeout (assuming the minimum - * conversion factor of 1 cycle per usec) is: - * - * (8 bits * 512 bytes per block + 16 bits CRC) = 4112 usec - */ -#if ((AML_MMC_READ_TIMEOUT * 1000) < 4112 || \ - (AML_MMC_WRITE_TIMEOUT * 1000) < 4112) -#error "Single block timeout is smaller than supported" -#endif - #define AML_MMC_CMD_ARGUMENT_REG 0 #define AML_MMC_CMD_SEND_REG 4 diff --git a/sys/arm/amlogic/aml8726/files.aml8726 b/sys/arm/amlogic/aml8726/files.aml8726 index b32ebcd..ca058a5 100644 --- a/sys/arm/amlogic/aml8726/files.aml8726 +++ b/sys/arm/amlogic/aml8726/files.aml8726 @@ -4,12 +4,13 @@ kern/kern_clocksource.c standard arm/arm/bus_space_base.c standard arm/arm/bus_space_generic.c standard -arm/arm/bus_space_asm_generic.S standard +arm/arm/gic.c standard arm/arm/pl310.c standard arm/amlogic/aml8726/aml8726_l2cache.c standard arm/amlogic/aml8726/aml8726_machdep.c standard +arm/amlogic/aml8726/aml8726_mp.c optional smp arm/amlogic/aml8726/aml8726_identsoc.c standard arm/amlogic/aml8726/aml8726_ccm.c standard arm/amlogic/aml8726/aml8726_clkmsr.c standard diff --git a/sys/arm/amlogic/aml8726/files.smp b/sys/arm/amlogic/aml8726/files.smp deleted file mode 100644 index 9b6c6ab..0000000 --- a/sys/arm/amlogic/aml8726/files.smp +++ /dev/null @@ -1,4 +0,0 @@ -#$FreeBSD$ - -arm/arm/gic.c standard -arm/amlogic/aml8726/aml8726_mp.c standard diff --git a/sys/arm/amlogic/aml8726/std.aml8726 b/sys/arm/amlogic/aml8726/std.aml8726 index 073f02c..61b515f 100644 --- a/sys/arm/amlogic/aml8726/std.aml8726 +++ b/sys/arm/amlogic/aml8726/std.aml8726 @@ -4,6 +4,15 @@ cpu CPU_CORTEXA machine arm armv6 makeoptions CONF_CFLAGS="-march=armv7a" +# Physical memory starts at 0x80000000. We assume the kernel is loaded +# at 0x80100000 by u-boot (which doesn't support ubldr since it's missing +# CONFIG_API). The kernel must be supplied as a binary since u-boot is +# also missing CONFIG_CMD_ELF. +# +# +options KERNVIRTADDR=0xc0100000 # Used in ldscript.arm +makeoptions KERNVIRTADDR=0xc0100000 + device fdt_pinctrl files "../amlogic/aml8726/files.aml8726" diff --git a/sys/arm/amlogic/aml8726/std.odroidc1 b/sys/arm/amlogic/aml8726/std.odroidc1 deleted file mode 100644 index 441c135..0000000 --- a/sys/arm/amlogic/aml8726/std.odroidc1 +++ /dev/null @@ -1,17 +0,0 @@ -# $FreeBSD$ - -include "../amlogic/aml8726/std.aml8726" - -makeoptions FDT_DTS_FILE=odroidc1.dts - -options SMP # Enable multiple cores -files "../amlogic/aml8726/files.smp" - -# Physical memory starts at 0x00000000. We assume the kernel is loaded -# at 0x00100000 by u-boot (which doesn't support ubldr since it's missing -# CONFIG_API). The kernel must be supplied as a binary since u-boot is -# also missing CONFIG_CMD_ELF. -# -# -options KERNVIRTADDR=0xc0100000 # Used in ldscript.arm -makeoptions KERNVIRTADDR=0xc0100000 diff --git a/sys/arm/amlogic/aml8726/std.vsatv102-m6 b/sys/arm/amlogic/aml8726/std.vsatv102-m6 deleted file mode 100644 index e0014a4..0000000 --- a/sys/arm/amlogic/aml8726/std.vsatv102-m6 +++ /dev/null @@ -1,17 +0,0 @@ -# $FreeBSD$ - -include "../amlogic/aml8726/std.aml8726" - -makeoptions FDT_DTS_FILE=vsatv102-m6.dts - -options SMP # Enable multiple cores -files "../amlogic/aml8726/files.smp" - -# Physical memory starts at 0x80000000. We assume the kernel is loaded -# at 0x80100000 by u-boot (which doesn't support ubldr since it's missing -# CONFIG_API). The kernel must be supplied as a binary since u-boot is -# also missing CONFIG_CMD_ELF. -# -# -options KERNVIRTADDR=0xc0100000 # Used in ldscript.arm -makeoptions KERNVIRTADDR=0xc0100000 diff --git a/sys/arm/arm/busdma_machdep-v6.c b/sys/arm/arm/busdma_machdep-v6.c index ed501c5..7236c5a 100644 --- a/sys/arm/arm/busdma_machdep-v6.c +++ b/sys/arm/arm/busdma_machdep-v6.c @@ -1685,8 +1685,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. */ - bpage->vaddr |= vaddr & PAGE_MASK; - bpage->busaddr |= vaddr & PAGE_MASK; + bpage->vaddr |= addr & PAGE_MASK; + bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; bpage->dataaddr = addr; diff --git a/sys/arm/arm/busdma_machdep.c b/sys/arm/arm/busdma_machdep.c index 265292d..acd8f81 100644 --- a/sys/arm/arm/busdma_machdep.c +++ b/sys/arm/arm/busdma_machdep.c @@ -1441,8 +1441,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. */ - bpage->vaddr |= vaddr & PAGE_MASK; - bpage->busaddr |= vaddr & PAGE_MASK; + bpage->vaddr |= addr & PAGE_MASK; + bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; bpage->dataaddr = addr; diff --git a/sys/arm/arm/cpufunc.c b/sys/arm/arm/cpufunc.c index f98e91a..a3c8239 100644 --- a/sys/arm/arm/cpufunc.c +++ b/sys/arm/arm/cpufunc.c @@ -1186,7 +1186,8 @@ arm11x6_setup(void) CPU_CONTROL_32BD_ENABLE | CPU_CONTROL_LABT_ENABLE | CPU_CONTROL_SYST_ENABLE | - CPU_CONTROL_IC_ENABLE; + CPU_CONTROL_IC_ENABLE | + CPU_CONTROL_UNAL_ENABLE; /* * "write as existing" bits diff --git a/sys/arm/arm/locore-v4.S b/sys/arm/arm/locore-v4.S index 8ef53e9..d798e97 100644 --- a/sys/arm/arm/locore-v4.S +++ b/sys/arm/arm/locore-v4.S @@ -116,7 +116,7 @@ ASENTRY_NP(_start) * If we're running with MMU disabled, test against the * physical address instead. */ - mrc p15, 0, r2, c1, c0, 0 + mrc p15, 0, r2, c1, c0, 0 ands r2, r2, #CPU_CONTROL_MMU_ENABLE ldreq r6, =PHYSADDR ldrne r6, =LOADERRAMADDR @@ -125,7 +125,7 @@ ASENTRY_NP(_start) cmp r7, pc bhi from_ram b do_copy - + flash_lower: cmp r6, pc bls from_ram @@ -148,12 +148,12 @@ from_ram: disable_mmu: /* Disable MMU for a while */ - mrc p15, 0, r2, c1, c0, 0 + mrc p15, 0, r2, c1, c0, 0 bic r2, r2, #(CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |\ CPU_CONTROL_WBUF_ENABLE) bic r2, r2, #(CPU_CONTROL_IC_ENABLE) bic r2, r2, #(CPU_CONTROL_BPRD_ENABLE) - mcr p15, 0, r2, c1, c0, 0 + mcr p15, 0, r2, c1, c0, 0 nop nop @@ -169,36 +169,16 @@ Lunmapped: adr r0, Lpagetable bl translate_va_to_pa -#ifndef _ARM_ARCH_6 /* * Some of the older ports (the various XScale, mostly) assume * that the memory before the kernel is mapped, and use it for - * the various stacks, page tables, etc. For those CPUs, map the - * 64 first MB of RAM, as it used to be. + * the various stacks, page tables, etc. For those CPUs, map the + * 64 first MB of RAM, as it used to be. */ /* * Map PA == VA - */ - ldr r5, =PHYSADDR - mov r1, r5 - mov r2, r5 - /* Map 64MiB, preserved over calls to build_pagetables */ - mov r3, #64 - bl build_pagetables - - /* Create the kernel map to jump to */ - mov r1, r5 - ldr r2, =(KERNBASE) - bl build_pagetables - ldr r5, =(KERNPHYSADDR) -#else - /* - * Map PA == VA - */ - /* Find the start kernels load address */ - adr r5, _start - ldr r2, =(L1_S_OFFSET) - bic r5, r2 + */ + ldr r5, =PHYSADDR mov r1, r5 mov r2, r5 /* Map 64MiB, preserved over calls to build_pagetables */ @@ -207,10 +187,10 @@ Lunmapped: /* Create the kernel map to jump to */ mov r1, r5 - ldr r2, =(KERNVIRTADDR) + ldr r2, =(KERNBASE) bl build_pagetables -#endif - + ldr r5, =(KERNPHYSADDR) + #if defined(SOCDEV_PA) && defined(SOCDEV_VA) /* Create the custom map */ ldr r1, =SOCDEV_PA @@ -221,26 +201,16 @@ Lunmapped: mcr p15, 0, r0, c2, c0, 0 /* Set TTB */ mcr p15, 0, r0, c8, c7, 0 /* Flush TLB */ -#if defined(CPU_ARM1136) || defined(CPU_ARM1176) || defined(CPU_CORTEXA) || defined(CPU_MV_PJ4B) || defined(CPU_KRAIT) - mov r0, #0 - mcr p15, 0, r0, c13, c0, 1 /* Set ASID to 0 */ -#endif - /* Set the Domain Access register. Very important! */ - mov r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT) + mov r0, #((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT) mcr p15, 0, r0, c3, c0, 0 - /* + /* * Enable MMU. * On armv6 enable extended page tables, and set alignment checking * to modulo-4 (CPU_CONTROL_UNAL_ENABLE) for the ldrd/strd * instructions emitted by clang. */ mrc p15, 0, r0, c1, c0, 0 -#ifdef _ARM_ARCH_6 - orr r0, r0, #(CPU_CONTROL_V6_EXTPAGE | CPU_CONTROL_UNAL_ENABLE) - orr r0, r0, #(CPU_CONTROL_AFLT_ENABLE) - orr r0, r0, #(CPU_CONTROL_AF_ENABLE) -#endif orr r0, r0, #(CPU_CONTROL_MMU_ENABLE) mcr p15, 0, r0, c1, c0, 0 nop @@ -280,7 +250,7 @@ virt_done: /* init arm will return the new stack pointer. */ mov sp, r0 - bl _C_LABEL(mi_startup) /* call mi_startup()! */ + bl _C_LABEL(mi_startup) /* call mi_startup()! */ adr r0, .Lmainreturned b _C_LABEL(panic) @@ -389,11 +359,11 @@ pagetable: .word _C_LABEL(cpufuncs) ENTRY_NP(cpu_halt) - mrs r2, cpsr + mrs r2, cpsr bic r2, r2, #(PSR_MODE) - orr r2, r2, #(PSR_SVC32_MODE) + orr r2, r2, #(PSR_SVC32_MODE) orr r2, r2, #(PSR_I | PSR_F) - msr cpsr_fsxc, r2 + msr cpsr_fsxc, r2 ldr r4, .Lcpu_reset_address ldr r4, [r4] @@ -419,9 +389,9 @@ ENTRY_NP(cpu_halt) * Hurl ourselves into the ROM */ mov r0, #(CPU_CONTROL_32BP_ENABLE | CPU_CONTROL_32BD_ENABLE) - mcr p15, 0, r0, c1, c0, 0 - mcrne p15, 0, r2, c8, c7, 0 /* nail I+D TLB on ARMv4 and greater */ - mov pc, r4 + mcr p15, 0, r0, c1, c0, 0 + mcrne p15, 0, r2, c8, c7, 0 /* nail I+D TLB on ARMv4 and greater */ + mov pc, r4 /* * _cpu_reset_address contains the address to branch to, to complete @@ -458,7 +428,7 @@ ENTRY(longjmp) END(longjmp) .data - .global _C_LABEL(esym) + .global _C_LABEL(esym) _C_LABEL(esym): .word _C_LABEL(end) ENTRY_NP(abort) @@ -471,7 +441,7 @@ ENTRY_NP(sigcode) /* * Call the sigreturn system call. - * + * * We have to load r7 manually rather than using * "ldr r7, =SYS_sigreturn" to ensure the value of szsigcode is * correct. Using the alternative places esigcode at the address diff --git a/sys/arm/arm/locore-v6.S b/sys/arm/arm/locore-v6.S index 55b4311..7d5ba97 100644 --- a/sys/arm/arm/locore-v6.S +++ b/sys/arm/arm/locore-v6.S @@ -39,7 +39,7 @@ __FBSDID("$FreeBSD$"); -#ifndef ARM_NEW_PMAP +#ifndef ARM_NEW_PMAP #define PTE1_OFFSET L1_S_OFFSET #define PTE1_SHIFT L1_S_SHIFT #define PTE1_SIZE L1_S_SIZE @@ -52,13 +52,13 @@ __FBSDID("$FreeBSD$"); .align 2 /* - * On entry for FreeBSD boot ABI: - * r0 - metadata pointer or 0 (boothowto on AT91's boot2) - * r1 - if (r0 == 0) then metadata pointer - * On entry for Linux boot ABI: + * On entry for FreeBSD boot ABI: + * r0 - metadata pointer or 0 (boothowto on AT91's boot2) + * r1 - if (r0 == 0) then metadata pointer + * On entry for Linux boot ABI: * r0 - 0 * r1 - machine type (passed as arg2 to initarm) - * r2 - Pointer to a tagged list or dtb image (phys addr) (passed as arg1 initarm) + * r2 - Pointer to a tagged list or dtb image (phys addr) (passed as arg1 initarm) * * For both types of boot we gather up the args, put them in a struct arm_boot_params * structure and pass that to initarm. @@ -66,17 +66,17 @@ __FBSDID("$FreeBSD$"); .globl btext btext: ASENTRY_NP(_start) - STOP_UNWINDING /* Can't unwind into the bootloader! */ + STOP_UNWINDING /* Can't unwind into the bootloader! */ - /* Make sure interrupts are disabled. */ + /* Make sure interrupts are disabled. */ cpsid ifa - mov r8, r0 /* 0 or boot mode from boot2 */ - mov r9, r1 /* Save Machine type */ - mov r10, r2 /* Save meta data */ + mov r8, r0 /* 0 or boot mode from boot2 */ + mov r9, r1 /* Save Machine type */ + mov r10, r2 /* Save meta data */ mov r11, r3 /* Future expansion */ - /* + /* * Check whether data cache is enabled. If it is, then we know * current tags are valid (not power-on garbage values) and there * might be dirty lines that need cleaning. Disable cache to prevent @@ -93,7 +93,7 @@ ASENTRY_NP(_start) * valid. Disable all caches and the MMU, and invalidate everything * before setting up new page tables and re-enabling the mmu. */ -1: +1: bic r7, #CPU_CONTROL_DC_ENABLE bic r7, #CPU_CONTROL_MMU_ENABLE bic r7, #CPU_CONTROL_IC_ENABLE @@ -119,13 +119,13 @@ ASENTRY_NP(_start) /* * Map PA == VA */ - /* Find the start kernels load address */ + /* Find the start kernels load address */ adr r5, _start ldr r2, =(PTE1_OFFSET) bic r5, r2 mov r1, r5 mov r2, r5 - /* Map 64MiB, preserved over calls to build_pagetables */ + /* Map 64MiB, preserved over calls to build_pagetables */ mov r3, #64 bl build_pagetables @@ -142,41 +142,41 @@ ASENTRY_NP(_start) #endif bl init_mmu - /* Switch to virtual addresses. */ + /* Switch to virtual addresses. */ ldr pc, =1f 1: - /* Setup stack, clear BSS */ + /* Setup stack, clear BSS */ ldr r1, =.Lstart ldmia r1, {r1, r2, sp} /* Set initial stack and */ add sp, sp, #INIT_ARM_STACK_SIZE - sub r2, r2, r1 /* get zero init data */ + sub r2, r2, r1 /* get zero init data */ mov r3, #0 2: str r3, [r1], #0x0004 /* get zero init data */ - subs r2, r2, #4 + subs r2, r2, #4 bgt 2b - mov r1, #28 /* loader info size is 28 bytes also second arg */ - subs sp, sp, r1 /* allocate arm_boot_params struct on stack */ - mov r0, sp /* loader info pointer is first arg */ - bic sp, sp, #7 /* align stack to 8 bytes */ - str r1, [r0] /* Store length of loader info */ + mov r1, #28 /* loader info size is 28 bytes also second arg */ + subs sp, sp, r1 /* allocate arm_boot_params struct on stack */ + mov r0, sp /* loader info pointer is first arg */ + bic sp, sp, #7 /* align stack to 8 bytes */ + str r1, [r0] /* Store length of loader info */ str r8, [r0, #4] /* Store r0 from boot loader */ str r9, [r0, #8] /* Store r1 from boot loader */ str r10, [r0, #12] /* store r2 from boot loader */ str r11, [r0, #16] /* store r3 from boot loader */ str r5, [r0, #20] /* store the physical address */ - adr r4, Lpagetable /* load the pagetable address */ + adr r4, Lpagetable /* load the pagetable address */ ldr r5, [r4, #4] str r5, [r0, #24] /* store the pagetable address */ mov fp, #0 /* trace back starts here */ bl _C_LABEL(initarm) /* Off we go */ - /* init arm will return the new stack pointer. */ + /* init arm will return the new stack pointer. */ mov sp, r0 - bl _C_LABEL(mi_startup) /* call mi_startup()! */ + bl _C_LABEL(mi_startup) /* call mi_startup()! */ ldr r0, =.Lmainreturned b _C_LABEL(panic) @@ -219,8 +219,8 @@ translate_va_to_pa: mov pc, lr /* - * Init MMU - * r0 - The table base address + * Init MMU + * r0 - the table base address */ ASENTRY_NP(init_mmu) @@ -267,11 +267,11 @@ END(init_mmu) /* - * Init SMP coherent mode, enable caching and switch to final MMU table. - * Called with disabled caches - * r0 - The table base address - * r1 - clear bits for aux register - * r2 - set bits for aux register + * Init SMP coherent mode, enable caching and switch to final MMU table. + * Called with disabled caches + * r0 - The table base address + * r1 - clear bits for aux register + * r2 - set bits for aux register */ ASENTRY_NP(reinit_mmu) push {r4-r11, lr} @@ -331,11 +331,11 @@ END(reinit_mmu) /* * Builds the page table - * r0 - The table base address - * r1 - The physical address (trashed) - * r2 - The virtual address (trashed) - * r3 - The number of 1MiB sections - * r4 - Trashed + * r0 - The table base address + * r1 - The physical address (trashed) + * r2 - The virtual address (trashed) + * r3 - The number of 1MiB sections + * r4 - Trashed * * Addresses must be 1MiB aligned */ @@ -350,15 +350,15 @@ build_pagetables: #endif orr r1, r4 - /* Move the virtual address to the correct bit location */ + /* Move the virtual address to the correct bit location */ lsr r2, #(PTE1_SHIFT - 2) mov r4, r3 1: str r1, [r0, r2] - add r2, r2, #4 - add r1, r1, #(PTE1_SIZE) - adds r4, r4, #-1 + add r2, r2, #4 + add r1, r1, #(PTE1_SIZE) + adds r4, r4, #-1 bhi 1b mov pc, lr @@ -372,7 +372,7 @@ VA_TO_PA_POINTER(Lpagetable, boot_pt1) .word svcstk /* must remain in order together. */ .Lmainreturned: - .asciz "main() returned" + .asciz "main() returned" .align 2 .bss @@ -380,8 +380,8 @@ svcstk: .space INIT_ARM_STACK_SIZE * MAXCPU /* - * Memory for the initial pagetable. We are unable to place this in - * the bss as this will be cleared after the table is loaded. + * Memory for the initial pagetable. We are unable to place this in + * the bss as this will be cleared after the table is loaded. */ .section ".init_pagetable" .align 14 /* 16KiB aligned */ @@ -398,7 +398,7 @@ boot_pt1: #if defined(SMP) ASENTRY_NP(mpentry) - /* Make sure interrupts are disabled. */ + /* Make sure interrupts are disabled. */ cpsid ifa /* Setup core, disable all caches. */ @@ -419,10 +419,10 @@ ASENTRY_NP(mpentry) mcr CP15_ICIALLU ISB - /* Find the delta between VA and PA */ + /* Find the delta between VA and PA */ adr r0, Lpagetable bl translate_va_to_pa - + bl init_mmu adr r1, .Lstart+8 /* Get initstack pointer from */ @@ -433,7 +433,7 @@ ASENTRY_NP(mpentry) mul r2, r1, r0 /* Point sp to initstack */ add sp, sp, r2 /* area for this processor. */ - /* Switch to virtual addresses. */ + /* Switch to virtual addresses. */ ldr pc, =1f 1: mov fp, #0 /* trace back starts here */ @@ -459,14 +459,14 @@ ENTRY_NP(cpu_halt) ldr r4, [r4] teq r4, #0 movne pc, r4 -1: +1: WFI b 1b /* * _cpu_reset_address contains the address to branch to, to complete * the cpu reset after turning the MMU off - * This variable is provided by the hardware specific code + * This variable is provided by the hardware specific code */ .Lcpu_reset_address: .word _C_LABEL(cpu_reset_address) @@ -498,38 +498,37 @@ END(abort) ENTRY_NP(sigcode) mov r0, sp - add r0, r0, #SIGF_UC + add r0, r0, #SIGF_UC /* - * Call the sigreturn system call. + * Call the sigreturn system call. * * We have to load r7 manually rather than using - * "ldr r7, =SYS_sigreturn" to ensure the value of szsigcode is + * "ldr r7, =SYS_sigreturn" to ensure the value of szsigcode is * correct. Using the alternative places esigcode at the address - * of the data rather than the address one past the data. + * of the data rather than the address one past the data. */ - ldr r7, [pc, #12] /* Load SYS_sigreturn */ + ldr r7, [pc, #12] /* Load SYS_sigreturn */ swi SYS_sigreturn - /* Well if that failed we better exit quick ! */ + /* Well if that failed we better exit quick ! */ - ldr r7, [pc, #8] /* Load SYS_exit */ + ldr r7, [pc, #8] /* Load SYS_exit */ swi SYS_exit - /* Branch back to retry SYS_sigreturn */ + /* Branch back to retry SYS_sigreturn */ b . - 16 END(sigcode) - .word SYS_sigreturn .word SYS_exit .align 2 - .global _C_LABEL(esigcode) + .global _C_LABEL(esigcode) _C_LABEL(esigcode): .data - .global szsigcode + .global szsigcode szsigcode: .long esigcode-sigcode diff --git a/sys/arm/arm/pmap-v6-new.c b/sys/arm/arm/pmap-v6-new.c index 53896d4..f404214 100644 --- a/sys/arm/arm/pmap-v6-new.c +++ b/sys/arm/arm/pmap-v6-new.c @@ -6094,13 +6094,13 @@ pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) /* - * Clean L1 data cache range on a single page, which is not mapped yet. + * Clean L1 data cache range by physical address. + * The range must be within a single page. */ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { struct sysmaps *sysmaps; - vm_offset_t va; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); @@ -6111,9 +6111,8 @@ pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) if (*sysmaps->CMAP3) panic("%s: CMAP3 busy", __func__); pte2_store(sysmaps->CMAP3, PTE2_KERN_NG(pa, PTE2_AP_KRW, ma)); - va = (vm_offset_t)sysmaps->CADDR3; - tlb_flush_local(va); - dcache_wb_pou(va, size); + tlb_flush_local((vm_offset_t)sysmaps->CADDR3); + dcache_wb_pou((vm_offset_t)sysmaps->CADDR3 + (pa & PAGE_MASK), size); pte2_clear(sysmaps->CMAP3); sched_unpin(); mtx_unlock(&sysmaps->lock); diff --git a/sys/arm/broadcom/bcm2835/bcm2835_mbox.c b/sys/arm/broadcom/bcm2835/bcm2835_mbox.c index 2ab7ebf..af600ca 100644 --- a/sys/arm/broadcom/bcm2835/bcm2835_mbox.c +++ b/sys/arm/broadcom/bcm2835/bcm2835_mbox.c @@ -306,8 +306,7 @@ bcm2835_mbox_init_dma(device_t dev, size_t len, bus_dma_tag_t *tag, return (NULL); } - err = bus_dmamap_load(*tag, *map, buf, - sizeof(struct msg_set_power_state), bcm2835_mbox_dma_cb, + err = bus_dmamap_load(*tag, *map, buf, len, bcm2835_mbox_dma_cb, phys, 0); if (err != 0) { bus_dmamem_free(*tag, buf, *map); diff --git a/sys/arm/broadcom/bcm2835/bcm2835_sdhci.c b/sys/arm/broadcom/bcm2835/bcm2835_sdhci.c index 19eeb71..1024a48 100644 --- a/sys/arm/broadcom/bcm2835/bcm2835_sdhci.c +++ b/sys/arm/broadcom/bcm2835/bcm2835_sdhci.c @@ -181,17 +181,16 @@ bcm_sdhci_attach(device_t dev) if (err == 0) { /* Convert to MHz */ default_freq /= 1000000; - if (bootverbose) - device_printf(dev, "default frequency: %dMHz\n", - default_freq); + } + if (default_freq == 0) { + node = ofw_bus_get_node(sc->sc_dev); + if ((OF_getencprop(node, "clock-frequency", &cell, + sizeof(cell))) > 0) + default_freq = cell / 1000000; } if (default_freq == 0) default_freq = BCM2835_DEFAULT_SDHCI_FREQ; - node = ofw_bus_get_node(sc->sc_dev); - if ((OF_getprop(node, "clock-frequency", &cell, sizeof(cell))) > 0) - default_freq = fdt32_to_cpu(cell)/1000000; - if (bootverbose) device_printf(dev, "SDHCI frequency: %dMHz\n", default_freq); diff --git a/sys/arm/broadcom/bcm2835/std.bcm2836 b/sys/arm/broadcom/bcm2835/std.bcm2836 index 874083f..bb112be 100644 --- a/sys/arm/broadcom/bcm2835/std.bcm2836 +++ b/sys/arm/broadcom/bcm2835/std.bcm2836 @@ -5,6 +5,8 @@ cpu CPU_CORTEXA makeoptions CONF_CFLAGS="-march=armv7a" options SOC_BCM2836 +options ARM_L2_PIPT + files "../broadcom/bcm2835/files.bcm2836" files "../broadcom/bcm2835/files.bcm283x" diff --git a/sys/arm/conf/AML8726 b/sys/arm/conf/AML8726 new file mode 100644 index 0000000..7272db4 --- /dev/null +++ b/sys/arm/conf/AML8726 @@ -0,0 +1,143 @@ +# +# Kernel configuration for Amlogic aml8726 boards. +# +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD$ + +ident AML8726 +include "../amlogic/aml8726/std.aml8726" + +options HZ=100 +options SCHED_ULE # ULE scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Control Transmission Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options QUOTA # Enable disk quotas for UFS +options NFSCL # Network Filesystem Client +options NFSLOCKD # Network Lock Manager +options NFS_ROOT # NFS usable as /, requires NFSCL +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options TMPFS # Efficient memory filesystem +options GEOM_PART_GPT # GUID Partition Tables +options GEOM_PART_BSD # BSD partition scheme +options GEOM_PART_MBR # MBR partition scheme +options COMPAT_43 # Compatible with BSD 4.3 [KEEP THIS!] +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options FREEBSD_BOOT_LOADER # Process metadata passed from loader(8) +options LINUX_BOOT_ABI +options VFP # Enable floating point hardware support +options SMP # Enable multiple cores + +# Debugging +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols +options ALT_BREAK_TO_DEBUGGER +#options VERBOSE_SYSINIT # Enable verbose sysinit messages +options BOOTVERBOSE=1 +options KDB # Enable kernel debugger support +# For minimum debugger support (stable branch) use: +#options KDB_TRACE # Print a stack trace for a panic +# For full debugger support use this instead: +options DDB # Enable the kernel debugger +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed +#options DIAGNOSTIC + +# NFS root from boopt/dhcp +#options BOOTP +#options BOOTP_NFSROOT +#options BOOTP_COMPAT +#options BOOTP_NFSV3 +#options BOOTP_WIRED_TO=axe0 + +# MMC/SD/SDIO Card slot support +device mmc # mmc/sd bus +device mmcsd # mmc/sd flash cards + +# Boot device is 2nd slice on MMC/SD card +options ROOTDEVNAME=\"ufs:mmcsd0s2\" + +# GPIO +device gpio +device gpioled + +# I2C support +device iicbus +device iicbb +device iic + +# vt is the default console driver, resembling an SCO console +device vt +#device kbdmux + +# Serial (COM) ports +device uart # Generic UART driver + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device pty # BSD-style compatibility pseudo ttys + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + +# USB support +device usb # General USB code (mandatory for USB) +device dwcotg # DWC OTG controller +options USB_HOST_ALIGN=64 # Align usb buffers to cache line size. +options USB_DEBUG +#options USB_REQ_DEBUG +#options USB_VERBOSE + +#device ukbd # USB keyboard +#device ums # USB mouse + +device scbus # SCSI bus (required for ATA/SCSI) +device da # Direct Access (disks) +device umass # Disks/Mass storage - Requires scbus and da + +# Ethernet support +device miibus # MII bus support + +# SoC Ethernet, requires miibus +device dwc + +# USB Ethernet support, requires miibus +device axe # ASIX Electronics USB Ethernet + +# Flattened Device Tree +options FDT # Configure using FDT/DTB data diff --git a/sys/arm/conf/EFIKA_MX b/sys/arm/conf/EFIKA_MX index 6ea9d8a..a4690f6 100644 --- a/sys/arm/conf/EFIKA_MX +++ b/sys/arm/conf/EFIKA_MX @@ -24,6 +24,8 @@ include "../freescale/imx/std.imx51" makeoptions WITHOUT_MODULES="ahc" +options SOC_IMX51 + options SCHED_4BSD # 4BSD scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking diff --git a/sys/arm/conf/IMX53 b/sys/arm/conf/IMX53 index d627c25..9f78f30 100644 --- a/sys/arm/conf/IMX53 +++ b/sys/arm/conf/IMX53 @@ -22,6 +22,8 @@ ident IMX53 include "../freescale/imx/std.imx53" +options SOC_IMX53 + options SCHED_4BSD # 4BSD scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking diff --git a/sys/arm/conf/IMX6 b/sys/arm/conf/IMX6 index ada4eea..f1baf29 100644 --- a/sys/arm/conf/IMX6 +++ b/sys/arm/conf/IMX6 @@ -21,6 +21,8 @@ ident IMX6 include "../freescale/imx/std.imx6" +options SOC_IMX6 + options HZ=500 # Scheduling quantum is 2 milliseconds. options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption diff --git a/sys/arm/conf/ODROIDC1 b/sys/arm/conf/ODROIDC1 index 7d70fa4..7bb7b56 100644 --- a/sys/arm/conf/ODROIDC1 +++ b/sys/arm/conf/ODROIDC1 @@ -1,7 +1,7 @@ # ODROIDC1 -- Custom configuration for the HardKernel ODROID-C1 SBC # -# For more information on this file, please read the handbook section on -# Kernel Configuration Files: +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # @@ -17,117 +17,10 @@ # # $FreeBSD$ -ident ODROIDC1 - -include "../amlogic/aml8726/std.odroidc1" - -options HZ=100 -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_BSD # BSD partition scheme -options GEOM_PART_MBR # MBR partition scheme -options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI -options KTRACE # ktrace(1) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # Posix P1003_1B real-time extensions -options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options LINUX_BOOT_ABI -options VFP # vfp/neon - -# Debugging -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols -options ALT_BREAK_TO_DEBUGGER -#options VERBOSE_SYSINIT # Enable verbose sysinit messages -options BOOTVERBOSE=1 -options KDB -options DDB # Enable the kernel debugger -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed -#options DIAGNOSTIC - -# NFS support -#options NFSCL # New Network Filesystem Client -#options NFSLOCKD # Network Lock Manager -#options NFS_ROOT # NFS usable as /, requires NFSCL - -# NFS root from boopt/dhcp -#options BOOTP -#options BOOTP_NFSROOT -#options BOOTP_COMPAT -#options BOOTP_NFSV3 -#options BOOTP_WIRED_TO=axe0 - -# MMC/SD/SDIO card slot support -device mmc # mmc/sd bus -device mmcsd # mmc/sd flash cards - -# Boot device is 2nd slice on MMC/SD card -options ROOTDEVNAME=\"ufs:mmcsd0s2\" - -# GPIO -device gpio -device gpioled - -# I2C support -device iicbus -device iicbb -device iic +#NO_UNIVERSE -# vt is the default console driver, resembling an SCO console -device vt -#device kbdmux - -# Serial (COM) ports -device uart # Generic UART driver - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device pty # BSD-style compatibility pseudo ttys - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter - -# USB support -device usb # General USB code (mandatory for USB) -device dwcotg # DWC OTG controller -options USB_HOST_ALIGN=64 # Cacheline size is 64 -options USB_DEBUG -#options USB_REQ_DEBUG -#options USB_VERBOSE - -#device ukbd # USB keyboard -#device ums # USB mouse - -device scbus # SCSI bus (required for ATA/SCSI) -device da # Direct Access (disks) -device umass # Disks/Mass storage - Requires scbus and da - -# Ethernet support -device miibus # MII bus support - -# SoC Ethernet, requires miibus -device dwc - -# USB Ethernet, requires miibus -device axe # ASIX Electronics USB Ethernet +include "AML8726" +ident ODROIDC1 -# Flattened Device Tree -options FDT -options FDT_DTB_STATIC +options FDT_DTB_STATIC +makeoptions FDT_DTS_FILE=odroidc1.dts diff --git a/sys/arm/conf/RK3188 b/sys/arm/conf/RK3188 index 2aba68e..64065d3 100644 --- a/sys/arm/conf/RK3188 +++ b/sys/arm/conf/RK3188 @@ -72,8 +72,8 @@ options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options DIAGNOSTIC -# Boot device is 2nd slice on USB -options ROOTDEVNAME=\"ufs:/dev/da0s2\" +# Root mount from MMC/SD card +options ROOTDEVNAME=\"ufs:/dev/mmcsd0\" # MMC/SD/SDIO Card slot support device mmc # mmc/sd bus diff --git a/sys/arm/conf/RPI2 b/sys/arm/conf/RPI2 index 02f3869..18d97d7 100644 --- a/sys/arm/conf/RPI2 +++ b/sys/arm/conf/RPI2 @@ -137,6 +137,6 @@ device bcm2835_spi options FDT # Configure using FDT/DTB data # Note: DTB is normally loaded and modified by RPi boot loader, then # handed to kernel via U-Boot and ubldr. -options FDT_DTB_STATIC -makeoptions FDT_DTS_FILE=rpi2.dts +#options FDT_DTB_STATIC +#makeoptions FDT_DTS_FILE=rpi2.dts makeoptions MODULES_EXTRA=dtb/rpi diff --git a/sys/arm/conf/VSATV102 b/sys/arm/conf/VSATV102 index a3df9fa..b5de5f9 100644 --- a/sys/arm/conf/VSATV102 +++ b/sys/arm/conf/VSATV102 @@ -1,7 +1,7 @@ # VSATV102 -- Custom configuration for the Visson ATV-102 media player # -# For more information on this file, please read the handbook section on -# Kernel Configuration Files: +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # @@ -17,117 +17,10 @@ # # $FreeBSD$ -ident VSATV102 - -include "../amlogic/aml8726/std.vsatv102-m6" - -options HZ=100 -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_BSD # BSD partition scheme -options GEOM_PART_MBR # MBR partition scheme -options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI -options KTRACE # ktrace(1) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # Posix P1003_1B real-time extensions -options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options LINUX_BOOT_ABI -options VFP # vfp/neon - -# Debugging -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols -options ALT_BREAK_TO_DEBUGGER -#options VERBOSE_SYSINIT # Enable verbose sysinit messages -options BOOTVERBOSE=1 -options KDB -options DDB # Enable the kernel debugger -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed -#options DIAGNOSTIC - -# NFS support -#options NFSCL # New Network Filesystem Client -#options NFSLOCKD # Network Lock Manager -#options NFS_ROOT # NFS usable as /, requires NFSCL - -# NFS root from boopt/dhcp -#options BOOTP -#options BOOTP_NFSROOT -#options BOOTP_COMPAT -#options BOOTP_NFSV3 -#options BOOTP_WIRED_TO=axe0 - -# MMC/SD/SDIO card slot support -device mmc # mmc/sd bus -device mmcsd # mmc/sd flash cards - -# Boot device is 2nd slice on MMC/SD card -options ROOTDEVNAME=\"ufs:mmcsd0s2\" - -# GPIO -device gpio -device gpioled - -# I2C support -device iicbus -device iicbb -device iic +#NO_UNIVERSE -# vt is the default console driver, resembling an SCO console -device vt -#device kbdmux - -# Serial (COM) ports -device uart # Generic UART driver - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device pty # BSD-style compatibility pseudo ttys - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter - -# USB support -device usb # General USB code (mandatory for USB) -device dwcotg # DWC OTG controller -options USB_HOST_ALIGN=64 # Cacheline size is 64 -options USB_DEBUG -#options USB_REQ_DEBUG -#options USB_VERBOSE - -#device ukbd # USB keyboard -#device ums # USB mouse - -device scbus # SCSI bus (required for ATA/SCSI) -device da # Direct Access (disks) -device umass # Disks/Mass storage - Requires scbus and da - -# Ethernet support -device miibus # MII bus support - -# SoC Ethernet, requires miibus -device dwc - -# USB Ethernet, requires miibus -device axe # ASIX Electronics USB Ethernet +include "AML8726" +ident VSATV102 -# Flattened Device Tree -options FDT -options FDT_DTB_STATIC +options FDT_DTB_STATIC +makeoptions FDT_DTS_FILE=vsatv102-m6.dts diff --git a/sys/arm/freescale/imx/files.imx53 b/sys/arm/freescale/imx/files.imx5 index 6ca4ffd..58e925b 100644 --- a/sys/arm/freescale/imx/files.imx53 +++ b/sys/arm/freescale/imx/files.imx5 @@ -6,14 +6,15 @@ kern/kern_clocksource.c standard # Init arm/freescale/imx/imx_common.c standard arm/freescale/imx/imx_machdep.c standard -arm/freescale/imx/imx53_machdep.c standard +arm/freescale/imx/imx51_machdep.c optional soc_imx51 +arm/freescale/imx/imx53_machdep.c optional soc_imx53 arm/arm/bus_space_base.c standard # Special serial console for debuging early boot code #arm/freescale/imx/console.c standard # UART driver (includes serial console support) -dev/uart/uart_dev_imx.c optional uart +dev/uart/uart_dev_imx.c optional uart # TrustZone Interrupt Controller arm/freescale/imx/tzic.c standard diff --git a/sys/arm/freescale/imx/files.imx51 b/sys/arm/freescale/imx/files.imx51 deleted file mode 100644 index b779ee2..0000000 --- a/sys/arm/freescale/imx/files.imx51 +++ /dev/null @@ -1,49 +0,0 @@ -# $FreeBSD$ -arm/arm/bus_space_asm_generic.S standard -arm/arm/bus_space_generic.c standard -kern/kern_clocksource.c standard - -# Init -arm/freescale/imx/imx_common.c standard -arm/freescale/imx/imx_machdep.c standard -arm/freescale/imx/imx51_machdep.c standard -arm/arm/bus_space_base.c standard - -# Dummy serial console -#arm/freescale/imx/console.c standard - -# TrustZone Interrupt Controller -arm/freescale/imx/tzic.c standard - -# IOMUX - external pins multiplexor -arm/freescale/imx/imx_iomux.c standard - -# GPIO -arm/freescale/imx/imx_gpio.c optional gpio - -# Generic Periodic Timer -arm/freescale/imx/imx_gpt.c standard - -# Clock Configuration Manager -arm/freescale/imx/imx51_ccm.c standard - -# i.MX5xx PATA controller -dev/ata/chipsets/ata-fsl.c optional imxata - -# UART driver -dev/uart/uart_dev_imx.c optional uart - -# USB OH3 controller (1 OTG, 3 EHCI) -arm/freescale/imx/imx_nop_usbphy.c optional ehci -dev/usb/controller/ehci_imx.c optional ehci - -# Watchdog -arm/freescale/imx/imx_wdog.c optional imxwdt - -# i2c -arm/freescale/imx/imx_i2c.c optional fsliic - -# IPU - Image Processing Unit (frame buffer also) -arm/freescale/imx/imx51_ipuv3.c optional sc -arm/freescale/imx/imx51_ipuv3_fbd.c optional vt -dev/vt/hw/fb/vt_early_fb.c optional vt diff --git a/sys/arm/freescale/imx/std.imx51 b/sys/arm/freescale/imx/std.imx51 index fbc1349..eca33c2 100644 --- a/sys/arm/freescale/imx/std.imx51 +++ b/sys/arm/freescale/imx/std.imx51 @@ -13,5 +13,4 @@ options PHYSADDR=0x90000000 device fdt_pinctrl -files "../freescale/imx/files.imx51" - +files "../freescale/imx/files.imx5" diff --git a/sys/arm/freescale/imx/std.imx53 b/sys/arm/freescale/imx/std.imx53 index cbef21a..1da484c 100644 --- a/sys/arm/freescale/imx/std.imx53 +++ b/sys/arm/freescale/imx/std.imx53 @@ -13,5 +13,4 @@ options PHYSADDR=0x70000000 device fdt_pinctrl -files "../freescale/imx/files.imx53" - +files "../freescale/imx/files.imx5" diff --git a/sys/arm64/arm64/bus_machdep.c b/sys/arm64/arm64/bus_machdep.c index d8e6646..25a675e 100644 --- a/sys/arm64/arm64/bus_machdep.c +++ b/sys/arm64/arm64/bus_machdep.c @@ -169,10 +169,10 @@ struct bus_space memmap_bus = { .bs_r_8_s = NULL, /* read multiple stream */ - .bs_rm_1_s = NULL, - .bs_rm_2_s = NULL, - .bs_rm_4_s = NULL, - .bs_rm_8_s = NULL, + .bs_rm_1_s = generic_bs_rm_1, + .bs_rm_2_s = generic_bs_rm_2, + .bs_rm_4_s = generic_bs_rm_4, + .bs_rm_8_s = generic_bs_rm_8, /* read region stream */ .bs_rr_1_s = NULL, @@ -187,10 +187,10 @@ struct bus_space memmap_bus = { .bs_w_8_s = NULL, /* write multiple stream */ - .bs_wm_1_s = NULL, - .bs_wm_2_s = NULL, - .bs_wm_4_s = NULL, - .bs_wm_8_s = NULL, + .bs_wm_1_s = generic_bs_wm_1, + .bs_wm_2_s = generic_bs_wm_2, + .bs_wm_4_s = generic_bs_wm_4, + .bs_wm_8_s = generic_bs_wm_8, /* write region stream */ .bs_wr_1_s = NULL, diff --git a/sys/arm64/arm64/bus_space_asm.S b/sys/arm64/arm64/bus_space_asm.S index 52ad5dd..20d4128 100644 --- a/sys/arm64/arm64/bus_space_asm.S +++ b/sys/arm64/arm64/bus_space_asm.S @@ -63,7 +63,7 @@ ENTRY(generic_bs_rm_1) /* Read the data. */ 1: ldrb w1, [x0] - strb w1, [x3], #2 + strb w1, [x3], #1 subs x4, x4, #1 b.ne 1b @@ -105,7 +105,7 @@ ENTRY(generic_bs_rm_4) /* Read the data. */ 1: ldr w1, [x0] - str w1, [x3], #2 + str w1, [x3], #4 subs x4, x4, #1 b.ne 1b @@ -126,7 +126,7 @@ ENTRY(generic_bs_rm_8) /* Read the data. */ 1: ldr x1, [x0] - str x1, [x3], #2 + str x1, [x3], #8 subs x4, x4, #1 b.ne 1b diff --git a/sys/arm64/arm64/db_disasm.c b/sys/arm64/arm64/db_disasm.c new file mode 100644 index 0000000..ec943a4 --- /dev/null +++ b/sys/arm64/arm64/db_disasm.c @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Semihalf under + * the sponsorship of the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <ddb/ddb.h> + +vm_offset_t +db_disasm(vm_offset_t loc, boolean_t altfmt) +{ + return 0; +} + +/* End of db_disasm.c */ diff --git a/sys/arm64/arm64/db_interface.c b/sys/arm64/arm64/db_interface.c new file mode 100644 index 0000000..38834af --- /dev/null +++ b/sys/arm64/arm64/db_interface.c @@ -0,0 +1,168 @@ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Semihalf under + * the sponsorship of the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <sys/proc.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#ifdef KDB +#include <sys/kdb.h> +#endif + +#include <ddb/ddb.h> +#include <ddb/db_variables.h> + +#include <machine/cpu.h> +#include <machine/pcb.h> +#include <machine/vmparam.h> + +static int +db_frame(struct db_variable *vp, db_expr_t *valuep, int op) +{ + long *reg; + + if (kdb_frame == NULL) + return (0); + + reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep); + if (op == DB_VAR_GET) + *valuep = *reg; + else + *reg = *valuep; + return (1); +} + +#define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x) +struct db_variable db_regs[] = { + { "spsr", DB_OFFSET(tf_spsr), db_frame }, + { "x0", DB_OFFSET(tf_x[0]), db_frame }, + { "x1", DB_OFFSET(tf_x[1]), db_frame }, + { "x2", DB_OFFSET(tf_x[2]), db_frame }, + { "x3", DB_OFFSET(tf_x[3]), db_frame }, + { "x4", DB_OFFSET(tf_x[4]), db_frame }, + { "x5", DB_OFFSET(tf_x[5]), db_frame }, + { "x6", DB_OFFSET(tf_x[6]), db_frame }, + { "x7", DB_OFFSET(tf_x[7]), db_frame }, + { "x8", DB_OFFSET(tf_x[8]), db_frame }, + { "x9", DB_OFFSET(tf_x[9]), db_frame }, + { "x10", DB_OFFSET(tf_x[10]), db_frame }, + { "x11", DB_OFFSET(tf_x[11]), db_frame }, + { "x12", DB_OFFSET(tf_x[12]), db_frame }, + { "x13", DB_OFFSET(tf_x[13]), db_frame }, + { "x14", DB_OFFSET(tf_x[14]), db_frame }, + { "x15", DB_OFFSET(tf_x[15]), db_frame }, + { "x16", DB_OFFSET(tf_x[16]), db_frame }, + { "x17", DB_OFFSET(tf_x[17]), db_frame }, + { "x18", DB_OFFSET(tf_x[18]), db_frame }, + { "x19", DB_OFFSET(tf_x[19]), db_frame }, + { "x20", DB_OFFSET(tf_x[20]), db_frame }, + { "x21", DB_OFFSET(tf_x[21]), db_frame }, + { "x22", DB_OFFSET(tf_x[22]), db_frame }, + { "x23", DB_OFFSET(tf_x[23]), db_frame }, + { "x24", DB_OFFSET(tf_x[24]), db_frame }, + { "x25", DB_OFFSET(tf_x[25]), db_frame }, + { "x26", DB_OFFSET(tf_x[26]), db_frame }, + { "x27", DB_OFFSET(tf_x[27]), db_frame }, + { "x28", DB_OFFSET(tf_x[28]), db_frame }, + { "x29", DB_OFFSET(tf_x[29]), db_frame }, + { "lr", DB_OFFSET(tf_lr), db_frame }, + { "elr", DB_OFFSET(tf_elr), db_frame }, + { "sp", DB_OFFSET(tf_sp), db_frame }, +}; + +struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]); + +void +db_show_mdpcpu(struct pcpu *pc) +{ +} + +static int +db_validate_address(vm_offset_t addr) +{ + struct proc *p = curproc; + struct pmap *pmap; + + if (!p || !p->p_vmspace || !p->p_vmspace->vm_map.pmap || + addr >= VM_MAXUSER_ADDRESS) + pmap = pmap_kernel(); + else + pmap = p->p_vmspace->vm_map.pmap; + + return (pmap_extract(pmap, addr) == FALSE); +} + +/* + * Read bytes from kernel address space for debugger. + */ +int +db_read_bytes(vm_offset_t addr, size_t size, char *data) +{ + const char *src = (const char *)addr; + + while (size-- > 0) { + if (db_validate_address((vm_offset_t)src)) { + db_printf("address %p is invalid\n", src); + return (-1); + } + *data++ = *src++; + } + return (0); +} + +/* + * Write bytes to kernel address space for debugger. + */ +int +db_write_bytes(vm_offset_t addr, size_t size, char *data) +{ + char *dst; + + dst = (char *)addr; + while (size-- > 0) { + if (db_validate_address((vm_offset_t)dst)) { + db_printf("address %p is invalid\n", dst); + return (-1); + } + *dst++ = *data++; + } + + dsb(); + /* Clean D-cache and invalidate I-cache */ + cpu_dcache_wb_range(addr, (vm_size_t)size); + cpu_icache_sync_range(addr, (vm_size_t)size); + dsb(); + isb(); + + return (0); +} diff --git a/sys/arm64/arm64/db_trace.c b/sys/arm64/arm64/db_trace.c new file mode 100644 index 0000000..1e89bac --- /dev/null +++ b/sys/arm64/arm64/db_trace.c @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Semihalf under + * the sponsorship of the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/kdb.h> +#include <machine/pcb.h> +#include <ddb/ddb.h> +#include <ddb/db_sym.h> + +#include <machine/armreg.h> +#include <machine/debug_monitor.h> + +struct unwind_state { + uint64_t fp; + uint64_t sp; + uint64_t pc; +}; + +void +db_md_list_watchpoints() +{ + + dbg_show_watchpoint(); +} + +int +db_md_clr_watchpoint(db_expr_t addr, db_expr_t size) +{ + + return (dbg_remove_watchpoint(addr, size, DBG_FROM_EL1)); +} + +int +db_md_set_watchpoint(db_expr_t addr, db_expr_t size) +{ + + return (dbg_setup_watchpoint(addr, size, DBG_FROM_EL1, + HW_BREAKPOINT_RW)); +} + +static int +db_unwind_frame(struct unwind_state *frame) +{ + uint64_t fp = frame->fp; + + if (fp == 0) + return -1; + + frame->sp = fp + 0x10; + /* FP to previous frame (X29) */ + frame->fp = *(uint64_t *)(fp); + /* LR (X30) */ + frame->pc = *(uint64_t *)(fp + 8) - 4; + return (0); +} + +static void +db_stack_trace_cmd(struct unwind_state *frame) +{ + c_db_sym_t sym; + const char *name; + db_expr_t value; + db_expr_t offset; + + while (1) { + uint64_t pc = frame->pc; + int ret; + + ret = db_unwind_frame(frame); + if (ret < 0) + break; + + sym = db_search_symbol(pc, DB_STGY_ANY, &offset); + if (sym == C_DB_SYM_NULL) { + value = 0; + name = "(null)"; + } else + db_symbol_values(sym, &name, &value); + + db_printf("%s() at ", name); + db_printsym(frame->pc, DB_STGY_PROC); + db_printf("\n"); + + db_printf("\t pc = 0x%016lx lr = 0x%016lx\n", pc, + frame->pc); + db_printf("\t sp = 0x%016lx fp = 0x%016lx\n", frame->sp, + frame->fp); + /* TODO: Show some more registers */ + db_printf("\n"); + } +} + +int +db_trace_thread(struct thread *thr, int count) +{ + struct unwind_state frame; + struct pcb *ctx; + + if (thr != curthread) { + ctx = kdb_thr_ctx(thr); + + frame.sp = (uint64_t)ctx->pcb_sp; + frame.fp = (uint64_t)ctx->pcb_x[29]; + frame.pc = (uint64_t)ctx->pcb_x[30]; + db_stack_trace_cmd(&frame); + } else + db_trace_self(); + return (0); +} + +void +db_trace_self(void) +{ + struct unwind_state frame; + uint64_t sp; + + __asm __volatile("mov %0, sp" : "=&r" (sp)); + + frame.sp = sp; + frame.fp = (uint64_t)__builtin_frame_address(0); + frame.pc = (uint64_t)db_trace_self; + db_stack_trace_cmd(&frame); +} diff --git a/sys/arm64/arm64/debug_monitor.c b/sys/arm64/arm64/debug_monitor.c new file mode 100644 index 0000000..50d663d --- /dev/null +++ b/sys/arm64/arm64/debug_monitor.c @@ -0,0 +1,487 @@ +/*- + * Copyright (c) 2014 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Semihalf under + * the sponsorship of the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kdb.h> +#include <sys/pcpu.h> +#include <sys/systm.h> + +#include <machine/armreg.h> +#include <machine/cpu.h> +#include <machine/debug_monitor.h> +#include <machine/kdb.h> +#include <machine/param.h> + +#include <ddb/ddb.h> +#include <ddb/db_sym.h> + +enum dbg_t { + DBG_TYPE_BREAKPOINT = 0, + DBG_TYPE_WATCHPOINT = 1, +}; + +static int dbg_watchpoint_num; +static int dbg_breakpoint_num; +static int dbg_ref_count_mde[MAXCPU]; +static int dbg_ref_count_kde[MAXCPU]; + +/* Watchpoints/breakpoints control register bitfields */ +#define DBG_WATCH_CTRL_LEN_1 (0x1 << 5) +#define DBG_WATCH_CTRL_LEN_2 (0x3 << 5) +#define DBG_WATCH_CTRL_LEN_4 (0xf << 5) +#define DBG_WATCH_CTRL_LEN_8 (0xff << 5) +#define DBG_WATCH_CTRL_LEN_MASK(x) ((x) & (0xff << 5)) +#define DBG_WATCH_CTRL_EXEC (0x0 << 3) +#define DBG_WATCH_CTRL_LOAD (0x1 << 3) +#define DBG_WATCH_CTRL_STORE (0x2 << 3) +#define DBG_WATCH_CTRL_ACCESS_MASK(x) ((x) & (0x3 << 3)) + +/* Common for breakpoint and watchpoint */ +#define DBG_WB_CTRL_EL1 (0x1 << 1) +#define DBG_WB_CTRL_EL0 (0x2 << 1) +#define DBG_WB_CTRL_ELX_MASK(x) ((x) & (0x3 << 1)) +#define DBG_WB_CTRL_E (0x1 << 0) + +#define DBG_REG_BASE_BVR 0 +#define DBG_REG_BASE_BCR (DBG_REG_BASE_BVR + 16) +#define DBG_REG_BASE_WVR (DBG_REG_BASE_BCR + 16) +#define DBG_REG_BASE_WCR (DBG_REG_BASE_WVR + 16) + +/* Watchpoint/breakpoint helpers */ +#define DBG_WB_WVR "wvr" +#define DBG_WB_WCR "wcr" +#define DBG_WB_BVR "bvr" +#define DBG_WB_BCR "bcr" + +#define DBG_WB_READ(reg, num, val) do { \ + __asm __volatile("mrs %0, dbg" reg #num "_el1" : "=r" (val)); \ +} while (0) + +#define DBG_WB_WRITE(reg, num, val) do { \ + __asm __volatile("msr dbg" reg #num "_el1, %0" :: "r" (val)); \ +} while (0) + +#define READ_WB_REG_CASE(reg, num, offset, val) \ + case (num + offset): \ + DBG_WB_READ(reg, num, val); \ + break + +#define WRITE_WB_REG_CASE(reg, num, offset, val) \ + case (num + offset): \ + DBG_WB_WRITE(reg, num, val); \ + break + +#define SWITCH_CASES_READ_WB_REG(reg, offset, val) \ + READ_WB_REG_CASE(reg, 0, offset, val); \ + READ_WB_REG_CASE(reg, 1, offset, val); \ + READ_WB_REG_CASE(reg, 2, offset, val); \ + READ_WB_REG_CASE(reg, 3, offset, val); \ + READ_WB_REG_CASE(reg, 4, offset, val); \ + READ_WB_REG_CASE(reg, 5, offset, val); \ + READ_WB_REG_CASE(reg, 6, offset, val); \ + READ_WB_REG_CASE(reg, 7, offset, val); \ + READ_WB_REG_CASE(reg, 8, offset, val); \ + READ_WB_REG_CASE(reg, 9, offset, val); \ + READ_WB_REG_CASE(reg, 10, offset, val); \ + READ_WB_REG_CASE(reg, 11, offset, val); \ + READ_WB_REG_CASE(reg, 12, offset, val); \ + READ_WB_REG_CASE(reg, 13, offset, val); \ + READ_WB_REG_CASE(reg, 14, offset, val); \ + READ_WB_REG_CASE(reg, 15, offset, val) + +#define SWITCH_CASES_WRITE_WB_REG(reg, offset, val) \ + WRITE_WB_REG_CASE(reg, 0, offset, val); \ + WRITE_WB_REG_CASE(reg, 1, offset, val); \ + WRITE_WB_REG_CASE(reg, 2, offset, val); \ + WRITE_WB_REG_CASE(reg, 3, offset, val); \ + WRITE_WB_REG_CASE(reg, 4, offset, val); \ + WRITE_WB_REG_CASE(reg, 5, offset, val); \ + WRITE_WB_REG_CASE(reg, 6, offset, val); \ + WRITE_WB_REG_CASE(reg, 7, offset, val); \ + WRITE_WB_REG_CASE(reg, 8, offset, val); \ + WRITE_WB_REG_CASE(reg, 9, offset, val); \ + WRITE_WB_REG_CASE(reg, 10, offset, val); \ + WRITE_WB_REG_CASE(reg, 11, offset, val); \ + WRITE_WB_REG_CASE(reg, 12, offset, val); \ + WRITE_WB_REG_CASE(reg, 13, offset, val); \ + WRITE_WB_REG_CASE(reg, 14, offset, val); \ + WRITE_WB_REG_CASE(reg, 15, offset, val) + +static uint64_t +dbg_wb_read_reg(int reg, int n) +{ + uint64_t val = 0; + + switch (reg + n) { + SWITCH_CASES_READ_WB_REG(DBG_WB_WVR, DBG_REG_BASE_WVR, val); + SWITCH_CASES_READ_WB_REG(DBG_WB_WCR, DBG_REG_BASE_WCR, val); + SWITCH_CASES_READ_WB_REG(DBG_WB_BVR, DBG_REG_BASE_BVR, val); + SWITCH_CASES_READ_WB_REG(DBG_WB_BCR, DBG_REG_BASE_BCR, val); + default: + db_printf("trying to read from wrong debug register %d\n", n); + } + + return val; +} + +static void +dbg_wb_write_reg(int reg, int n, uint64_t val) +{ + switch (reg + n) { + SWITCH_CASES_WRITE_WB_REG(DBG_WB_WVR, DBG_REG_BASE_WVR, val); + SWITCH_CASES_WRITE_WB_REG(DBG_WB_WCR, DBG_REG_BASE_WCR, val); + SWITCH_CASES_WRITE_WB_REG(DBG_WB_BVR, DBG_REG_BASE_BVR, val); + SWITCH_CASES_WRITE_WB_REG(DBG_WB_BCR, DBG_REG_BASE_BCR, val); + default: + db_printf("trying to write to wrong debug register %d\n", n); + } + isb(); +} + +void +kdb_cpu_set_singlestep(void) +{ + + kdb_frame->tf_spsr |= DBG_SPSR_SS; + WRITE_SPECIALREG(MDSCR_EL1, READ_SPECIALREG(MDSCR_EL1) | + DBG_MDSCR_SS | DBG_MDSCR_KDE); + + /* + * Disable breakpoints and watchpoints, e.g. stepping + * over watched instruction will trigger break exception instead of + * single-step exception and locks CPU on that instruction for ever. + */ + if (dbg_ref_count_mde[PCPU_GET(cpuid)] > 0) { + WRITE_SPECIALREG(MDSCR_EL1, + READ_SPECIALREG(MDSCR_EL1) & ~DBG_MDSCR_MDE); + } +} + +void +kdb_cpu_clear_singlestep(void) +{ + + WRITE_SPECIALREG(MDSCR_EL1, READ_SPECIALREG(MDSCR_EL1) & + ~(DBG_MDSCR_SS | DBG_MDSCR_KDE)); + + /* Restore breakpoints and watchpoints */ + if (dbg_ref_count_mde[PCPU_GET(cpuid)] > 0) { + WRITE_SPECIALREG(MDSCR_EL1, + READ_SPECIALREG(MDSCR_EL1) | DBG_MDSCR_MDE); + } + + if (dbg_ref_count_kde[PCPU_GET(cpuid)] > 0) { + WRITE_SPECIALREG(MDSCR_EL1, + READ_SPECIALREG(MDSCR_EL1) | DBG_MDSCR_KDE); + } +} + +static const char * +dbg_watchtype_str(uint32_t type) +{ + switch (type) { + case DBG_WATCH_CTRL_EXEC: + return ("execute"); + case DBG_WATCH_CTRL_STORE: + return ("write"); + case DBG_WATCH_CTRL_LOAD: + return ("read"); + case DBG_WATCH_CTRL_LOAD | DBG_WATCH_CTRL_STORE: + return ("read/write"); + default: + return ("invalid"); + } +} + +static int +dbg_watchtype_len(uint32_t len) +{ + switch (len) { + case DBG_WATCH_CTRL_LEN_1: + return (1); + case DBG_WATCH_CTRL_LEN_2: + return (2); + case DBG_WATCH_CTRL_LEN_4: + return (4); + case DBG_WATCH_CTRL_LEN_8: + return (8); + default: + return (0); + } +} + +void +dbg_show_watchpoint(void) +{ + uint32_t wcr, len, type; + uint64_t addr; + int i; + + db_printf("\nhardware watchpoints:\n"); + db_printf(" watch status type len address symbol\n"); + db_printf(" ----- -------- ---------- --- ------------------ ------------------\n"); + for (i = 0; i < dbg_watchpoint_num; i++) { + wcr = dbg_wb_read_reg(DBG_REG_BASE_WCR, i); + if ((wcr & DBG_WB_CTRL_E) != 0) { + type = DBG_WATCH_CTRL_ACCESS_MASK(wcr); + len = DBG_WATCH_CTRL_LEN_MASK(wcr); + addr = dbg_wb_read_reg(DBG_REG_BASE_WVR, i); + db_printf(" %-5d %-8s %10s %3d 0x%16lx ", + i, "enabled", dbg_watchtype_str(type), + dbg_watchtype_len(len), addr); + db_printsym((db_addr_t)addr, DB_STGY_ANY); + db_printf("\n"); + } else { + db_printf(" %-5d disabled\n", i); + } + } +} + + +static int +dbg_find_free_slot(enum dbg_t type) +{ + u_int max, reg, i; + + switch(type) { + case DBG_TYPE_BREAKPOINT: + max = dbg_breakpoint_num; + reg = DBG_REG_BASE_BCR; + + break; + case DBG_TYPE_WATCHPOINT: + max = dbg_watchpoint_num; + reg = DBG_REG_BASE_WCR; + break; + default: + db_printf("Unsupported debug type\n"); + return (i); + } + + for (i = 0; i < max; i++) { + if ((dbg_wb_read_reg(reg, i) & DBG_WB_CTRL_E) == 0) + return (i); + } + + return (-1); +} + +static int +dbg_find_slot(enum dbg_t type, db_expr_t addr) +{ + u_int max, reg_addr, reg_ctrl, i; + + switch(type) { + case DBG_TYPE_BREAKPOINT: + max = dbg_breakpoint_num; + reg_addr = DBG_REG_BASE_BVR; + reg_ctrl = DBG_REG_BASE_BCR; + break; + case DBG_TYPE_WATCHPOINT: + max = dbg_watchpoint_num; + reg_addr = DBG_REG_BASE_WVR; + reg_ctrl = DBG_REG_BASE_WCR; + break; + default: + db_printf("Unsupported debug type\n"); + return (i); + } + + for (i = 0; i < max; i++) { + if ((dbg_wb_read_reg(reg_addr, i) == addr) && + ((dbg_wb_read_reg(reg_ctrl, i) & DBG_WB_CTRL_E) != 0)) + return (i); + } + + return (-1); +} + +static void +dbg_enable_monitor(enum dbg_el_t el) +{ + uint64_t reg_mdcr = 0; + + /* + * There is no need to have debug monitor on permanently, thus we are + * refcounting and turn it on only if any of CPU is going to use that. + */ + if (atomic_fetchadd_int(&dbg_ref_count_mde[PCPU_GET(cpuid)], 1) == 0) + reg_mdcr = DBG_MDSCR_MDE; + + if ((el == DBG_FROM_EL1) && + atomic_fetchadd_int(&dbg_ref_count_kde[PCPU_GET(cpuid)], 1) == 0) + reg_mdcr |= DBG_MDSCR_KDE; + + if (reg_mdcr) + WRITE_SPECIALREG(MDSCR_EL1, READ_SPECIALREG(MDSCR_EL1) | reg_mdcr); +} + +static void +dbg_disable_monitor(enum dbg_el_t el) +{ + uint64_t reg_mdcr = 0; + + if (atomic_fetchadd_int(&dbg_ref_count_mde[PCPU_GET(cpuid)], -1) == 1) + reg_mdcr = DBG_MDSCR_MDE; + + if ((el == DBG_FROM_EL1) && + atomic_fetchadd_int(&dbg_ref_count_kde[PCPU_GET(cpuid)], -1) == 1) + reg_mdcr |= DBG_MDSCR_KDE; + + if (reg_mdcr) + WRITE_SPECIALREG(MDSCR_EL1, READ_SPECIALREG(MDSCR_EL1) & ~reg_mdcr); +} + +int +dbg_setup_watchpoint(db_expr_t addr, db_expr_t size, enum dbg_el_t el, + enum dbg_access_t access) +{ + uint64_t wcr_size, wcr_priv, wcr_access; + u_int i; + + i = dbg_find_free_slot(DBG_TYPE_WATCHPOINT); + if (i == -1) { + db_printf("Can not find slot for watchpoint, max %d" + " watchpoints supported\n", dbg_watchpoint_num); + return (i); + } + + switch(size) { + case 1: + wcr_size = DBG_WATCH_CTRL_LEN_1; + break; + case 2: + wcr_size = DBG_WATCH_CTRL_LEN_2; + break; + case 4: + wcr_size = DBG_WATCH_CTRL_LEN_4; + break; + case 8: + wcr_size = DBG_WATCH_CTRL_LEN_8; + break; + default: + db_printf("Unsupported address size for watchpoint\n"); + return (-1); + } + + switch(el) { + case DBG_FROM_EL0: + wcr_priv = DBG_WB_CTRL_EL0; + break; + case DBG_FROM_EL1: + wcr_priv = DBG_WB_CTRL_EL1; + break; + default: + db_printf("Unsupported exception level for watchpoint\n"); + return (-1); + } + + switch(access) { + case HW_BREAKPOINT_X: + wcr_access = DBG_WATCH_CTRL_EXEC; + break; + case HW_BREAKPOINT_R: + wcr_access = DBG_WATCH_CTRL_LOAD; + break; + case HW_BREAKPOINT_W: + wcr_access = DBG_WATCH_CTRL_STORE; + break; + case HW_BREAKPOINT_RW: + wcr_access = DBG_WATCH_CTRL_LOAD | DBG_WATCH_CTRL_STORE; + break; + default: + db_printf("Unsupported exception level for watchpoint\n"); + return (-1); + } + + dbg_wb_write_reg(DBG_REG_BASE_WVR, i, addr); + dbg_wb_write_reg(DBG_REG_BASE_WCR, i, wcr_size | wcr_access | wcr_priv | + DBG_WB_CTRL_E); + dbg_enable_monitor(el); + return (0); +} + +int +dbg_remove_watchpoint(db_expr_t addr, db_expr_t size, enum dbg_el_t el) +{ + u_int i; + + i = dbg_find_slot(DBG_TYPE_WATCHPOINT, addr); + if (i == -1) { + db_printf("Can not find watchpoint for address 0%lx\n", addr); + return (i); + } + + dbg_wb_write_reg(DBG_REG_BASE_WCR, i, 0); + dbg_disable_monitor(el); + return (0); +} + +void +dbg_monitor_init(void) +{ + u_int i; + + /* Clear OS lock */ + WRITE_SPECIALREG(OSLAR_EL1, 0); + + /* Find out many breakpoints and watchpoints we can use */ + dbg_watchpoint_num = ((READ_SPECIALREG(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1; + dbg_breakpoint_num = ((READ_SPECIALREG(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1; + + if (bootverbose && PCPU_GET(cpuid) == 0) { + db_printf("%d watchpoints and %d breakpoints supported\n", + dbg_watchpoint_num, dbg_breakpoint_num); + } + + /* + * We have limited number of {watch,break}points, each consists of + * two registers: + * - wcr/bcr regsiter configurates corresponding {watch,break}point + * behaviour + * - wvr/bvr register keeps address we are hunting for + * + * Reset all breakpoints and watchpoints. + */ + for (i = 0; i < dbg_watchpoint_num; ++i) { + dbg_wb_write_reg(DBG_REG_BASE_WCR, i, 0); + dbg_wb_write_reg(DBG_REG_BASE_WVR, i, 0); + } + + for (i = 0; i < dbg_breakpoint_num; ++i) { + dbg_wb_write_reg(DBG_REG_BASE_BCR, i, 0); + dbg_wb_write_reg(DBG_REG_BASE_BVR, i, 0); + } + + dbg_enable(); +} diff --git a/sys/arm64/arm64/nexus.c b/sys/arm64/arm64/nexus.c index 15b18b5..bc38330 100644 --- a/sys/arm64/arm64/nexus.c +++ b/sys/arm64/arm64/nexus.c @@ -208,12 +208,12 @@ nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, break; default: - return (0); + return (NULL); } rv = rman_reserve_resource(rm, start, end, count, flags, child); if (rv == 0) - return (0); + return (NULL); rman_set_rid(rv, *rid); rman_set_bushandle(rv, rman_get_start(rv)); @@ -221,7 +221,7 @@ nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, if (needactivate) { if (bus_activate_resource(child, type, *rid, rv)) { rman_release_resource(rv); - return (0); + return (NULL); } } diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 174b023..c192fa6 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -2936,6 +2936,18 @@ pmap_clear_modify(vm_page_t m) /* TODO: We lack support for tracking if a page is modified */ } +void * +pmap_mapbios(vm_paddr_t pa, vm_size_t size) +{ + + return ((void *)PHYS_TO_DMAP(pa)); +} + +void +pmap_unmapbios(vm_paddr_t pa, vm_size_t size) +{ +} + /* * Sets the memory attribute for the specified page. */ diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h index e5401ef..63ae34c 100644 --- a/sys/arm64/include/pmap.h +++ b/sys/arm64/include/pmap.h @@ -142,7 +142,9 @@ void pmap_kremove(vm_offset_t); void pmap_kremove_device(vm_offset_t, vm_size_t); void *pmap_mapdev(vm_offset_t, vm_size_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_unmapbios(vm_offset_t, vm_size_t); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); diff --git a/sys/boot/common/md.c b/sys/boot/common/md.c index 6d2d2b4..a8f092b9 100644 --- a/sys/boot/common/md.c +++ b/sys/boot/common/md.c @@ -27,11 +27,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <stand.h> #include <sys/param.h> #include <sys/endian.h> #include <sys/queue.h> #include <machine/stdarg.h> -#include <stand.h> #include "bootstrap.h" diff --git a/sys/boot/fdt/dts/arm/bcm2836.dtsi b/sys/boot/fdt/dts/arm/bcm2836.dtsi index 7fdaecf..bd75c84 100644 --- a/sys/boot/fdt/dts/arm/bcm2836.dtsi +++ b/sys/boot/fdt/dts/arm/bcm2836.dtsi @@ -101,6 +101,12 @@ */ }; + watchdog0 { + compatible = "broadcom,bcm2835-wdt", + "broadcom,bcm2708-wdt"; + reg = <0x10001c 0x0c>; /* 0x1c, 0x20, 0x24 */ + }; + gpio: gpio { compatible = "broadcom,bcm2835-gpio", "broadcom,bcm2708-gpio"; @@ -125,11 +131,11 @@ pinctrl-names = "default"; pinctrl-0 = <&pins_reserved>; - /* Pins that can short 3.3V to GND in output mode: 46-47 + /* Pins that can short 3.3V to GND in output mode: 46 * Pins used by VideoCore: 48-53 */ - broadcom,read-only = <46>, <47>, <48>, <49>, <50>, - <51>, <52>, <53>; + broadcom,read-only = <46>, <48>, <49>, <50>, + <51>, <52>, <53>; /* BSC0 */ pins_bsc0_a: bsc0_a { @@ -431,7 +437,7 @@ interrupts = <70>; interrupt-parent = <&intc>; - clock-frequency = <2500000000>; /* Set by VideoCore */ + clock-frequency = <250000000>; /* Set by VideoCore */ }; uart0: uart0 { diff --git a/sys/boot/fdt/dts/arm/rpi2.dts b/sys/boot/fdt/dts/arm/rpi2.dts index 59974f1..1c8c559 100644 --- a/sys/boot/fdt/dts/arm/rpi2.dts +++ b/sys/boot/fdt/dts/arm/rpi2.dts @@ -322,18 +322,14 @@ leds { compatible = "gpio-leds"; - ok { - label = "ok"; - gpios = <&gpio 16 1>; - - /* Don't change this - it configures - * how the led driver determines if - * the led is on or off when it loads. - */ - default-state = "keep"; - - /* This is the real default state. */ - linux,default-trigger = "default-on"; + pwr { + label = "pwr"; + gpios = <&gpio 35 0>; + }; + + act { + label = "act"; + gpios = <&gpio 47 0>; }; }; diff --git a/sys/boot/forth/loader.conf b/sys/boot/forth/loader.conf index 275a58b..240e403 100644 --- a/sys/boot/forth/loader.conf +++ b/sys/boot/forth/loader.conf @@ -48,6 +48,16 @@ entropy_cache_type="/boot/entropy" #kern.random.sys.seeded="0" # Set this to 1 to start /dev/random # without waiting for a (re)seed. +############################################################## +### RAM Blacklist configuration ############################# +############################################################## + +ram_blacklist_load="NO" # Set this to YES to load a file + # containing a list of addresses to + # exclude from the running system. +ram_blacklist_name="/boot/blacklist.txt" # Set this to the name of the file +ram_blacklist_type="ram_blacklist" # Required for the kernel to find + # the blacklist module ############################################################## ### Loader settings ######################################## diff --git a/sys/boot/i386/common/edd.h b/sys/boot/i386/common/edd.h index 4a204f7..7d1f450 100644 --- a/sys/boot/i386/common/edd.h +++ b/sys/boot/i386/common/edd.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 Advanced Computing Technologies LLC + * Copyright (c) 2011 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/boot/libstand32/Makefile b/sys/boot/libstand32/Makefile index a08a8e9..c41dc21 100644 --- a/sys/boot/libstand32/Makefile +++ b/sys/boot/libstand32/Makefile @@ -11,9 +11,10 @@ MAN= .include <src.opts.mk> MK_SSP= no -S= ${.CURDIR}/../../../lib/libstand +LIBSTAND_SRC= ${.CURDIR}/../../../lib/libstand +LIBC_SRC= ${LIBSTAND_SRC}/../libc -.PATH: ${S} +.PATH: ${LIBSTAND_SRC} LIB= stand INTERNALLIB= MK_PROFILE= no @@ -22,7 +23,7 @@ NO_PIC= WARNS?= 0 CFLAGS+= -ffreestanding -Wformat -CFLAGS+= -I${S} +CFLAGS+= -I${LIBSTAND_SRC} .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" CFLAGS.gcc+= -mpreferred-stack-boundary=2 @@ -52,56 +53,54 @@ SRCS+= gzguts.h zutil.h __main.c assert.c bcd.c bswap.c environment.c getopt.c g # private (pruned) versions of libc string functions SRCS+= strcasecmp.c -LIBC= ${S}/../libc - -.PATH: ${LIBC}/net +.PATH: ${LIBC_SRC}/net SRCS+= ntoh.c # string functions from libc -.PATH: ${LIBC}/string +.PATH: ${LIBC_SRC}/string SRCS+= bcmp.c bcopy.c bzero.c ffs.c memccpy.c memchr.c memcmp.c memcpy.c \ memmove.c memset.c qdivrem.c strcat.c strchr.c strcmp.c strcpy.c \ strcspn.c strlen.c strncat.c strncmp.c strncpy.c strpbrk.c \ strrchr.c strsep.c strspn.c strstr.c strtok.c swab.c .if ${MACHINE_CPUARCH} == "arm" -.PATH: ${LIBC}/arm/gen +.PATH: ${LIBC_SRC}/arm/gen # Compiler support functions -.PATH: ${.CURDIR}/../../../contrib/compiler-rt/lib/builtins/ +.PATH: ${LIBSTAND_SRC}/../../contrib/compiler-rt/lib/builtins/ # __clzsi2 and ctzsi2 for various builtin functions SRCS+= clzsi2.c ctzsi2.c # Divide and modulus functions called by the compiler SRCS+= divmoddi4.c divmodsi4.c divdi3.c divsi3.c moddi3.c modsi3.c SRCS+= udivmoddi4.c udivmodsi4.c udivdi3.c udivsi3.c umoddi3.c umodsi3.c -.PATH: ${.CURDIR}/../../../contrib/compiler-rt/lib/builtins/arm/ +.PATH: ${LIBSTAND_SRC}/../../contrib/compiler-rt/lib/builtins/arm/ SRCS+= aeabi_idivmod.S aeabi_ldivmod.S aeabi_uidivmod.S aeabi_uldivmod.S SRCS+= aeabi_memcmp.S aeabi_memcpy.S aeabi_memmove.S aeabi_memset.S .endif .if ${MACHINE_CPUARCH} == "powerpc" -.PATH: ${LIBC}/quad +.PATH: ${LIBC_SRC}/quad SRCS+= ashldi3.c ashrdi3.c SRCS+= syncicache.c .endif # uuid functions from libc -.PATH: ${LIBC}/uuid +.PATH: ${LIBC_SRC}/uuid SRCS+= uuid_equal.c uuid_is_nil.c # _setjmp/_longjmp .if ${MACHINE_CPUARCH} == "amd64" -.PATH: ${S}/i386 +.PATH: ${LIBSTAND_SRC}/i386 .else -.PATH: ${S}/${MACHINE_CPUARCH} +.PATH: ${LIBSTAND_SRC}/${MACHINE_CPUARCH} .endif SRCS+= _setjmp.S # decompression functionality from libbz2 # NOTE: to actually test this functionality after libbz2 upgrade compile # loader(8) with LOADER_BZIP2_SUPPORT defined -.PATH: ${.CURDIR}/../../../contrib/bzip2 +.PATH: ${LIBSTAND_SRC}/../../contrib/bzip2 CFLAGS+= -DBZ_NO_STDIO -DBZ_NO_COMPRESS SRCS+= libstand_bzlib_private.h @@ -110,7 +109,8 @@ SRCS+= _${file} CLEANFILES+= _${file} _${file}: ${file} - sed "s|bzlib_private\.h|libstand_bzlib_private.h|" ${.ALLSRC} > ${.TARGET} + sed "s|bzlib_private\.h|libstand_bzlib_private.h|" \ + ${.ALLSRC} > ${.TARGET} .endfor CLEANFILES+= libstand_bzlib_private.h @@ -119,8 +119,8 @@ libstand_bzlib_private.h: bzlib_private.h ${.ALLSRC} > ${.TARGET} # decompression functionality from libz -.PATH: ${S}/../libz -CFLAGS+=-DHAVE_MEMCPY -I${S}/../libz +.PATH: ${LIBSTAND_SRC}/../libz +CFLAGS+=-DHAVE_MEMCPY -I${LIBSTAND_SRC}/../libz SRCS+= adler32.c crc32.c libstand_zutil.h libstand_gzguts.h .for file in infback.c inffast.c inflate.c inftrees.c zutil.c diff --git a/sys/boot/userboot/libstand/Makefile b/sys/boot/userboot/libstand/Makefile index 35903fc..88eb98f 100644 --- a/sys/boot/userboot/libstand/Makefile +++ b/sys/boot/userboot/libstand/Makefile @@ -11,9 +11,10 @@ MAN= .include <bsd.own.mk> MK_SSP= no -S= ${.CURDIR}/../../../../lib/libstand +LIBSTAND_SRC= ${.CURDIR}/../../../../lib/libstand +LIBC_SRC= ${LIBSTAND_SRC}/../libc -.PATH: ${S} +.PATH: ${LIBSTAND_SRC} LIB= stand INTERNALLIB= MK_PROFILE= no @@ -22,7 +23,7 @@ NO_PIC= WARNS?= 0 CFLAGS+= -ffreestanding -Wformat -fPIC -CFLAGS+= -I${.CURDIR}/../../../../lib/libstand +CFLAGS+= -I${LIBSTAND_SRC} .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" CFLAGS+= -mno-mmx -mno-3dnow -mno-sse -mno-sse2 @@ -49,14 +50,12 @@ SRCS+= gzguts.h zutil.h __main.c assert.c bcd.c bswap.c environment.c getopt.c g # private (pruned) versions of libc string functions SRCS+= strcasecmp.c -LIBC= ${.CURDIR}/../../../../lib/libc - -.PATH: ${LIBC}/net +.PATH: ${LIBC_SRC}/net SRCS+= ntoh.c # string functions from libc -.PATH: ${LIBC}/string +.PATH: ${LIBC_SRC}/string .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "powerpc" || \ ${MACHINE_CPUARCH} == "sparc64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "arm" @@ -66,34 +65,34 @@ SRCS+= bcmp.c bcopy.c bzero.c ffs.c memccpy.c memchr.c memcmp.c memcpy.c \ strrchr.c strsep.c strspn.c strstr.c strtok.c swab.c .endif .if ${MACHINE_CPUARCH} == "arm" -.PATH: ${LIBC}/arm/gen +.PATH: ${LIBC_SRC}/arm/gen SRCS+= divsi3.S .endif .if ${MACHINE_CPUARCH} == "powerpc" -.PATH: ${LIBC}/libc/quad +.PATH: ${LIBC_SRC}/quad SRCS+= ashldi3.c ashrdi3.c -.PATH: ${LIBC}/powerpc/gen +.PATH: ${LIBC_SRC}/powerpc/gen SRCS+= syncicache.c .endif # uuid functions from libc -.PATH: ${LIBC}/uuid +.PATH: ${LIBC_SRC}/uuid SRCS+= uuid_equal.c uuid_is_nil.c # _setjmp/_longjmp .if ${MACHINE_CPUARCH} == "amd64" -.PATH: ${S}/amd64 +.PATH: ${LIBSTAND_SRC}/amd64 .elif ${MACHINE_ARCH} == "powerpc64" -.PATH: ${S}/powerpc +.PATH: ${LIBSTAND_SRC}/powerpc .else -.PATH: ${S}/${MACHINE_CPUARCH} +.PATH: ${LIBSTAND_SRC}/${MACHINE_CPUARCH} .endif SRCS+= _setjmp.S # decompression functionality from libbz2 # NOTE: to actually test this functionality after libbz2 upgrade compile # loader(8) with LOADER_BZIP2_SUPPORT defined -.PATH: ${.CURDIR}/../../../../contrib/bzip2 +.PATH: ${LIBSTAND_SRC}/../../contrib/bzip2 CFLAGS+= -DBZ_NO_STDIO -DBZ_NO_COMPRESS SRCS+= libstand_bzlib_private.h @@ -102,7 +101,8 @@ SRCS+= _${file} CLEANFILES+= _${file} _${file}: ${file} - sed "s|bzlib_private\.h|libstand_bzlib_private.h|" ${.ALLSRC} > ${.TARGET} + sed "s|bzlib_private\.h|libstand_bzlib_private.h|" \ + ${.ALLSRC} > ${.TARGET} .endfor CLEANFILES+= libstand_bzlib_private.h @@ -111,8 +111,8 @@ libstand_bzlib_private.h: bzlib_private.h ${.ALLSRC} > ${.TARGET} # decompression functionality from libz -.PATH: ${.CURDIR}/../../../../lib/libz -CFLAGS+=-DHAVE_MEMCPY -I${.CURDIR}/../../../../lib/libz +.PATH: ${LIBSTAND_SRC}/../libz +CFLAGS+=-DHAVE_MEMCPY -I${LIBSTAND_SRC}/../libz SRCS+= adler32.c crc32.c libstand_zutil.h libstand_gzguts.h .for file in infback.c inffast.c inflate.c inftrees.c zutil.c @@ -121,7 +121,8 @@ CLEANFILES+= _${file} _${file}: ${file} sed -e "s|zutil\.h|libstand_zutil.h|" \ - -e "s|gzguts\.h|libstand_gzguts.h|" ${.ALLSRC} > ${.TARGET} + -e "s|gzguts\.h|libstand_gzguts.h|" \ + ${.ALLSRC} > ${.TARGET} .endfor # depend on stand.h being able to be included multiple times diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c index 53ed5f2..cd6af4e 100644 --- a/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c +++ b/sys/cddl/contrib/opensolaris/common/nvpair/nvpair.c @@ -1227,6 +1227,7 @@ nvpair_type_is_array(nvpair_t *nvp) data_type_t type = NVP_TYPE(nvp); if ((type == DATA_TYPE_BYTE_ARRAY) || + (type == DATA_TYPE_INT8_ARRAY) || (type == DATA_TYPE_UINT8_ARRAY) || (type == DATA_TYPE_INT16_ARRAY) || (type == DATA_TYPE_UINT16_ARRAY) || diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index dc7c283..b31e8bb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -611,7 +611,11 @@ dtrace_panic(const char *format, ...) va_list alist; va_start(alist, format); +#ifdef __FreeBSD__ + vpanic(format, alist); +#else dtrace_vpanic(format, alist); +#endif va_end(alist); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 36ac27a..fb9c8a1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -879,11 +880,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } - /* You can't clone across pools. */ - if (pdd->dd_pool != dp) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EXDEV)); - } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); if (error != 0) { @@ -896,12 +893,6 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); - /* You can't clone across pools. */ - if (origin->ds_dir->dd_pool != dp) { - dsl_dataset_rele(origin, FTAG); - return (SET_ERROR(EXDEV)); - } - /* You can only clone snapshots, not the head datasets. */ if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 7b1d6be..faa6d0c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -306,21 +306,22 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + dnode_phys_t *cdnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - dnp = buf->b_data; + cdnp = buf->b_data; for (i = 0; i < epb; i++) { - prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, + prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { - err = traverse_dnode(td, &dnp[i], zb->zb_objset, + err = traverse_dnode(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; @@ -328,7 +329,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; - dnode_phys_t *dnp; + dnode_phys_t *mdnp, *gdnp, *udnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); @@ -336,26 +337,27 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, goto post; osp = buf->b_data; - dnp = &osp->os_meta_dnode; - prefetch_dnode_metadata(td, dnp, zb->zb_objset, + mdnp = &osp->os_meta_dnode; + gdnp = &osp->os_groupused_dnode; + udnp = &osp->os_userused_dnode; + + prefetch_dnode_metadata(td, mdnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, &osp->os_groupused_dnode, - zb->zb_objset, DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); + prefetch_dnode_metadata(td, gdnp, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, udnp, zb->zb_objset, + DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, dnp, zb->zb_objset, + err = traverse_dnode(td, mdnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - dnp = &osp->os_groupused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, + err = traverse_dnode(td, gdnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - dnp = &osp->os_userused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, + err = traverse_dnode(td, udnp, zb->zb_objset, DMU_USERUSED_OBJECT); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index ce22f4e..58db4dd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -413,7 +413,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, offsetof(dmu_sendarg_t, dsa_link)); if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS); + int zaperr = zap_contains(mos, dsobj, + DS_FIELD_LARGE_BLOCKS); if (zaperr != ENOENT) { VERIFY0(zaperr); ds->ds_large_blocks = B_TRUE; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 1862f8c..26f5c2d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -24,6 +24,7 @@ * All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ #include <sys/dmu.h> @@ -408,7 +409,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(EXDEV)); ASSERT(dsl_pool_config_held(dp)); @@ -1706,7 +1707,7 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); - return (SET_ERROR(ENXIO)); + return (SET_ERROR(EXDEV)); } /* new name should not already exist */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c index 6a90b9c..fe02399 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c @@ -137,7 +137,7 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp) *minorp = (minor_t)(uintptr_t)data; curthread->td_fpop = tmpfp; if (error != 0) - return (error); + return (SET_ERROR(EBADF)); return (zfs_onexit_minor_to_state(*minorp, &zo)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index d925712..e3b314f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1320,15 +1320,25 @@ zfs_rezget(znode_t *zp) } /* - * XXXPJD: Not sure how is that possible, but under heavy - * zfs recv -F load it happens that z_gen is the same, but - * vnode type is different than znode type. This would mean - * that for example regular file was replaced with directory - * which has the same object number. + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. At present the only cached property is the + * files type recorded in v_type. + * So, handle this case by leaving the old vnode and znode + * disassociated from the actual object. A new vnode and a + * znode will be created if the object is accessed + * (e.g. via a look-up). The old vnode and znode will be + * recycled when the last vnode reference is dropped. */ vp = ZTOV(zp); - if (vp != NULL && - vp->v_type != IFTOVT((mode_t)zp->z_mode)) { + if (vp != NULL && vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EIO); diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_asm.S b/sys/cddl/dev/dtrace/amd64/dtrace_asm.S index cf8314c..a6c079e 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/amd64/dtrace_asm.S @@ -363,211 +363,3 @@ dtrace_interrupt_enable(dtrace_icookie_t cookie) popfq ret END(dtrace_interrupt_enable) - -/* - * The panic() and cmn_err() functions invoke vpanic() as a common entry point - * into the panic code implemented in panicsys(). vpanic() is responsible - * for passing through the format string and arguments, and constructing a - * regs structure on the stack into which it saves the current register - * values. If we are not dying due to a fatal trap, these registers will - * then be preserved in panicbuf as the current processor state. Before - * invoking panicsys(), vpanic() activates the first panic trigger (see - * common/os/panic.c) and switches to the panic_stack if successful. Note that - * DTrace takes a slightly different panic path if it must panic from probe - * context. Instead of calling panic, it calls into dtrace_vpanic(), which - * sets up the initial stack as vpanic does, calls dtrace_panic_trigger(), and - * branches back into vpanic(). - */ - -/* -void -vpanic(const char *format, va_list alist) -*/ - ENTRY(vpanic) /* Initial stack layout: */ - - pushq %rbp /* | %rip | 0x60 */ - movq %rsp, %rbp /* | %rbp | 0x58 */ - pushfq /* | rfl | 0x50 */ - pushq %r11 /* | %r11 | 0x48 */ - pushq %r10 /* | %r10 | 0x40 */ - pushq %rbx /* | %rbx | 0x38 */ - pushq %rax /* | %rax | 0x30 */ - pushq %r9 /* | %r9 | 0x28 */ - pushq %r8 /* | %r8 | 0x20 */ - pushq %rcx /* | %rcx | 0x18 */ - pushq %rdx /* | %rdx | 0x10 */ - pushq %rsi /* | %rsi | 0x8 alist */ - pushq %rdi /* | %rdi | 0x0 format */ - - movq %rsp, %rbx /* %rbx = current %rsp */ - - leaq panic_quiesce(%rip), %rdi /* %rdi = &panic_quiesce */ - call panic_trigger /* %eax = panic_trigger() */ - -vpanic_common: - /* - * The panic_trigger result is in %eax from the call above, and - * dtrace_panic places it in %eax before branching here. - * The rdmsr instructions that follow below will clobber %eax so - * we stash the panic_trigger result in %r11d. - */ - movl %eax, %r11d - cmpl $0, %r11d - je 0f - - /* - * If panic_trigger() was successful, we are the first to initiate a - * panic: we now switch to the reserved panic_stack before continuing. - */ - leaq panic_stack(%rip), %rsp - addq $PANICSTKSIZE, %rsp -0: subq $REGSIZE, %rsp - /* - * Now that we've got everything set up, store the register values as - * they were when we entered vpanic() to the designated location in - * the regs structure we allocated on the stack. - */ -#ifdef notyet - movq 0x0(%rbx), %rcx - movq %rcx, REGOFF_RDI(%rsp) - movq 0x8(%rbx), %rcx - movq %rcx, REGOFF_RSI(%rsp) - movq 0x10(%rbx), %rcx - movq %rcx, REGOFF_RDX(%rsp) - movq 0x18(%rbx), %rcx - movq %rcx, REGOFF_RCX(%rsp) - movq 0x20(%rbx), %rcx - - movq %rcx, REGOFF_R8(%rsp) - movq 0x28(%rbx), %rcx - movq %rcx, REGOFF_R9(%rsp) - movq 0x30(%rbx), %rcx - movq %rcx, REGOFF_RAX(%rsp) - movq 0x38(%rbx), %rcx - movq %rcx, REGOFF_RBX(%rsp) - movq 0x58(%rbx), %rcx - - movq %rcx, REGOFF_RBP(%rsp) - movq 0x40(%rbx), %rcx - movq %rcx, REGOFF_R10(%rsp) - movq 0x48(%rbx), %rcx - movq %rcx, REGOFF_R11(%rsp) - movq %r12, REGOFF_R12(%rsp) - - movq %r13, REGOFF_R13(%rsp) - movq %r14, REGOFF_R14(%rsp) - movq %r15, REGOFF_R15(%rsp) - - xorl %ecx, %ecx - movw %ds, %cx - movq %rcx, REGOFF_DS(%rsp) - movw %es, %cx - movq %rcx, REGOFF_ES(%rsp) - movw %fs, %cx - movq %rcx, REGOFF_FS(%rsp) - movw %gs, %cx - movq %rcx, REGOFF_GS(%rsp) - - movq $0, REGOFF_TRAPNO(%rsp) - - movq $0, REGOFF_ERR(%rsp) - leaq vpanic(%rip), %rcx - movq %rcx, REGOFF_RIP(%rsp) - movw %cs, %cx - movzwq %cx, %rcx - movq %rcx, REGOFF_CS(%rsp) - movq 0x50(%rbx), %rcx - movq %rcx, REGOFF_RFL(%rsp) - movq %rbx, %rcx - addq $0x60, %rcx - movq %rcx, REGOFF_RSP(%rsp) - movw %ss, %cx - movzwq %cx, %rcx - movq %rcx, REGOFF_SS(%rsp) - - /* - * panicsys(format, alist, rp, on_panic_stack) - */ - movq REGOFF_RDI(%rsp), %rdi /* format */ - movq REGOFF_RSI(%rsp), %rsi /* alist */ - movq %rsp, %rdx /* struct regs */ - movl %r11d, %ecx /* on_panic_stack */ - call panicsys - addq $REGSIZE, %rsp -#endif - popq %rdi - popq %rsi - popq %rdx - popq %rcx - popq %r8 - popq %r9 - popq %rax - popq %rbx - popq %r10 - popq %r11 - popfq - leave - ret - END(vpanic) - -/* -void -dtrace_vpanic(const char *format, va_list alist) -*/ - ENTRY(dtrace_vpanic) /* Initial stack layout: */ - - pushq %rbp /* | %rip | 0x60 */ - movq %rsp, %rbp /* | %rbp | 0x58 */ - pushfq /* | rfl | 0x50 */ - pushq %r11 /* | %r11 | 0x48 */ - pushq %r10 /* | %r10 | 0x40 */ - pushq %rbx /* | %rbx | 0x38 */ - pushq %rax /* | %rax | 0x30 */ - pushq %r9 /* | %r9 | 0x28 */ - pushq %r8 /* | %r8 | 0x20 */ - pushq %rcx /* | %rcx | 0x18 */ - pushq %rdx /* | %rdx | 0x10 */ - pushq %rsi /* | %rsi | 0x8 alist */ - pushq %rdi /* | %rdi | 0x0 format */ - - movq %rsp, %rbx /* %rbx = current %rsp */ - - leaq panic_quiesce(%rip), %rdi /* %rdi = &panic_quiesce */ - call dtrace_panic_trigger /* %eax = dtrace_panic_trigger() */ - jmp vpanic_common - - END(dtrace_vpanic) - -/* -int -panic_trigger(int *tp) -*/ - ENTRY(panic_trigger) - xorl %eax, %eax - movl $0xdefacedd, %edx - lock - xchgl %edx, (%rdi) - cmpl $0, %edx - je 0f - movl $0, %eax - ret -0: movl $1, %eax - ret - END(panic_trigger) - -/* -int -dtrace_panic_trigger(int *tp) -*/ - ENTRY(dtrace_panic_trigger) - xorl %eax, %eax - movl $0xdefacedd, %edx - lock - xchgl %edx, (%rdi) - cmpl $0, %edx - je 0f - movl $0, %eax - ret -0: movl $1, %eax - ret - END(dtrace_panic_trigger) diff --git a/sys/cddl/dev/dtrace/arm/dtrace_asm.S b/sys/cddl/dev/dtrace/arm/dtrace_asm.S index ce27b14..06e91d2 100644 --- a/sys/cddl/dev/dtrace/arm/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/arm/dtrace_asm.S @@ -170,24 +170,6 @@ ENTRY(dtrace_copystr) END(dtrace_copystr) /* -void -vpanic(const char *format, va_list alist) -*/ -ENTRY(vpanic) /* Initial stack layout: */ -vpanic_common: - RET -END(vpanic) - -/* -void -dtrace_vpanic(const char *format, va_list alist) -*/ -ENTRY(dtrace_vpanic) /* Initial stack layout: */ - b vpanic - RET -END(dtrace_vpanic) /* Initial stack layout: */ - -/* uintptr_t dtrace_caller(int aframes) */ diff --git a/sys/cddl/dev/dtrace/dtrace_hacks.c b/sys/cddl/dev/dtrace/dtrace_hacks.c index 21da9f8..3f89973 100644 --- a/sys/cddl/dev/dtrace/dtrace_hacks.c +++ b/sys/cddl/dev/dtrace/dtrace_hacks.c @@ -3,9 +3,6 @@ dtrace_cacheid_t dtrace_predcache_id; -int panic_quiesce; -char panic_stack[PANICSTKSIZE]; - boolean_t priv_policy_only(const cred_t *a, int b, boolean_t c) { diff --git a/sys/cddl/dev/dtrace/i386/dtrace_asm.S b/sys/cddl/dev/dtrace/i386/dtrace_asm.S index 6338719..d44f6c3 100644 --- a/sys/cddl/dev/dtrace/i386/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/i386/dtrace_asm.S @@ -355,167 +355,3 @@ void dtrace_interrupt_enable(dtrace_icookie_t cookie) popfl ret END(dtrace_interrupt_enable) - -/* - * The panic() and cmn_err() functions invoke vpanic() as a common entry point - * into the panic code implemented in panicsys(). vpanic() is responsible - * for passing through the format string and arguments, and constructing a - * regs structure on the stack into which it saves the current register - * values. If we are not dying due to a fatal trap, these registers will - * then be preserved in panicbuf as the current processor state. Before - * invoking panicsys(), vpanic() activates the first panic trigger (see - * common/os/panic.c) and switches to the panic_stack if successful. Note that - * DTrace takes a slightly different panic path if it must panic from probe - * context. Instead of calling panic, it calls into dtrace_vpanic(), which - * sets up the initial stack as vpanic does, calls dtrace_panic_trigger(), and - * branches back into vpanic(). - */ -/* -void vpanic(const char *format, va_list alist) -*/ - ENTRY(vpanic) /* Initial stack layout: */ - - pushl %ebp /* | %eip | 20 */ - movl %esp, %ebp /* | %ebp | 16 */ - pushl %eax /* | %eax | 12 */ - pushl %ebx /* | %ebx | 8 */ - pushl %ecx /* | %ecx | 4 */ - pushl %edx /* | %edx | 0 */ - - movl %esp, %ebx /* %ebx = current stack pointer */ - - lea panic_quiesce, %eax /* %eax = &panic_quiesce */ - pushl %eax /* push &panic_quiesce */ - call panic_trigger /* %eax = panic_trigger() */ - addl $4, %esp /* reset stack pointer */ - -vpanic_common: - cmpl $0, %eax /* if (%eax == 0) */ - je 0f /* goto 0f; */ - - /* - * If panic_trigger() was successful, we are the first to initiate a - * panic: we now switch to the reserved panic_stack before continuing. - */ - lea panic_stack, %esp /* %esp = panic_stack */ - addl $PANICSTKSIZE, %esp /* %esp += PANICSTKSIZE */ - -0: subl $REGSIZE, %esp /* allocate struct regs */ - - /* - * Now that we've got everything set up, store the register values as - * they were when we entered vpanic() to the designated location in - * the regs structure we allocated on the stack. - */ -#ifdef notyet - mov %gs, %edx - mov %edx, REGOFF_GS(%esp) - mov %fs, %edx - mov %edx, REGOFF_FS(%esp) - mov %es, %edx - mov %edx, REGOFF_ES(%esp) - mov %ds, %edx - mov %edx, REGOFF_DS(%esp) - movl %edi, REGOFF_EDI(%esp) - movl %esi, REGOFF_ESI(%esp) - movl 16(%ebx), %ecx - movl %ecx, REGOFF_EBP(%esp) - movl %ebx, %ecx - addl $20, %ecx - movl %ecx, REGOFF_ESP(%esp) - movl 8(%ebx), %ecx - movl %ecx, REGOFF_EBX(%esp) - movl 0(%ebx), %ecx - movl %ecx, REGOFF_EDX(%esp) - movl 4(%ebx), %ecx - movl %ecx, REGOFF_ECX(%esp) - movl 12(%ebx), %ecx - movl %ecx, REGOFF_EAX(%esp) - movl $0, REGOFF_TRAPNO(%esp) - movl $0, REGOFF_ERR(%esp) - lea vpanic, %ecx - movl %ecx, REGOFF_EIP(%esp) - mov %cs, %edx - movl %edx, REGOFF_CS(%esp) - pushfl - popl %ecx - movl %ecx, REGOFF_EFL(%esp) - movl $0, REGOFF_UESP(%esp) - mov %ss, %edx - movl %edx, REGOFF_SS(%esp) - - movl %esp, %ecx /* %ecx = ®s */ - pushl %eax /* push on_panic_stack */ - pushl %ecx /* push ®s */ - movl 12(%ebp), %ecx /* %ecx = alist */ - pushl %ecx /* push alist */ - movl 8(%ebp), %ecx /* %ecx = format */ - pushl %ecx /* push format */ - call panicsys /* panicsys(); */ - addl $16, %esp /* pop arguments */ - - addl $REGSIZE, %esp -#endif - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - END(vpanic) - -/* -void dtrace_vpanic(const char *format, va_list alist) -*/ - ENTRY(dtrace_vpanic) /* Initial stack layout: */ - - pushl %ebp /* | %eip | 20 */ - movl %esp, %ebp /* | %ebp | 16 */ - pushl %eax /* | %eax | 12 */ - pushl %ebx /* | %ebx | 8 */ - pushl %ecx /* | %ecx | 4 */ - pushl %edx /* | %edx | 0 */ - - movl %esp, %ebx /* %ebx = current stack pointer */ - - lea panic_quiesce, %eax /* %eax = &panic_quiesce */ - pushl %eax /* push &panic_quiesce */ - call dtrace_panic_trigger /* %eax = dtrace_panic_trigger() */ - addl $4, %esp /* reset stack pointer */ - jmp vpanic_common /* jump back to common code */ - - END(dtrace_vpanic) - -/* -int -panic_trigger(int *tp) -*/ - ENTRY(panic_trigger) - xorl %eax, %eax - movl $0xdefacedd, %edx - lock - xchgl %edx, (%edi) - cmpl $0, %edx - je 0f - movl $0, %eax - ret -0: movl $1, %eax - ret - END(panic_trigger) - -/* -int -dtrace_panic_trigger(int *tp) -*/ - ENTRY(dtrace_panic_trigger) - xorl %eax, %eax - movl $0xdefacedd, %edx - lock - xchgl %edx, (%edi) - cmpl $0, %edx - je 0f - movl $0, %eax - ret -0: movl $1, %eax - ret - END(dtrace_panic_trigger) diff --git a/sys/cddl/dev/dtrace/mips/dtrace_asm.S b/sys/cddl/dev/dtrace/mips/dtrace_asm.S index 50f6c54..199d16be 100644 --- a/sys/cddl/dev/dtrace/mips/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/mips/dtrace_asm.S @@ -249,49 +249,6 @@ LEAF(dtrace_invop_uninit) END(dtrace_invop_uninit) /* - * The panic() and cmn_err() functions invoke vpanic() as a common entry point - * into the panic code implemented in panicsys(). vpanic() is responsible - * for passing through the format string and arguments, and constructing a - * regs structure on the stack into which it saves the current register - * values. If we are not dying due to a fatal trap, these registers will - * then be preserved in panicbuf as the current processor state. Before - * invoking panicsys(), vpanic() activates the first panic trigger (see - * common/os/panic.c) and switches to the panic_stack if successful. Note that - * DTrace takes a slightly different panic path if it must panic from probe - * context. Instead of calling panic, it calls into dtrace_vpanic(), which - * sets up the initial stack as vpanic does, calls dtrace_panic_trigger(), and - * branches back into vpanic(). - */ - -/* -void -vpanic(const char *format, va_list alist) -*/ -LEAF(vpanic) /* Initial stack layout: */ - -vpanic_common: - j ra - nop -END(vpanic) - - - -/* -void -dtrace_vpanic(const char *format, va_list alist) -*/ -LEAF(dtrace_vpanic) /* Initial stack layout: */ - -#if 0 - jal dtrace_panic_trigger /* %eax = dtrace_panic_trigger() */ - nop -#endif - j vpanic_common - nop - -END(dtrace_vpanic) - -/* uintptr_t dtrace_caller(int aframes) */ @@ -300,4 +257,3 @@ LEAF(dtrace_caller) j ra nop END(dtrace_caller) - diff --git a/sys/cddl/dev/dtrace/powerpc/dtrace_asm.S b/sys/cddl/dev/dtrace/powerpc/dtrace_asm.S index 5676360..3adfbe3 100644 --- a/sys/cddl/dev/dtrace/powerpc/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/powerpc/dtrace_asm.S @@ -161,45 +161,6 @@ ASENTRY_NOPROF(dtrace_copystr) END(dtrace_copystr) /* - * The panic() and cmn_err() functions invoke vpanic() as a common entry point - * into the panic code implemented in panicsys(). vpanic() is responsible - * for passing through the format string and arguments, and constructing a - * regs structure on the stack into which it saves the current register - * values. If we are not dying due to a fatal trap, these registers will - * then be preserved in panicbuf as the current processor state. Before - * invoking panicsys(), vpanic() activates the first panic trigger (see - * common/os/panic.c) and switches to the panic_stack if successful. Note that - * DTrace takes a slightly different panic path if it must panic from probe - * context. Instead of calling panic, it calls into dtrace_vpanic(), which - * sets up the initial stack as vpanic does, calls dtrace_panic_trigger(), and - * branches back into vpanic(). - */ - -/* -void -vpanic(const char *format, va_list alist) -*/ -ASENTRY_NOPROF(vpanic) /* Initial stack layout: */ - -vpanic_common: - blr -END(vpanic) - - - -/* -void -dtrace_vpanic(const char *format, va_list alist) -*/ -ASENTRY_NOPROF(dtrace_vpanic) /* Initial stack layout: */ - -#if 0 - bl dtrace_panic_trigger /* %eax = dtrace_panic_trigger() */ -#endif - b vpanic_common -END(dtrace_vpanic) - -/* uintptr_t dtrace_caller(int aframes) */ @@ -207,4 +168,3 @@ ASENTRY_NOPROF(dtrace_caller) li %r3, -1 blr END(dtrace_caller) - diff --git a/sys/cddl/dev/profile/profile.c b/sys/cddl/dev/profile/profile.c index 338a958..d31d5c8 100644 --- a/sys/cddl/dev/profile/profile.c +++ b/sys/cddl/dev/profile/profile.c @@ -134,8 +134,10 @@ struct profile_probe_percpu; #endif #ifdef __arm__ -/* bogus */ -#define PROF_ARTIFICIAL_FRAMES 9 +/* + * At least on ARMv7, this appears to work quite well. + */ +#define PROF_ARTIFICIAL_FRAMES 10 #endif typedef struct profile_probe { diff --git a/sys/compat/linux/linux_getcwd.c b/sys/compat/linux/linux_getcwd.c index 7c0b9ba..ebcf970 100644 --- a/sys/compat/linux/linux_getcwd.c +++ b/sys/compat/linux/linux_getcwd.c @@ -2,11 +2,15 @@ /* $NetBSD: vfs_getcwd.c,v 1.3.2.3 1999/07/11 10:24:09 sommerfeld Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. + * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Bill Sommerfeld. * + * Portions of this software were developed by Edward Tomasz Napierala + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -36,19 +40,9 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/namei.h> -#include <sys/filedesc.h> -#include <sys/kernel.h> -#include <sys/file.h> -#include <sys/stat.h> #include <sys/syscallsubr.h> -#include <sys/vnode.h> -#include <sys/mount.h> #include <sys/proc.h> -#include <sys/uio.h> #include <sys/malloc.h> -#include <sys/dirent.h> -#include <ufs/ufs/dir.h> /* XXX only for DIRBLKSIZ */ #ifdef COMPAT_LINUX32 #include <machine/../linux32/linux.h> @@ -60,408 +54,37 @@ __FBSDID("$FreeBSD$"); #include <compat/linux/linux_misc.h> #include <compat/linux/linux_util.h> -#include <security/mac/mac_framework.h> - -static int -linux_getcwd_scandir(struct vnode **, struct vnode **, - char **, char *, struct thread *); -static int -linux_getcwd_common(struct vnode *, struct vnode *, - char **, char *, int, int, struct thread *); - -#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) - -/* - * Vnode variable naming conventions in this file: - * - * rvp: the current root we're aiming towards. - * lvp, *lvpp: the "lower" vnode - * uvp, *uvpp: the "upper" vnode. - * - * Since all the vnodes we're dealing with are directories, and the - * lookups are going *up* in the filesystem rather than *down*, the - * usual "pvp" (parent) or "dvp" (directory) naming conventions are - * too confusing. - */ - -/* - * XXX Will infinite loop in certain cases if a directory read reliably - * returns EINVAL on last block. - * XXX is EINVAL the right thing to return if a directory is malformed? - */ - -/* - * XXX Untested vs. mount -o union; probably does the wrong thing. - */ - -/* - * Find parent vnode of *lvpp, return in *uvpp - * - * If we care about the name, scan it looking for name of directory - * entry pointing at lvp. - * - * Place the name in the buffer which starts at bufp, immediately - * before *bpp, and move bpp backwards to point at the start of it. - * - * On entry, *lvpp is a locked vnode reference; on exit, it is vput and NULL'ed - * On exit, *uvpp is either NULL or is a locked vnode reference. - */ -static int -linux_getcwd_scandir(lvpp, uvpp, bpp, bufp, td) - struct vnode **lvpp; - struct vnode **uvpp; - char **bpp; - char *bufp; - struct thread *td; -{ - int error = 0; - int eofflag; - off_t off; - int tries; - struct uio uio; - struct iovec iov; - char *dirbuf = NULL; - int dirbuflen; - ino_t fileno; - struct vattr va; - struct vnode *uvp = NULL; - struct vnode *lvp = *lvpp; - struct componentname cn; - int len, reclen; - tries = 0; - - /* - * If we want the filename, get some info we need while the - * current directory is still locked. - */ - if (bufp != NULL) { - error = VOP_GETATTR(lvp, &va, td->td_ucred); - if (error) { - vput(lvp); - *lvpp = NULL; - *uvpp = NULL; - return error; - } - } - - /* - * Ok, we have to do it the hard way.. - * Next, get parent vnode using lookup of .. - */ - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY; - cn.cn_thread = td; - cn.cn_cred = td->td_ucred; - cn.cn_pnbuf = NULL; - cn.cn_nameptr = ".."; - cn.cn_namelen = 2; - cn.cn_consume = 0; - cn.cn_lkflags = LK_SHARED; - - /* - * At this point, lvp is locked and will be unlocked by the lookup. - * On successful return, *uvpp will be locked - */ -#ifdef MAC - error = mac_vnode_check_lookup(td->td_ucred, lvp, &cn); - if (error == 0) -#endif - error = VOP_LOOKUP(lvp, uvpp, &cn); - if (error) { - vput(lvp); - *lvpp = NULL; - *uvpp = NULL; - return error; - } - uvp = *uvpp; - - /* If we don't care about the pathname, we're done */ - if (bufp == NULL) { - vput(lvp); - *lvpp = NULL; - return 0; - } - - fileno = va.va_fileid; - - dirbuflen = DIRBLKSIZ; - if (dirbuflen < va.va_blocksize) - dirbuflen = va.va_blocksize; - dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); - -#if 0 -unionread: -#endif - off = 0; - do { - /* call VOP_READDIR of parent */ - iov.iov_base = dirbuf; - iov.iov_len = dirbuflen; - - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = off; - uio.uio_resid = dirbuflen; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - uio.uio_td = td; - - eofflag = 0; - -#ifdef MAC - error = mac_vnode_check_readdir(td->td_ucred, uvp); - if (error == 0) -#endif /* MAC */ - error = VOP_READDIR(uvp, &uio, td->td_ucred, &eofflag, - 0, 0); - - off = uio.uio_offset; - - /* - * Try again if NFS tosses its cookies. - * XXX this can still loop forever if the directory is busted - * such that the second or subsequent page of it always - * returns EINVAL - */ - if ((error == EINVAL) && (tries < 3)) { - off = 0; - tries++; - continue; /* once more, with feeling */ - } - - if (!error) { - char *cpos; - struct dirent *dp; - - cpos = dirbuf; - tries = 0; - - /* scan directory page looking for matching vnode */ - for (len = (dirbuflen - uio.uio_resid); len > 0; len -= reclen) { - dp = (struct dirent *) cpos; - reclen = dp->d_reclen; - - /* check for malformed directory.. */ - if (reclen < DIRENT_MINSIZE) { - error = EINVAL; - goto out; - } - /* - * XXX should perhaps do VOP_LOOKUP to - * check that we got back to the right place, - * but getting the locking games for that - * right would be heinous. - */ - if ((dp->d_type != DT_WHT) && - (dp->d_fileno == fileno)) { - char *bp = *bpp; - bp -= dp->d_namlen; - - if (bp <= bufp) { - error = ERANGE; - goto out; - } - bcopy(dp->d_name, bp, dp->d_namlen); - error = 0; - *bpp = bp; - goto out; - } - cpos += reclen; - } - } - } while (!eofflag); - error = ENOENT; - -out: - vput(lvp); - *lvpp = NULL; - free(dirbuf, M_TEMP); - return error; -} - - -/* - * common routine shared by sys___getcwd() and linux_vn_isunder() - */ - -#define GETCWD_CHECK_ACCESS 0x0001 - -static int -linux_getcwd_common (lvp, rvp, bpp, bufp, limit, flags, td) - struct vnode *lvp; - struct vnode *rvp; - char **bpp; - char *bufp; - int limit; - int flags; - struct thread *td; -{ - struct filedesc *fdp = td->td_proc->p_fd; - struct vnode *uvp = NULL; - char *bp = NULL; - int error; - accmode_t accmode = VEXEC; - - if (rvp == NULL) { - rvp = fdp->fd_rdir; - if (rvp == NULL) - rvp = rootvnode; - } - - VREF(rvp); - VREF(lvp); - - /* - * Error handling invariant: - * Before a `goto out': - * lvp is either NULL, or locked and held. - * uvp is either NULL, or locked and held. - */ - - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) - panic("vn_lock LK_RETRY returned error %d", error); - if (bufp) - bp = *bpp; - /* - * this loop will terminate when one of the following happens: - * - we hit the root - * - getdirentries or lookup fails - * - we run out of space in the buffer. - */ - if (lvp == rvp) { - if (bp) - *(--bp) = '/'; - goto out; - } - do { - if (lvp->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - - /* - * access check here is optional, depending on - * whether or not caller cares. - */ - if (flags & GETCWD_CHECK_ACCESS) { - error = VOP_ACCESS(lvp, accmode, td->td_ucred, td); - if (error) - goto out; - accmode = VEXEC|VREAD; - } - - /* - * step up if we're a covered vnode.. - */ - while (lvp->v_vflag & VV_ROOT) { - struct vnode *tvp; - - if (lvp == rvp) - goto out; - - tvp = lvp; - lvp = lvp->v_mount->mnt_vnodecovered; - vput(tvp); - /* - * hodie natus est radici frater - */ - if (lvp == NULL) { - error = ENOENT; - goto out; - } - VREF(lvp); - error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); - if (error != 0) - panic("vn_lock LK_RETRY returned %d", error); - } - error = linux_getcwd_scandir(&lvp, &uvp, &bp, bufp, td); - if (error) - goto out; -#ifdef DIAGNOSTIC - if (lvp != NULL) - panic("getcwd: oops, forgot to null lvp"); - if (bufp && (bp <= bufp)) { - panic("getcwd: oops, went back too far"); - } -#endif - if (bp) - *(--bp) = '/'; - lvp = uvp; - uvp = NULL; - limit--; - } while ((lvp != rvp) && (limit > 0)); - -out: - if (bpp) - *bpp = bp; - if (uvp) - vput(uvp); - if (lvp) - vput(lvp); - vrele(rvp); - return error; -} - - /* * Find pathname of process's current directory. - * - * Use vfs vnode-to-name reverse cache; if that fails, fall back - * to reading directory contents. */ - int linux_getcwd(struct thread *td, struct linux_getcwd_args *args) { - char *bp, *bend, *path; - int error, len, lenused; + char *path; + int error, lenused; #ifdef DEBUG if (ldebug(getcwd)) printf(ARGS(getcwd, "%p, %ld"), args->buf, (long)args->bufsize); #endif - len = args->bufsize; - - if (len > LINUX_PATH_MAX) - len = LINUX_PATH_MAX; - else if (len < 2) - return ERANGE; + /* + * Linux returns ERANGE instead of EINVAL. + */ + if (args->bufsize < 2) + return (ERANGE); - path = malloc(len, M_TEMP, M_WAITOK); + path = malloc(LINUX_PATH_MAX, M_TEMP, M_WAITOK); - error = kern___getcwd(td, path, UIO_SYSSPACE, len, LINUX_PATH_MAX); - if (!error) { + error = kern___getcwd(td, path, UIO_SYSSPACE, args->bufsize, + LINUX_PATH_MAX); + if (error == 0) { lenused = strlen(path) + 1; - if (lenused <= args->bufsize) { + error = copyout(path, args->buf, lenused); + if (error == 0) td->td_retval[0] = lenused; - error = copyout(path, args->buf, lenused); - } - else - error = ERANGE; - } else { - bp = &path[len]; - bend = bp; - *(--bp) = '\0'; - - /* - * 5th argument here is "max number of vnodes to traverse". - * Since each entry takes up at least 2 bytes in the output buffer, - * limit it to N/2 vnodes for an N byte buffer. - */ - - error = linux_getcwd_common (td->td_proc->p_fd->fd_cdir, NULL, - &bp, path, len/2, GETCWD_CHECK_ACCESS, td); - if (error) - goto out; - lenused = bend - bp; - td->td_retval[0] = lenused; - /* put the result into user buffer */ - error = copyout(bp, args->buf, lenused); } -out: + free(path, M_TEMP); return (error); } - diff --git a/sys/conf/files b/sys/conf/files index ce28fa1..caaed94 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -584,7 +584,6 @@ dev/acpica/acpi_button.c optional acpi dev/acpica/acpi_cmbat.c optional acpi dev/acpica/acpi_cpu.c optional acpi dev/acpica/acpi_ec.c optional acpi -dev/acpica/acpi_hpet.c optional acpi dev/acpica/acpi_isab.c optional acpi isa dev/acpica/acpi_lid.c optional acpi dev/acpica/acpi_package.c optional acpi @@ -1780,6 +1779,8 @@ dev/ixgbe/ixgbe_82599.c optional ix ixv inet \ compile-with "${NORMAL_C} -I$S/dev/ixgbe" dev/ixgbe/ixgbe_x540.c optional ix ixv inet \ compile-with "${NORMAL_C} -I$S/dev/ixgbe" +dev/ixgbe/ixgbe_x550.c optional ix ixv inet \ + compile-with "${NORMAL_C} -I$S/dev/ixgbe" dev/ixgbe/ixgbe_dcb.c optional ix ixv inet \ compile-with "${NORMAL_C} -I$S/dev/ixgbe" dev/ixgbe/ixgbe_dcb_82598.c optional ix ixv inet \ @@ -2677,24 +2678,24 @@ wpi.fw optional wpifw \ clean "wpi.fw" dev/xe/if_xe.c optional xe dev/xe/if_xe_pccard.c optional xe pccard -dev/xen/balloon/balloon.c optional xen | xenhvm -dev/xen/blkfront/blkfront.c optional xen | xenhvm -dev/xen/blkback/blkback.c optional xen | xenhvm -dev/xen/console/console.c optional xen | xenhvm -dev/xen/console/xencons_ring.c optional xen | xenhvm -dev/xen/control/control.c optional xen | xenhvm -dev/xen/grant_table/grant_table.c optional xen | xenhvm -dev/xen/netback/netback.c optional xen | xenhvm -dev/xen/netfront/netfront.c optional xen | xenhvm +dev/xen/balloon/balloon.c optional xenhvm +dev/xen/blkfront/blkfront.c optional xenhvm +dev/xen/blkback/blkback.c optional xenhvm +dev/xen/console/console.c optional xenhvm +dev/xen/console/xencons_ring.c optional xenhvm +dev/xen/control/control.c optional xenhvm +dev/xen/grant_table/grant_table.c optional xenhvm +dev/xen/netback/netback.c optional xenhvm +dev/xen/netfront/netfront.c optional xenhvm dev/xen/xenpci/xenpci.c optional xenpci -dev/xen/timer/timer.c optional xen | xenhvm -dev/xen/pvcpu/pvcpu.c optional xen | xenhvm -dev/xen/xenstore/xenstore.c optional xen | xenhvm -dev/xen/xenstore/xenstore_dev.c optional xen | xenhvm -dev/xen/xenstore/xenstored_dev.c optional xen | xenhvm -dev/xen/evtchn/evtchn_dev.c optional xen | xenhvm -dev/xen/privcmd/privcmd.c optional xen | xenhvm -dev/xen/debug/debug.c optional xen | xenhvm +dev/xen/timer/timer.c optional xenhvm +dev/xen/pvcpu/pvcpu.c optional xenhvm +dev/xen/xenstore/xenstore.c optional xenhvm +dev/xen/xenstore/xenstore_dev.c optional xenhvm +dev/xen/xenstore/xenstored_dev.c optional xenhvm +dev/xen/evtchn/evtchn_dev.c optional xenhvm +dev/xen/privcmd/privcmd.c optional xenhvm +dev/xen/debug/debug.c optional xenhvm dev/xl/if_xl.c optional xl pci dev/xl/xlphy.c optional xl pci fs/autofs/autofs.c optional autofs @@ -3262,6 +3263,9 @@ libkern/strtoul.c standard libkern/strtouq.c standard libkern/strvalid.c standard libkern/timingsafe_bcmp.c standard +libkern/zlib.c optional crypto | geom_uzip | ipsec | \ + mxge | netgraph_deflate | \ + ddb_ctf | gzio | geom_uncompress net/altq/altq_cbq.c optional altq net/altq/altq_cdnr.c optional altq net/altq/altq_hfsc.c optional altq @@ -3324,9 +3328,6 @@ net/slcompress.c optional netgraph_vjc | sppp | \ netgraph_sppp net/toeplitz.c optional inet rss | inet6 rss net/vnet.c optional vimage -net/zlib.c optional crypto | geom_uzip | ipsec | \ - mxge | netgraph_deflate | \ - ddb_ctf | gzio | geom_uncompress net80211/ieee80211.c optional wlan net80211/ieee80211_acl.c optional wlan wlan_acl net80211/ieee80211_action.c optional wlan @@ -3503,6 +3504,7 @@ netinet/sctp_sysctl.c optional inet sctp | inet6 sctp netinet/sctp_timer.c optional inet sctp | inet6 sctp netinet/sctp_usrreq.c optional inet sctp | inet6 sctp netinet/sctputil.c optional inet sctp | inet6 sctp +netinet/siftr.c optional inet siftr alq | inet6 siftr alq netinet/tcp_debug.c optional tcpdebug netinet/tcp_hostcache.c optional inet | inet6 netinet/tcp_input.c optional inet | inet6 @@ -4043,13 +4045,13 @@ vm/vm_reserv.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard vm/vnode_pager.c standard -xen/features.c optional xen | xenhvm -xen/xenbus/xenbus_if.m optional xen | xenhvm -xen/xenbus/xenbus.c optional xen | xenhvm -xen/xenbus/xenbusb_if.m optional xen | xenhvm -xen/xenbus/xenbusb.c optional xen | xenhvm -xen/xenbus/xenbusb_front.c optional xen | xenhvm -xen/xenbus/xenbusb_back.c optional xen | xenhvm +xen/features.c optional xenhvm +xen/xenbus/xenbus_if.m optional xenhvm +xen/xenbus/xenbus.c optional xenhvm +xen/xenbus/xenbusb_if.m optional xenhvm +xen/xenbus/xenbusb.c optional xenhvm +xen/xenbus/xenbusb_front.c optional xenhvm +xen/xenbus/xenbusb_back.c optional xenhvm xdr/xdr.c optional krpc | nfslockd | nfscl | nfsd xdr/xdr_array.c optional krpc | nfslockd | nfscl | nfsd xdr/xdr_mbuf.c optional krpc | nfslockd | nfscl | nfsd diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index ae71c39..8aadcf5 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -145,6 +145,7 @@ crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/acpica/acpi_if.m standard +dev/acpica/acpi_hpet.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/agp/agp_amd64.c optional agp dev/agp/agp_i810.c optional agp @@ -513,10 +514,10 @@ compat/ndis/winx64_wrap.S optional ndisapi pci libkern/memmove.c standard libkern/memset.c standard # -# x86 real mode BIOS emulator, required by atkbdc/dpms/pci/vesa +# x86 real mode BIOS emulator, required by dpms/pci/vesa # -compat/x86bios/x86bios.c optional x86bios | atkbd | dpms | pci | vesa -contrib/x86emu/x86emu.c optional x86bios | atkbd | dpms | pci | vesa +compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa +contrib/x86emu/x86emu.c optional x86bios | dpms | pci | vesa # # bvm console # @@ -569,13 +570,14 @@ x86/x86/local_apic.c standard x86/x86/mca.c standard x86/x86/mptable.c optional mptable x86/x86/mptable_pci.c optional mptable pci +x86/x86/mp_x86.c optional smp x86/x86/msi.c optional pci x86/x86/nexus.c standard x86/x86/pvclock.c standard x86/x86/tsc.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm -x86/xen/xen_intr.c optional xen | xenhvm +x86/xen/xen_intr.c optional xenhvm x86/xen/pv.c optional xenhvm x86/xen/pvcpu_enum.c optional xenhvm x86/xen/xen_apic.c optional xenhvm diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 758c9a6..f80fecd 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -10,6 +10,10 @@ arm64/arm64/clock.c standard arm64/arm64/copyinout.S standard arm64/arm64/copystr.c standard arm64/arm64/cpufunc_asm.S standard +arm64/arm64/db_disasm.c optional ddb +arm64/arm64/db_interface.c optional ddb +arm64/arm64/db_trace.c optional ddb +arm64/arm64/debug_monitor.c optional kdb arm64/arm64/dump_machdep.c standard arm64/arm64/elf_machdep.c standard arm64/arm64/exception.S standard diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index f072247..68dd6a9 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -285,6 +285,7 @@ dev/uart/uart_cpu_x86.c optional uart dev/viawd/viawd.c optional viawd dev/vmware/vmxnet3/if_vmx.c optional vmx dev/acpica/acpi_if.m standard +dev/acpica/acpi_hpet.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/wbwd/wbwd.c optional wbwd dev/wpi/if_wpi.c optional wpi @@ -427,16 +428,15 @@ i386/bios/smapi_bios.S optional smapi i386/i386/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/i386/autoconf.c standard -i386/i386/bios.c optional native -i386/i386/bioscall.s optional native +i386/i386/bios.c standard +i386/i386/bioscall.s standard i386/i386/bpf_jit_machdep.c optional bpf_jitter i386/i386/db_disasm.c optional ddb i386/i386/db_interface.c optional ddb i386/i386/db_trace.c optional ddb i386/i386/elan-mmcr.c optional cpu_elan | cpu_soekris i386/i386/elf_machdep.c standard -i386/i386/exception.s optional native -i386/xen/exception.s optional xen +i386/i386/exception.s standard i386/i386/gdb_machdep.c optional gdb i386/i386/geode.c optional cpu_geode i386/i386/i686_mem.c optional mem @@ -444,22 +444,17 @@ i386/i386/in_cksum.c optional inet | inet6 i386/i386/initcpu.c standard i386/i386/io.c optional io i386/i386/k6_mem.c optional mem -i386/i386/locore.s optional native no-obj -i386/xen/locore.s optional xen no-obj +i386/i386/locore.s standard no-obj i386/i386/longrun.c optional cpu_enable_longrun i386/i386/machdep.c standard -i386/xen/xen_machdep.c optional xen i386/i386/mem.c optional mem i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp -i386/i386/mp_machdep.c optional native smp -i386/xen/mp_machdep.c optional xen smp +i386/i386/mp_machdep.c optional smp i386/i386/mp_watchdog.c optional mp_watchdog smp -i386/i386/mpboot.s optional smp native -i386/xen/mptable.c optional apic xen +i386/i386/mpboot.s optional smp i386/i386/perfmon.c optional perfmon -i386/i386/pmap.c optional native -i386/xen/pmap.c optional xen +i386/i386/pmap.c standard i386/i386/ptrace_machdep.c standard i386/i386/stack_machdep.c optional ddb | stack i386/i386/support.s standard @@ -488,7 +483,6 @@ i386/ibcs2/ibcs2_util.c optional ibcs2 i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 -i386/xen/clock.c optional xen i386/isa/elink.c optional ep | ie i386/isa/npx.c optional npx i386/isa/pmtimer.c optional pmtimer @@ -531,9 +525,9 @@ i386/xbox/xboxfb.c optional xboxfb dev/fb/boot_font.c optional xboxfb i386/xbox/pic16l.s optional xbox # -# x86 real mode BIOS support, required by atkbdc/dpms/pci/vesa +# x86 real mode BIOS support, required by dpms/pci/vesa # -compat/x86bios/x86bios.c optional x86bios | atkbd | dpms | pci | vesa +compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa # # bvm console # @@ -565,9 +559,9 @@ x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic -x86/isa/atrtc.c optional native -x86/isa/clock.c optional native -x86/isa/elcr.c optional atpic | apic native +x86/isa/atrtc.c standard +x86/isa/clock.c standard +x86/isa/elcr.c optional atpic | apic x86/isa/isa.c optional isa x86/isa/isa_dma.c optional isa x86/isa/nmi.c standard @@ -582,19 +576,20 @@ x86/x86/fdt_machdep.c optional fdt x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/io_apic.c optional apic -x86/x86/legacy.c optional native +x86/x86/legacy.c standard x86/x86/local_apic.c optional apic x86/x86/mca.c standard -x86/x86/mptable.c optional apic native -x86/x86/mptable_pci.c optional apic native pci +x86/x86/mptable.c optional apic +x86/x86/mptable_pci.c optional apic pci +x86/x86/mp_x86.c optional smp x86/x86/msi.c optional apic pci x86/x86/nexus.c standard x86/x86/tsc.c standard x86/x86/pvclock.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm -x86/xen/xen_intr.c optional xen | xenhvm +x86/xen/xen_intr.c optional xenhvm x86/xen/xen_apic.c optional xenhvm -x86/xen/xenpv.c optional xen | xenhvm -x86/xen/xen_nexus.c optional xen | xenhvm -x86/xen/xen_msi.c optional xen | xenhvm +x86/xen/xenpv.c optional xenhvm +x86/xen/xen_nexus.c optional xenhvm +x86/xen/xen_msi.c optional xenhvm diff --git a/sys/conf/files.pc98 b/sys/conf/files.pc98 index f95d0bb..ae165fc 100644 --- a/sys/conf/files.pc98 +++ b/sys/conf/files.pc98 @@ -256,6 +256,7 @@ x86/x86/io_apic.c optional apic x86/x86/legacy.c standard x86/x86/local_apic.c optional apic x86/x86/mca.c standard +x86/x86/mp_x86.c optional smp x86/x86/mptable.c optional apic x86/x86/mptable_pci.c optional apic pci x86/x86/msi.c optional apic pci diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index ab585b0..92f4101 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -97,17 +97,16 @@ libkern/udivdi3.c optional powerpc libkern/umoddi3.c optional powerpc powerpc/aim/interrupt.c optional aim powerpc/aim/locore.S optional aim no-obj -powerpc/aim/machdep.c optional aim +powerpc/aim/aim_machdep.c optional aim powerpc/aim/mmu_oea.c optional aim powerpc powerpc/aim/mmu_oea64.c optional aim powerpc/aim/moea64_if.m optional aim powerpc/aim/moea64_native.c optional aim powerpc/aim/mp_cpudep.c optional aim powerpc/aim/slb.c optional aim powerpc64 -powerpc/aim/uma_machdep.c optional aim powerpc/booke/interrupt.c optional booke powerpc/booke/locore.S optional booke no-obj -powerpc/booke/machdep.c optional booke +powerpc/booke/booke_machdep.c optional booke powerpc/booke/machdep_e500.c optional booke_e500 powerpc/booke/mp_cpudep.c optional booke smp powerpc/booke/platform_bare.c optional booke @@ -195,6 +194,7 @@ powerpc/powerpc/gdb_machdep.c optional gdb powerpc/powerpc/in_cksum.c optional inet | inet6 powerpc/powerpc/intr_machdep.c standard powerpc/powerpc/iommu_if.m standard +powerpc/powerpc/machdep.c standard powerpc/powerpc/mem.c optional mem powerpc/powerpc/mmu_if.m standard powerpc/powerpc/mp_machdep.c optional smp @@ -217,6 +217,7 @@ powerpc/powerpc/syncicache.c standard powerpc/powerpc/sys_machdep.c standard powerpc/powerpc/trap.c standard powerpc/powerpc/uio_machdep.c standard +powerpc/powerpc/uma_machdep.c standard powerpc/powerpc/vm_machdep.c standard powerpc/ps3/ehci_ps3.c optional ps3 ehci powerpc/ps3/ohci_ps3.c optional ps3 ohci diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk index 9a7ad61..8e76268 100644 --- a/sys/conf/kern.mk +++ b/sys/conf/kern.mk @@ -187,7 +187,7 @@ CFLAGS+= -fstack-protector CFLAGS+= -gdwarf-2 .endif -CFLAGS+= ${CWARNEXTRA} ${CWARNFLAGS} ${CWARNFLAGS.${.IMPSRC:T}} +CFLAGS+= ${CWARNFLAGS} ${CWARNFLAGS.${.IMPSRC:T}} CFLAGS+= ${CFLAGS.${COMPILER_TYPE}} ${CFLAGS.${.IMPSRC:T}} # Tell bmake not to mistake standard targets for things to be searched for diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index 99ada05..8c3b9c6 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -87,7 +87,7 @@ INCLUDES+= -I$S/dev/cxgb -I$S/dev/cxgbe .endif -CFLAGS= ${COPTFLAGS} ${DEBUG} ${CWARNFLAGS} +CFLAGS= ${COPTFLAGS} ${DEBUG} CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100 CFLAGS_PARAM_LARGE_FUNCTION_GROWTH?=1000 diff --git a/sys/conf/options b/sys/conf/options index 2405442..b698717 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -432,6 +432,7 @@ ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG +SIFTR TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h @@ -928,6 +929,7 @@ IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h +RACCT_DISABLED opt_global.h # Resource Limits RCTL opt_global.h diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 index f1d4b4a..0e59187 100644 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -63,5 +63,7 @@ BPF_JITTER opt_bpf.h XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h diff --git a/sys/conf/options.arm b/sys/conf/options.arm index 239be2a..b712b02 100644 --- a/sys/conf/options.arm +++ b/sys/conf/options.arm @@ -40,6 +40,9 @@ PV_STATS opt_pmap.h QEMU_WORKAROUNDS opt_global.h SOC_BCM2835 opt_global.h SOC_BCM2836 opt_global.h +SOC_IMX51 opt_global.h +SOC_IMX53 opt_global.h +SOC_IMX6 opt_global.h SOC_MV_ARMADAXP opt_global.h SOC_MV_DISCOVERY opt_global.h SOC_MV_DOVE opt_global.h diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 index fff47ab..1dfcbec 100644 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -1,4 +1,6 @@ # $FreeBSD$ ARM64 opt_global.h +SOCDEV_PA opt_global.h +SOCDEV_VA opt_global.h VFP opt_global.h diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index 0a1a52f..69eb7e3 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -121,9 +121,9 @@ NPX_DEBUG opt_npx.h # BPF just-in-time compiler BPF_JITTER opt_bpf.h -NATIVE opt_global.h -XEN opt_global.h XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c index 47bc791..7796fbe 100644 --- a/sys/dev/acpica/acpi.c +++ b/sys/dev/acpica/acpi.c @@ -606,7 +606,7 @@ acpi_attach(device_t dev) sc->acpi_handle_reboot = 1; /* Only enable S4BIOS by default if the FACS says it is available. */ - if (AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT) + if (AcpiGbl_FACS != NULL && AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT) sc->acpi_s4bios = 1; /* Probe all supported sleep states. */ diff --git a/sys/dev/atkbdc/atkbd.c b/sys/dev/atkbdc/atkbd.c index d93c1c6..a90f5d2 100644 --- a/sys/dev/atkbdc/atkbd.c +++ b/sys/dev/atkbdc/atkbd.c @@ -44,19 +44,6 @@ __FBSDID("$FreeBSD$"); #include <machine/bus.h> #include <machine/resource.h> -#if defined(__i386__) || defined(__amd64__) -#include <machine/md_var.h> -#include <machine/psl.h> -#include <compat/x86bios/x86bios.h> -#include <machine/pc/bios.h> - -#include <vm/vm.h> -#include <vm/pmap.h> -#include <vm/vm_param.h> - -#include <isa/isareg.h> -#endif /* __i386__ || __amd64__ */ - #include <sys/kbio.h> #include <dev/kbd/kbdreg.h> #include <dev/atkbdc/atkbdreg.h> @@ -82,6 +69,9 @@ static int atkbd_reset(KBDC kbdc, int flags, int c); #define HAS_QUIRK(p, q) (((atkbdc_softc_t *)(p))->quirks & q) #define ALLOW_DISABLE_KBD(kbdc) !HAS_QUIRK(kbdc, KBDC_QUIRK_KEEP_ACTIVATED) +#define DEFAULT_DELAY 0x1 /* 500ms */ +#define DEFAULT_RATE 0x10 /* 14Hz */ + int atkbd_probe_unit(device_t dev, int irq, int flags) { @@ -249,7 +239,7 @@ static keyboard_switch_t atkbdsw = { KEYBOARD_DRIVER(atkbd, atkbdsw, atkbd_configure); /* local functions */ -static int get_typematic(keyboard_t *kbd); +static int set_typematic(keyboard_t *kbd); static int setup_kbd_port(KBDC kbdc, int port, int intr); static int get_kbd_echo(KBDC kbdc); static int probe_keyboard(KBDC kbdc, int flags); @@ -443,7 +433,7 @@ atkbd_init(int unit, keyboard_t **kbdp, void *arg, int flags) goto bad; } atkbd_ioctl(kbd, KDSETLED, (caddr_t)&state->ks_state); - get_typematic(kbd); + set_typematic(kbd); delay[0] = kbd->kb_delay1; delay[1] = kbd->kb_delay2; atkbd_ioctl(kbd, KDSETREPEAT, (caddr_t)delay); @@ -503,7 +493,7 @@ atkbd_intr(keyboard_t *kbd, void *arg) init_keyboard(state->kbdc, &kbd->kb_type, kbd->kb_config); KBD_FOUND_DEVICE(kbd); atkbd_ioctl(kbd, KDSETLED, (caddr_t)&state->ks_state); - get_typematic(kbd); + set_typematic(kbd); delay[0] = kbd->kb_delay1; delay[1] = kbd->kb_delay2; atkbd_ioctl(kbd, KDSETREPEAT, (caddr_t)delay); @@ -1135,57 +1125,19 @@ atkbd_reset(KBDC kbdc, int flags, int c) /* local functions */ static int -get_typematic(keyboard_t *kbd) +set_typematic(keyboard_t *kbd) { -#if defined(__i386__) || defined(__amd64__) - /* - * Only some systems allow us to retrieve the keyboard repeat - * rate previously set via the BIOS... - */ - x86regs_t regs; - uint8_t *p; + int val, error; + atkbd_state_t *state = kbd->kb_data; - /* - * Traditional entry points of int 0x15 and 0x16 are fixed - * and later BIOSes follow them. (U)EFI CSM specification - * also mandates these fixed entry points. - * - * Validate the entry points here before we proceed further. - * It's known that some recent laptops does not have the - * same entry point and hang on boot if we call it. - */ - if (x86bios_get_intr(0x15) != 0xf000f859 || - x86bios_get_intr(0x16) != 0xf000e82e) - return (ENODEV); - - /* Is BIOS system configuration table supported? */ - x86bios_init_regs(®s); - regs.R_AH = 0xc0; - x86bios_intr(®s, 0x15); - if ((regs.R_FLG & PSL_C) != 0 || regs.R_AH != 0) - return (ENODEV); - - /* Is int 0x16, function 0x09 supported? */ - p = x86bios_offset((regs.R_ES << 4) + regs.R_BX); - if (readw(p) < 5 || (readb(p + 6) & 0x40) == 0) - return (ENODEV); - - /* Is int 0x16, function 0x0306 supported? */ - x86bios_init_regs(®s); - regs.R_AH = 0x09; - x86bios_intr(®s, 0x16); - if ((regs.R_AL & 0x08) == 0) - return (ENODEV); - - x86bios_init_regs(®s); - regs.R_AX = 0x0306; - x86bios_intr(®s, 0x16); - kbd->kb_delay1 = typematic_delay(regs.R_BH << 5); - kbd->kb_delay2 = typematic_rate(regs.R_BL); - return (0); -#else - return (ENODEV); -#endif /* __i386__ || __amd64__ */ + val = typematic(DEFAULT_DELAY, DEFAULT_RATE); + error = write_kbd(state->kbdc, KBDC_SET_TYPEMATIC, val); + if (error == 0) { + kbd->kb_delay1 = typematic_delay(val); + kbd->kb_delay2 = typematic_rate(val); + } + + return (error); } static int diff --git a/sys/dev/bxe/bxe.h b/sys/dev/bxe/bxe.h index f37b93f..eb695d0 100644 --- a/sys/dev/bxe/bxe.h +++ b/sys/dev/bxe/bxe.h @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include <sys/limits.h> #include <sys/queue.h> #include <sys/taskqueue.h> +#include <sys/zlib.h> #include <net/if.h> #include <net/if_types.h> @@ -60,7 +61,6 @@ __FBSDID("$FreeBSD$"); #include <net/if_var.h> #include <net/if_media.h> #include <net/if_vlan_var.h> -#include <net/zlib.h> #include <net/bpf.h> #include <netinet/in.h> diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 40098b8..4bbb55e 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -1571,9 +1571,6 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) struct ifmedia *media = NULL; struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; -#ifdef INVARIANTS - int data = (pi->port_type << 8) | pi->mod_type; -#endif if (ifp == pi->ifp) media = &pi->media; @@ -1584,7 +1581,6 @@ cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) MPASS(media != NULL); cur = media->ifm_cur; - MPASS(cur->ifm_data == data); ifmr->ifm_status = IFM_AVALID; if (!pi->link_cfg.link_ok) @@ -2884,30 +2880,29 @@ t4_set_desc(struct adapter *sc) static void build_medialist(struct port_info *pi, struct ifmedia *media) { - int data, m; + int m; PORT_LOCK(pi); ifmedia_removeall(media); m = IFM_ETHER | IFM_FDX; - data = (pi->port_type << 8) | pi->mod_type; switch(pi->port_type) { case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: - ifmedia_add(media, m | IFM_10G_T, data, NULL); + ifmedia_add(media, m | IFM_10G_T, 0, NULL); /* fall through */ case FW_PORT_TYPE_BT_SGMII: - ifmedia_add(media, m | IFM_1000_T, data, NULL); - ifmedia_add(media, m | IFM_100_TX, data, NULL); - ifmedia_add(media, IFM_ETHER | IFM_AUTO, data, NULL); + ifmedia_add(media, m | IFM_1000_T, 0, NULL); + ifmedia_add(media, m | IFM_100_TX, 0, NULL); + ifmedia_add(media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); break; case FW_PORT_TYPE_CX4: - ifmedia_add(media, m | IFM_10G_CX4, data, NULL); + ifmedia_add(media, m | IFM_10G_CX4, 0, NULL); ifmedia_set(media, m | IFM_10G_CX4); break; @@ -2918,29 +2913,29 @@ build_medialist(struct port_info *pi, struct ifmedia *media) switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: - ifmedia_add(media, m | IFM_10G_LR, data, NULL); + ifmedia_add(media, m | IFM_10G_LR, 0, NULL); ifmedia_set(media, m | IFM_10G_LR); break; case FW_PORT_MOD_TYPE_SR: - ifmedia_add(media, m | IFM_10G_SR, data, NULL); + ifmedia_add(media, m | IFM_10G_SR, 0, NULL); ifmedia_set(media, m | IFM_10G_SR); break; case FW_PORT_MOD_TYPE_LRM: - ifmedia_add(media, m | IFM_10G_LRM, data, NULL); + ifmedia_add(media, m | IFM_10G_LRM, 0, NULL); ifmedia_set(media, m | IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: - ifmedia_add(media, m | IFM_10G_TWINAX, data, NULL); + ifmedia_add(media, m | IFM_10G_TWINAX, 0, NULL); ifmedia_set(media, m | IFM_10G_TWINAX); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; - ifmedia_add(media, m | IFM_NONE, data, NULL); + ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; @@ -2950,7 +2945,7 @@ build_medialist(struct port_info *pi, struct ifmedia *media) device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); - ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); + ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } @@ -2960,24 +2955,24 @@ build_medialist(struct port_info *pi, struct ifmedia *media) switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: - ifmedia_add(media, m | IFM_40G_LR4, data, NULL); + ifmedia_add(media, m | IFM_40G_LR4, 0, NULL); ifmedia_set(media, m | IFM_40G_LR4); break; case FW_PORT_MOD_TYPE_SR: - ifmedia_add(media, m | IFM_40G_SR4, data, NULL); + ifmedia_add(media, m | IFM_40G_SR4, 0, NULL); ifmedia_set(media, m | IFM_40G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: - ifmedia_add(media, m | IFM_40G_CR4, data, NULL); + ifmedia_add(media, m | IFM_40G_CR4, 0, NULL); ifmedia_set(media, m | IFM_40G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; - ifmedia_add(media, m | IFM_NONE, data, NULL); + ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; @@ -2985,7 +2980,7 @@ build_medialist(struct port_info *pi, struct ifmedia *media) device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); - ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); + ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } @@ -2995,7 +2990,7 @@ build_medialist(struct port_info *pi, struct ifmedia *media) device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); - ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); + ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 49cc16a..6ac6eb6 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -1046,8 +1046,7 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) } drbr_advance(ifp, txr->br); enq++; - if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); - if (next->m_flags & M_MCAST) + if (next->m_flags & M_MCAST && adapter->vf_ifp) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) @@ -4055,7 +4054,9 @@ static bool igb_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; +#ifdef DEV_NETMAP struct ifnet *ifp = adapter->ifp; +#endif /* DEV_NETMAP */ u32 work, processed = 0; u16 limit = txr->process_limit; struct igb_tx_buf *buf; @@ -4130,7 +4131,6 @@ igb_txeof(struct tx_ring *txr) } ++txr->packets; ++processed; - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); txr->watchdog_time = ticks; /* Try the next packet */ @@ -5127,7 +5127,6 @@ igb_rxeof(struct igb_queue *que, int count, int *done) if (eop) { rxr->fmp->m_pkthdr.rcvif = ifp; - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->rx_packets++; /* capture data for AIM */ rxr->packets++; @@ -5560,24 +5559,94 @@ igb_led_func(void *arg, int onoff) } static uint64_t +igb_get_vf_counter(if_t ifp, ift_counter cnt) +{ + struct adapter *adapter; + struct e1000_vf_stats *stats; +#ifndef IGB_LEGACY_TX + struct tx_ring *txr; + uint64_t rv; +#endif + + adapter = if_getsoftc(ifp); + stats = (struct e1000_vf_stats *)adapter->stats; + + switch (cnt) { + case IFCOUNTER_IPACKETS: + return (stats->gprc); + case IFCOUNTER_OPACKETS: + return (stats->gptc); + case IFCOUNTER_IBYTES: + return (stats->gorc); + case IFCOUNTER_OBYTES: + return (stats->gotc); + case IFCOUNTER_IMCASTS: + return (stats->mprc); + case IFCOUNTER_IERRORS: + return (adapter->dropped_pkts); + case IFCOUNTER_OERRORS: + return (adapter->watchdog_events); +#ifndef IGB_LEGACY_TX + case IFCOUNTER_OQDROPS: + rv = 0; + txr = adapter->tx_rings; + for (int i = 0; i < adapter->num_queues; i++, txr++) + rv += txr->br->br_drops; + return (rv); +#endif + default: + return (if_get_counter_default(ifp, cnt)); + } +} + +static uint64_t igb_get_counter(if_t ifp, ift_counter cnt) { struct adapter *adapter; struct e1000_hw_stats *stats; +#ifndef IGB_LEGACY_TX + struct tx_ring *txr; + uint64_t rv; +#endif adapter = if_getsoftc(ifp); + if (adapter->vf_ifp) + return (igb_get_vf_counter(ifp, cnt)); + stats = (struct e1000_hw_stats *)adapter->stats; switch (cnt) { + case IFCOUNTER_IPACKETS: + return (stats->gprc); + case IFCOUNTER_OPACKETS: + return (stats->gptc); + case IFCOUNTER_IBYTES: + return (stats->gorc); + case IFCOUNTER_OBYTES: + return (stats->gotc); + case IFCOUNTER_IMCASTS: + return (stats->mprc); + case IFCOUNTER_OMCASTS: + return (stats->mptc); case IFCOUNTER_IERRORS: return (adapter->dropped_pkts + stats->rxerrc + stats->crcerrs + stats->algnerrc + - stats->ruc + stats->roc + stats->mpc + stats->cexterr); + stats->ruc + stats->roc + stats->cexterr); case IFCOUNTER_OERRORS: return (stats->ecol + stats->latecol + adapter->watchdog_events); case IFCOUNTER_COLLISIONS: return (stats->colc); + case IFCOUNTER_IQDROPS: + return (stats->mpc); +#ifndef IGB_LEGACY_TX + case IFCOUNTER_OQDROPS: + rv = 0; + txr = adapter->tx_rings; + for (int i = 0; i < adapter->num_queues; i++, txr++) + rv += txr->br->br_drops; + return (rv); +#endif default: return (if_get_counter_default(ifp, cnt)); } diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h index 8a45d89..5360b7c 100644 --- a/sys/dev/hyperv/include/hyperv.h +++ b/sys/dev/hyperv/include/hyperv.h @@ -46,6 +46,7 @@ #include <sys/systm.h> #include <sys/lock.h> #include <sys/sema.h> +#include <sys/smp.h> #include <sys/mutex.h> #include <sys/bus.h> #include <vm/vm.h> @@ -63,11 +64,22 @@ typedef uint8_t hv_bool_uint8_t; #define HV_ERROR_MACHINE_LOCKED 0x800704F7 /* - * A revision number of vmbus that is used for ensuring both ends on a - * partition are using compatible versions. + * VMBUS version is 32 bit, upper 16 bit for major_number and lower + * 16 bit for minor_number. + * + * 0.13 -- Windows Server 2008 + * 1.1 -- Windows 7 + * 2.4 -- Windows 8 + * 3.0 -- Windows 8.1 */ +#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13)) +#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1)) +#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4)) +#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) + +#define HV_VMBUS_VERSION_INVALID -1 -#define HV_VMBUS_REVISION_NUMBER 13 +#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1 /* * Make maximum size of pipe payload of 16K @@ -112,6 +124,18 @@ typedef struct hv_guid { unsigned char data[16]; } __packed hv_guid; +#define HV_NIC_GUID \ + .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ + 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} + +#define HV_IDE_GUID \ + .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \ + 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} + +#define HV_SCSI_GUID \ + .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \ + 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} + /* * At the center of the Channel Management library is * the Channel Offer. This struct contains the @@ -147,7 +171,11 @@ typedef struct hv_vmbus_channel_offer { } __packed pipe; } u; - uint32_t padding; + /* + * Sub_channel_index, newly added in Win8. + */ + uint16_t sub_channel_index; + uint16_t padding; } __packed hv_vmbus_channel_offer; @@ -344,7 +372,25 @@ typedef struct { hv_vmbus_channel_offer offer; uint32_t child_rel_id; uint8_t monitor_id; - hv_bool_uint8_t monitor_allocated; + /* + * This field has been split into a bit field on Win7 + * and higher. + */ + uint8_t monitor_allocated:1; + uint8_t reserved:7; + /* + * Following fields were added in win7 and higher. + * Make sure to check the version before accessing these fields. + * + * If "is_dedicated_interrupt" is set, we must not set the + * associated bit in the channel bitmap while sending the + * interrupt to the host. + * + * connection_id is used in signaling the host. + */ + uint16_t is_dedicated_interrupt:1; + uint16_t reserved1:15; + uint32_t connection_id; } __packed hv_vmbus_channel_offer_channel; /* @@ -394,9 +440,11 @@ typedef struct hv_gpadl_handle ring_buffer_gpadl_handle; /* - * GPADL for the channel's server context save area. + * Before win8, all incoming channel interrupts are only + * delivered on cpu 0. Setting this value to 0 would + * preserve the earlier behavior. */ - hv_gpadl_handle server_context_area_gpadl_handle; + uint32_t target_vcpu; /* * The upstream ring buffer begins at offset zero in the memory described @@ -646,14 +694,42 @@ typedef struct { } hv_vmbus_ring_buffer_info; typedef void (*hv_vmbus_pfn_channel_callback)(void *context); +typedef void (*hv_vmbus_sc_creation_callback)(void *context); typedef enum { HV_CHANNEL_OFFER_STATE, HV_CHANNEL_OPENING_STATE, HV_CHANNEL_OPEN_STATE, + HV_CHANNEL_OPENED_STATE, HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, } hv_vmbus_channel_state; +/* + * Connection identifier type + */ +typedef union { + uint32_t as_uint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + } u; + +} __packed hv_vmbus_connection_id; + +/* + * Definition of the hv_vmbus_signal_event hypercall input structure + */ +typedef struct { + hv_vmbus_connection_id connection_id; + uint16_t flag_number; + uint16_t rsvd_z; +} __packed hv_vmbus_input_signal_event; + +typedef struct { + uint64_t align8; + hv_vmbus_input_signal_event event; +} __packed hv_vmbus_input_signal_event_buffer; + typedef struct hv_vmbus_channel { TAILQ_ENTRY(hv_vmbus_channel) list_entry; struct hv_device* device; @@ -688,8 +764,82 @@ typedef struct hv_vmbus_channel { hv_vmbus_pfn_channel_callback on_channel_callback; void* channel_callback_context; + /* + * If batched_reading is set to "true", mask the interrupt + * and read until the channel is empty. + * If batched_reading is set to "false", the channel is not + * going to perform batched reading. + * + * Batched reading is enabled by default; specific + * drivers that don't want this behavior can turn it off. + */ + boolean_t batched_reading; + + boolean_t is_dedicated_interrupt; + + /* + * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall. + */ + hv_vmbus_input_signal_event_buffer signal_event_buffer; + /* + * 8-bytes aligned of the buffer above + */ + hv_vmbus_input_signal_event *signal_event_param; + + /* + * From Win8, this field specifies the target virtual process + * on which to deliver the interupt from the host to guest. + * Before Win8, all channel interrupts would only be + * delivered on cpu 0. Setting this value to 0 would preserve + * the earlier behavior. + */ + uint32_t target_vcpu; + /* The corresponding CPUID in the guest */ + uint32_t target_cpu; + + /* + * Support for multi-channels. + * The initial offer is considered the primary channel and this + * offer message will indicate if the host supports multi-channels. + * The guest is free to ask for multi-channels to be offerred and can + * open these multi-channels as a normal "primary" channel. However, + * all multi-channels will have the same type and instance guids as the + * primary channel. Requests sent on a given channel will result in a + * response on the same channel. + */ + + /* + * Multi-channel creation callback. This callback will be called in + * process context when a Multi-channel offer is received from the host. + * The guest can open the Multi-channel in the context of this callback. + */ + hv_vmbus_sc_creation_callback sc_creation_callback; + + struct mtx sc_lock; + + /* + * Link list of all the multi-channels if this is a primary channel + */ + TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; + TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; + + /* + * The primary channel this sub-channle belongs to. + * This will be NULL for the primary channel. + */ + struct hv_vmbus_channel *primary_channel; + /* + * Support per channel state for use by vmbus drivers. + */ + void *per_channel_state; } hv_vmbus_channel; +static inline void +hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) +{ + channel->batched_reading = state; +} + typedef struct hv_device { hv_guid class_id; hv_guid device_id; @@ -760,6 +910,8 @@ int hv_vmbus_channel_teardown_gpdal( hv_vmbus_channel* channel, uint32_t gpadl_handle); +struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); + /* * Work abstraction defines */ @@ -819,6 +971,7 @@ typedef struct hv_vmbus_service { extern uint8_t* receive_buffer[]; extern hv_vmbus_service service_table[]; +extern uint32_t hv_vmbus_protocal_version; void hv_kvp_callback(void *context); int hv_kvp_init(hv_vmbus_service *serv); diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index d00d279..f8a871b 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/proc.h> #include <sys/condvar.h> +#include <sys/time.h> #include <sys/systm.h> #include <sys/sockio.h> #include <sys/mbuf.h> @@ -53,8 +54,12 @@ __FBSDID("$FreeBSD$"); #include <sys/callout.h> #include <vm/vm.h> #include <vm/pmap.h> +#include <vm/uma.h> #include <sys/lock.h> #include <sys/sema.h> +#include <sys/sglist.h> +#include <machine/bus.h> +#include <sys/bus_dma.h> #include <cam/cam.h> #include <cam/cam_ccb.h> @@ -66,7 +71,6 @@ __FBSDID("$FreeBSD$"); #include <cam/scsi/scsi_all.h> #include <cam/scsi/scsi_message.h> - #include <dev/hyperv/include/hyperv.h> #include "hv_vstorage.h" @@ -77,8 +81,29 @@ __FBSDID("$FreeBSD$"); #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS #define STORVSC_MAX_TARGETS (2) +#define STORVSC_WIN7_MAJOR 4 +#define STORVSC_WIN7_MINOR 2 + +#define STORVSC_WIN8_MAJOR 5 +#define STORVSC_WIN8_MINOR 1 + +#define HV_ALIGN(x, a) roundup2(x, a) + struct storvsc_softc; +struct hv_sgl_node { + LIST_ENTRY(hv_sgl_node) link; + struct sglist *sgl_data; +}; + +struct hv_sgl_page_pool{ + LIST_HEAD(, hv_sgl_node) in_use_sgl_list; + LIST_HEAD(, hv_sgl_node) free_sgl_list; + boolean_t is_init; +} g_hv_sgl_page_pool; + +#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT + enum storvsc_request_type { WRITE_TYPE, READ_TYPE, @@ -96,20 +121,24 @@ struct hv_storvsc_request { struct storvsc_softc *softc; struct callout callout; struct sema synch_sema; /*Synchronize the request/response if needed */ + struct sglist *bounce_sgl; + unsigned int bounce_sgl_count; + uint64_t not_aligned_seg_bits; }; struct storvsc_softc { struct hv_device *hs_dev; - LIST_HEAD(, hv_storvsc_request) hs_free_list; - struct mtx hs_lock; - struct storvsc_driver_props *hs_drv_props; - int hs_unit; - uint32_t hs_frozen; - struct cam_sim *hs_sim; - struct cam_path *hs_path; + LIST_HEAD(, hv_storvsc_request) hs_free_list; + struct mtx hs_lock; + struct storvsc_driver_props *hs_drv_props; + int hs_unit; + uint32_t hs_frozen; + struct cam_sim *hs_sim; + struct cam_path *hs_path; uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; + boolean_t hs_open_multi_channel; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; @@ -124,7 +153,7 @@ struct storvsc_softc { * The first can be tested by "sg_senddiag -vv /dev/daX", * and the second and third can be done by * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". - */ + */ #define HVS_TIMEOUT_TEST 0 /* @@ -138,7 +167,7 @@ struct storvsc_driver_props { char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; - uint8_t drv_max_ios_per_target; + uint8_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; @@ -150,6 +179,8 @@ enum hv_storage_type { #define HS_MAX_ADAPTERS 10 +#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 + /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ static const hv_guid gStorVscDeviceType={ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, @@ -171,13 +202,16 @@ static struct storvsc_driver_props g_drv_props_table[] = { STORVSC_RINGBUFFER_SIZE} }; +static int storvsc_current_major; +static int storvsc_current_minor; + /* static functions */ static int storvsc_probe(device_t dev); static int storvsc_attach(device_t dev); static int storvsc_detach(device_t dev); static void storvsc_poll(struct cam_sim * sim); static void storvsc_action(struct cam_sim * sim, union ccb * ccb); -static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); static enum hv_storage_type storvsc_get_storage_type(device_t dev); static void hv_storvsc_on_channel_callback(void *context); @@ -186,6 +220,14 @@ static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc, struct hv_storvsc_request *request); static int hv_storvsc_connect_vsp(struct hv_device *device); static void storvsc_io_done(struct hv_storvsc_request *reqp); +static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits); +void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits); static device_method_t storvsc_methods[] = { /* Device interface */ @@ -207,7 +249,7 @@ MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); /** - * The host is capable of sending messages to us that are + * The host is capable of sending messages to us that are * completely unsolicited. So, we need to address the race * condition where we may be in the process of unloading the * driver when the host may send us an unsolicited message. @@ -223,7 +265,7 @@ MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); * destroyed. * * 3. Once the device is marked as being destroyed, we only - * permit incoming traffic to properly account for + * permit incoming traffic to properly account for * packets already sent out. */ static inline struct storvsc_softc * @@ -260,6 +302,113 @@ get_stor_device(struct hv_device *device, } /** + * @brief Callback handler, will be invoked when receive mutil-channel offer + * + * @param context new multi-channel + */ +static void +storvsc_handle_sc_creation(void *context) +{ + hv_vmbus_channel *new_channel; + struct hv_device *device; + struct storvsc_softc *sc; + struct vmstor_chan_props props; + int ret = 0; + + new_channel = (hv_vmbus_channel *)context; + device = new_channel->primary_channel->device; + sc = get_stor_device(device, TRUE); + if (sc == NULL) + return; + + if (FALSE == sc->hs_open_multi_channel) + return; + + memset(&props, 0, sizeof(props)); + + ret = hv_vmbus_channel_open(new_channel, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, + new_channel); + + return; +} + +/** + * @brief Send multi-channel creation request to host + * + * @param device a Hyper-V device pointer + * @param max_chans the max channels supported by vmbus + */ +static void +storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) +{ + struct storvsc_softc *sc; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + int request_channels_cnt = 0; + int ret; + + /* get multichannels count that need to create */ + request_channels_cnt = MIN(max_chans, mp_ncpus); + + sc = get_stor_device(dev, TRUE); + if (sc == NULL) { + printf("Storvsc_error: get sc failed while send mutilchannel " + "request\n"); + return; + } + + request = &sc->hs_init_req; + + /* Establish a handler for multi-channel */ + dev->channel->sc_creation_callback = storvsc_handle_sc_creation; + + /* request the host to create multi-channel */ + memset(request, 0, sizeof(struct hv_storvsc_request)); + + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet = &request->vstor_packet; + + vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + vstor_packet->u.multi_channels_cnt = request_channels_cnt; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)(uintptr_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + /* wait for 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) { + printf("Storvsc_error: create multi-channel timeout, %d\n", + ret); + return; + } + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + printf("Storvsc_error: create multi-channel invalid operation " + "(%d) or statue (%u)\n", + vstor_packet->operation, vstor_packet->status); + return; + } + + sc->hs_open_multi_channel = TRUE; + + if (bootverbose) + printf("Storvsc create multi-channel success!\n"); +} + +/** * @brief initialize channel connection to parent partition * * @param dev a Hyper-V device pointer @@ -272,11 +421,15 @@ hv_storvsc_channel_init(struct hv_device *dev) struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; struct storvsc_softc *sc; + uint16_t max_chans = 0; + boolean_t support_multichannel = FALSE; + + max_chans = 0; + support_multichannel = FALSE; sc = get_stor_device(dev, TRUE); - if (sc == NULL) { - return ENODEV; - } + if (sc == NULL) + return (ENODEV); request = &sc->hs_init_req; memset(request, 0, sizeof(struct hv_storvsc_request)); @@ -300,15 +453,13 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ - if (ret != 0) { + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { @@ -321,7 +472,8 @@ hv_storvsc_channel_init(struct hv_device *dev) vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; - vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; + vstor_packet->u.version.major_minor = + VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor); /* revision is only significant for Windows guests */ vstor_packet->u.version.revision = 0; @@ -334,21 +486,19 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret) { + if (ret) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } /** * Query channel properties @@ -365,22 +515,30 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if ( ret != 0) { + if ( ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { + if (ret != 0) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) { goto cleanup; } + /* multi-channels feature is supported by WIN8 and above version */ + max_chans = vstor_packet->u.chan_props.max_channel_cnt; + if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) && + (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) && + (vstor_packet->u.chan_props.flags & + HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { + support_multichannel = TRUE; + } + memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; @@ -397,16 +555,22 @@ hv_storvsc_channel_init(struct hv_device *dev) goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } + + /* + * If multi-channel is supported, send multichannel create + * request to host. + */ + if (support_multichannel) + storvsc_send_multichannel_request(dev, max_chans); cleanup: sema_destroy(&request->synch_sema); @@ -443,8 +607,7 @@ hv_storvsc_connect_vsp(struct hv_device *dev) (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, - dev); - + dev->channel); if (ret != 0) { return ret; @@ -490,7 +653,7 @@ hv_storvsc_host_reset(struct hv_device *dev) goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); /* KYS 5 seconds */ if (ret) { goto cleanup; @@ -498,7 +661,7 @@ hv_storvsc_host_reset(struct hv_device *dev) /* - * At this point, all outstanding requests in the adapter + * At this point, all outstanding requests in the adapter * should have been flushed out and return to us */ @@ -521,6 +684,7 @@ hv_storvsc_io_request(struct hv_device *device, { struct storvsc_softc *sc; struct vstor_packet *vstor_packet = &request->vstor_packet; + struct hv_vmbus_channel* outgoing_channel = NULL; int ret = 0; sc = get_stor_device(device, TRUE); @@ -539,19 +703,20 @@ hv_storvsc_io_request(struct hv_device *device, vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; + outgoing_channel = vmbus_select_outgoing_channel(device->channel); mtx_unlock(&request->softc->hs_lock); if (request->data_buf.length) { ret = hv_vmbus_channel_send_packet_multipagebuffer( - device->channel, + outgoing_channel, &request->data_buf, - vstor_packet, - sizeof(struct vstor_packet), + vstor_packet, + sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request); } else { ret = hv_vmbus_channel_send_packet( - device->channel, + outgoing_channel, vstor_packet, sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request, @@ -610,7 +775,8 @@ static void hv_storvsc_on_channel_callback(void *context) { int ret = 0; - struct hv_device *device = (struct hv_device *)context; + hv_vmbus_channel *channel = (hv_vmbus_channel *)context; + struct hv_device *device = NULL; struct storvsc_softc *sc; uint32_t bytes_recvd; uint64_t request_id; @@ -618,15 +784,22 @@ hv_storvsc_on_channel_callback(void *context) struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; + if (channel->primary_channel != NULL){ + device = channel->primary_channel->device; + } else { + device = channel->device; + } + + KASSERT(device, ("device is NULL")); + sc = get_stor_device(device, FALSE); if (sc == NULL) { + printf("Storvsc_error: get stor device failed.\n"); return; } - KASSERT(device, ("device")); - ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -634,21 +807,28 @@ hv_storvsc_on_channel_callback(void *context) while ((ret == 0) && (bytes_recvd > 0)) { request = (struct hv_storvsc_request *)(uintptr_t)request_id; - KASSERT(request, ("request")); if ((request == &sc->hs_init_req) || (request == &sc->hs_reset_req)) { memcpy(&request->vstor_packet, packet, sizeof(struct vstor_packet)); - sema_post(&request->synch_sema); + sema_post(&request->synch_sema); } else { vstor_packet = (struct vstor_packet *)packet; switch(vstor_packet->operation) { case VSTOR_OPERATION_COMPLETEIO: + if (request == NULL) + panic("VMBUS: storvsc received a " + "packet with NULL request id in " + "COMPLETEIO operation."); + hv_storvsc_on_iocompletion(sc, vstor_packet, request); break; case VSTOR_OPERATION_REMOVEDEVICE: + case VSTOR_OPERATION_ENUMERATE_BUS: + printf("VMBUS: storvsc operation %d not " + "implemented.\n", vstor_packet->operation); /* TODO: implement */ break; default: @@ -656,7 +836,7 @@ hv_storvsc_on_channel_callback(void *context) } } ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -680,7 +860,16 @@ storvsc_probe(device_t dev) { int ata_disk_enable = 0; int ret = ENXIO; - + + if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) || + (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){ + storvsc_current_major = STORVSC_WIN8_MAJOR; + storvsc_current_minor = STORVSC_WIN8_MINOR; + } else { + storvsc_current_major = STORVSC_WIN7_MAJOR; + storvsc_current_minor = STORVSC_WIN7_MINOR; + } + switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) @@ -721,9 +910,11 @@ storvsc_attach(device_t dev) enum hv_storage_type stor_type; struct storvsc_softc *sc; struct cam_devq *devq; - int ret, i; + int ret, i, j; struct hv_storvsc_request *reqp; struct root_hold_token *root_mount_token = NULL; + struct hv_sgl_node *sgl_node = NULL; + void *tmp_buff = NULL; /* * We need to serialize storvsc attach calls. @@ -764,8 +955,41 @@ storvsc_attach(device_t dev) LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } + /* create sg-list page pool */ + if (FALSE == g_hv_sgl_page_pool.is_init) { + g_hv_sgl_page_pool.is_init = TRUE; + LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); + + /* + * Pre-create SG list, each SG list with + * HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each + * segment has one page buffer + */ + for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) { + sgl_node = malloc(sizeof(struct hv_sgl_node), + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data = + sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT, + M_WAITOK|M_ZERO); + + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + tmp_buff = malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data->sg_segs[j].ss_paddr = + (vm_paddr_t)tmp_buff; + } + + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, + sgl_node, link); + } + } + sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; + sc->hs_open_multi_channel = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = hv_storvsc_connect_vsp(hv_dev); @@ -834,6 +1058,20 @@ cleanup: LIST_REMOVE(reqp, link); free(reqp, M_DEVBUF); } + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (ret); } @@ -853,6 +1091,8 @@ storvsc_detach(device_t dev) struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp = NULL; struct hv_device *hv_device = vmbus_get_devctx(dev); + struct hv_sgl_node *sgl_node = NULL; + int j = 0; mtx_lock(&hv_device->channel->inbound_lock); sc->hs_destroy = TRUE; @@ -884,6 +1124,20 @@ storvsc_detach(device_t dev) free(reqp, M_DEVBUF); } mtx_unlock(&sc->hs_lock); + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (0); } @@ -939,7 +1193,7 @@ storvsc_timeout_test(struct hv_storvsc_request *reqp, ticks, __func__, (ret == 0)? "IO return detected" : "IO return not detected"); - /* + /* * Now both the timer handler and io done are running * simultaneously. We want to confirm the io done always * finishes after the timer handler exits. So reqp used by @@ -1023,7 +1277,7 @@ storvsc_poll(struct cam_sim *sim) mtx_assert(&sc->hs_lock, MA_OWNED); mtx_unlock(&sc->hs_lock); - hv_storvsc_on_channel_callback(sc->hs_dev); + hv_storvsc_on_channel_callback(sc->hs_dev->channel); mtx_lock(&sc->hs_lock); } @@ -1151,9 +1405,13 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) bzero(reqp, sizeof(struct hv_storvsc_request)); reqp->softc = sc; - - ccb->ccb_h.status |= CAM_SIM_QUEUED; - create_storvsc_request(ccb, reqp); + + ccb->ccb_h.status |= CAM_SIM_QUEUED; + if ((res = create_storvsc_request(ccb, reqp)) != 0) { + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; + } if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_init(&reqp->callout, CALLOUT_MPSAFE); @@ -1194,6 +1452,212 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) } /** + * @brief destroy bounce buffer + * + * This function is responsible for destroy a Scatter/Gather list + * that create by storvsc_create_bounce_buffer() + * + * @param sgl- the Scatter/Gather need be destroy + * @param sg_count- page count of the SG list. + * + */ +static void +storvsc_destroy_bounce_buffer(struct sglist *sgl) +{ + struct hv_sgl_node *sgl_node = NULL; + + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_REMOVE(sgl_node, link); + if (NULL == sgl_node) { + printf("storvsc error: not enough in use sgl\n"); + return; + } + sgl_node->sgl_data = sgl; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); +} + +/** + * @brief create bounce buffer + * + * This function is responsible for create a Scatter/Gather list, + * which hold several pages that can be aligned with page size. + * + * @param seg_count- SG-list segments count + * @param write - if WRITE_TYPE, set SG list page used size to 0, + * otherwise set used size to page size. + * + * return NULL if create failed + */ +static struct sglist * +storvsc_create_bounce_buffer(uint16_t seg_count, int write) +{ + int i = 0; + struct sglist *bounce_sgl = NULL; + unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); + struct hv_sgl_node *sgl_node = NULL; + + /* get struct sglist from free_sgl_list */ + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + if (NULL == sgl_node) { + printf("storvsc error: not enough free sgl\n"); + return NULL; + } + bounce_sgl = sgl_node->sgl_data; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); + + bounce_sgl->sg_maxseg = seg_count; + + if (write == WRITE_TYPE) + bounce_sgl->sg_nseg = 0; + else + bounce_sgl->sg_nseg = seg_count; + + for (i = 0; i < seg_count; i++) + bounce_sgl->sg_segs[i].ss_len = buf_len; + + return bounce_sgl; +} + +/** + * @brief copy data from SG list to bounce buffer + * + * This function is responsible for copy data from one SG list's segments + * to another SG list which used as bounce buffer. + * + * @param bounce_sgl - the destination SG list + * @param orig_sgl - the segment of the source SG list. + * @param orig_sgl_count - the count of segments. + * @param orig_sgl_count - indicate which segment need bounce buffer, + * set 1 means need. + * + */ +static void +storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits) +{ + int src_sgl_idx = 0; + + for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { + if (seg_bits & (1 << src_sgl_idx)) { + memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, + (void*)orig_sgl[src_sgl_idx].ds_addr, + orig_sgl[src_sgl_idx].ds_len); + + bounce_sgl->sg_segs[src_sgl_idx].ss_len = + orig_sgl[src_sgl_idx].ds_len; + } + } +} + +/** + * @brief copy data from SG list which used as bounce to another SG list + * + * This function is responsible for copy data from one SG list with bounce + * buffer to another SG list's segments. + * + * @param dest_sgl - the destination SG list's segments + * @param dest_sgl_count - the count of destination SG list's segment. + * @param src_sgl - the source SG list. + * @param seg_bits - indicate which segment used bounce buffer of src SG-list. + * + */ +void +storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits) +{ + int sgl_idx = 0; + + for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { + if (seg_bits & (1 << sgl_idx)) { + memcpy((void*)(dest_sgl[sgl_idx].ds_addr), + (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), + src_sgl->sg_segs[sgl_idx].ss_len); + } + } +} + +/** + * @brief check SG list with bounce buffer or not + * + * This function is responsible for check if need bounce buffer for SG list. + * + * @param sgl - the SG list's segments + * @param sg_count - the count of SG list's segment. + * @param bits - segmengs number that need bounce buffer + * + * return -1 if SG list needless bounce buffer + */ +static int +storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, + unsigned int sg_count, + uint64_t *bits) +{ + int i = 0; + int offset = 0; + uint64_t phys_addr = 0; + uint64_t tmp_bits = 0; + boolean_t found_hole = FALSE; + boolean_t pre_aligned = TRUE; + + if (sg_count < 2){ + return -1; + } + + *bits = 0; + + phys_addr = vtophys(sgl[0].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset != 0) { + pre_aligned = FALSE; + tmp_bits |= 1; + } + + for (i = 1; i < sg_count; i++) { + phys_addr = vtophys(sgl[i].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset == 0) { + if (FALSE == pre_aligned){ + /* + * This segment is aligned, if the previous + * one is not aligned, find a hole + */ + found_hole = TRUE; + } + pre_aligned = TRUE; + } else { + tmp_bits |= 1 << i; + if (!pre_aligned) { + if (phys_addr != vtophys(sgl[i-1].ds_addr + + sgl[i-1].ds_len)) { + /* + * Check whether connect to previous + * segment,if not, find the hole + */ + found_hole = TRUE; + } + } else { + found_hole = TRUE; + } + pre_aligned = FALSE; + } + } + + if (!found_hole) { + return (-1); + } else { + *bits = tmp_bits; + return 0; + } +} + +/** * @brief Fill in a request structure based on a CAM control block * * Fills in a request structure based on the contents of a CAM control @@ -1203,7 +1667,7 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) * @param ccb pointer to a CAM contorl block * @param reqp pointer to a request structure */ -static void +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) { struct ccb_scsiio *csio = &ccb->csio; @@ -1211,6 +1675,7 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) uint32_t bytes_to_copy = 0; uint32_t pfn_num = 0; uint32_t pfn; + uint64_t not_aligned_seg_bits = 0; /* refer to struct vmscsi_req for meanings of these two fields */ reqp->vstor_packet.u.vm_srb.port = @@ -1231,48 +1696,172 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { - case CAM_DIR_OUT: - reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; - break; - case CAM_DIR_IN: - reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; - break; - case CAM_DIR_NONE: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; - default: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; + case CAM_DIR_OUT: + reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + break; + case CAM_DIR_IN: + reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; + break; + case CAM_DIR_NONE: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; + default: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; } reqp->sense_data = &csio->sense_data; reqp->sense_info_len = csio->sense_len; reqp->ccb = ccb; - /* - KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0, - ("ccb is scatter gather valid\n")); - */ - if (csio->dxfer_len != 0) { - reqp->data_buf.length = csio->dxfer_len; + + if (0 == csio->dxfer_len) { + return (0); + } + + reqp->data_buf.length = csio->dxfer_len; + + switch (ccb->ccb_h.flags & CAM_DATA_MASK) { + case CAM_DATA_VADDR: + { bytes_to_copy = csio->dxfer_len; phys_addr = vtophys(csio->data_ptr); - reqp->data_buf.offset = phys_addr - trunc_page(phys_addr); + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + while (bytes_to_copy != 0) { + int bytes, page_offset; + phys_addr = + vtophys(&csio->data_ptr[reqp->data_buf.length - + bytes_to_copy]); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[pfn_num] = pfn; + page_offset = phys_addr & PAGE_MASK; + + bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); + + bytes_to_copy -= bytes; + pfn_num++; + } + break; } - while (bytes_to_copy != 0) { - int bytes, page_offset; - phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length - - bytes_to_copy]); - pfn = phys_addr >> PAGE_SHIFT; - reqp->data_buf.pfn_array[pfn_num] = pfn; - page_offset = phys_addr - trunc_page(phys_addr); + case CAM_DATA_SG: + { + int i = 0; + int offset = 0; + int ret; + + bus_dma_segment_t *storvsc_sglist = + (bus_dma_segment_t *)ccb->csio.data_ptr; + u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt; + + printf("Storvsc: get SG I/O operation, %d\n", + reqp->vstor_packet.u.vm_srb.data_in); + + if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){ + printf("Storvsc: %d segments is too much, " + "only support %d segments\n", + storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT); + return (EINVAL); + } + + /* + * We create our own bounce buffer function currently. Idealy + * we should use BUS_DMA(9) framework. But with current BUS_DMA + * code there is no callback API to check the page alignment of + * middle segments before busdma can decide if a bounce buffer + * is needed for particular segment. There is callback, + * "bus_dma_filter_t *filter", but the parrameters are not + * sufficient for storvsc driver. + * TODO: + * Add page alignment check in BUS_DMA(9) callback. Once + * this is complete, switch the following code to use + * BUS_DMA(9) for storvsc bounce buffer support. + */ + /* check if we need to create bounce buffer */ + ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, + storvsc_sg_count, ¬_aligned_seg_bits); + if (ret != -1) { + reqp->bounce_sgl = + storvsc_create_bounce_buffer(storvsc_sg_count, + reqp->vstor_packet.u.vm_srb.data_in); + if (NULL == reqp->bounce_sgl) { + printf("Storvsc_error: " + "create bounce buffer failed.\n"); + return (ENOMEM); + } + + reqp->bounce_sgl_count = storvsc_sg_count; + reqp->not_aligned_seg_bits = not_aligned_seg_bits; + + /* + * if it is write, we need copy the original data + *to bounce buffer + */ + if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_sgl_to_bounce_buf( + reqp->bounce_sgl, + storvsc_sglist, + storvsc_sg_count, + reqp->not_aligned_seg_bits); + } + + /* transfer virtual address to physical frame number */ + if (reqp->not_aligned_seg_bits & 0x1){ + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); + }else{ + phys_addr = + vtophys(storvsc_sglist[0].ds_addr); + } + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[0] = pfn; + + for (i = 1; i < storvsc_sg_count; i++) { + if (reqp->not_aligned_seg_bits & (1 << i)) { + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); + } else { + phys_addr = + vtophys(storvsc_sglist[i].ds_addr); + } + + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + } else { + phys_addr = vtophys(storvsc_sglist[0].ds_addr); + + reqp->data_buf.offset = phys_addr & PAGE_MASK; - bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); + for (i = 0; i < storvsc_sg_count; i++) { + phys_addr = vtophys(storvsc_sglist[i].ds_addr); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } - bytes_to_copy -= bytes; - pfn_num++; + /* check the last segment cross boundary or not */ + offset = phys_addr & PAGE_MASK; + if (offset) { + phys_addr = + vtophys(storvsc_sglist[i-1].ds_addr + + PAGE_SIZE - offset); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + + reqp->bounce_sgl_count = 0; + } + break; + } + default: + printf("Unknow flags: %d\n", ccb->ccb_h.flags); + return(EINVAL); } + + return(0); } /** @@ -1291,7 +1880,29 @@ storvsc_io_done(struct hv_storvsc_request *reqp) struct ccb_scsiio *csio = &ccb->csio; struct storvsc_softc *sc = reqp->softc; struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; - + bus_dma_segment_t *ori_sglist = NULL; + int ori_sg_count = 0; + + /* destroy bounce buffer if it is used */ + if (reqp->bounce_sgl_count) { + ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; + ori_sg_count = ccb->csio.sglist_cnt; + + /* + * If it is READ operation, we should copy back the data + * to original SG list. + */ + if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, + ori_sg_count, + reqp->bounce_sgl, + reqp->not_aligned_seg_bits); + } + + storvsc_destroy_bounce_buffer(reqp->bounce_sgl); + reqp->bounce_sgl_count = 0; + } + if (reqp->retries > 0) { mtx_lock(&sc->hs_lock); #if HVS_TIMEOUT_TEST @@ -1309,7 +1920,7 @@ storvsc_io_done(struct hv_storvsc_request *reqp) mtx_unlock(&sc->hs_lock); } - /* + /* * callout_drain() will wait for the timer handler to finish * if it is running. So we don't need any lock to synchronize * between this routine and the timer handler. diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h index 2632676..deb9183 100644 --- a/sys/dev/hyperv/storvsc/hv_vstorage.h +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -53,7 +53,7 @@ * V1 RC > 2008/1/31 2.0 */ -#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0) +#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) /** * Packet structure ops describing virtual storage requests. @@ -69,7 +69,10 @@ enum vstor_packet_ops { VSTOR_OPERATION_ENDINITIALIZATION = 8, VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, VSTOR_OPERATION_QUERYPROPERTIES = 10, - VSTOR_OPERATION_MAXIMUM = 10 + VSTOR_OPERATION_ENUMERATE_BUS = 11, + VSTOR_OPERATION_FCHBA_DATA = 12, + VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, + VSTOR_OPERATION_MAXIMUM = 13 }; @@ -123,10 +126,12 @@ struct vmstor_chan_props { uint8_t path_id; uint8_t target_id; + uint16_t max_channel_cnt; + /** * Note: port number is only really known on the client side */ - uint32_t port; + uint16_t port; uint32_t flags; uint32_t max_transfer_bytes; @@ -193,6 +198,11 @@ struct vstor_packet { * Used during version negotiations. */ struct vmstor_proto_ver version; + + /** + * Number of multichannels to create + */ + uint16_t multi_channels_cnt; } u; } __packed; diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index 848d364..4598510 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/_null.h> #include <sys/signal.h> #include <sys/syslog.h> +#include <sys/systm.h> #include <sys/mutex.h> #include <net/if_arp.h> @@ -232,7 +233,7 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, */ if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { icframe_vercnt = 3; - if (icmsg_vercnt >= 2) + if (icmsg_vercnt > 2) icmsg_vercnt = 4; else icmsg_vercnt = 3; @@ -734,8 +735,8 @@ hv_kvp_process_request(void *context) recvlen = 0; ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n", - __func__, context, pending_cnt, ret, recvlen); + hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", + __func__, context, (unsigned long long)pending_cnt, ret, recvlen); } } @@ -813,9 +814,9 @@ static void hv_kvp_dev_destroy(void) { - if (daemon_task != NULL) { + if (daemon_task != NULL) { PROC_LOCK(daemon_task); - kern_psignal(daemon_task, SIGKILL); + kern_psignal(daemon_task, SIGKILL); PROC_UNLOCK(daemon_task); } diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index 3e545cf..dc4b1e2 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ b/sys/dev/hyperv/utilities/hv_util.c @@ -408,6 +408,15 @@ hv_util_attach(device_t dev) } } + /* + * These services are not performance critical and do not need + * batched reading. Furthermore, some services such as KVP can + * only handle one message from the host at a time. + * Turn off batched reading for all util drivers before we open the + * channel. + */ + hv_set_channel_read_state(hv_dev->channel, FALSE); + ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0, service->callback, hv_dev->channel); diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index 103260a..94137fb 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) (uint32_t *)&monitor_page-> trigger_group[channel->monitor_group].u.pending); } else { - hv_vmbus_set_event(channel->offer_msg.child_rel_id); + hv_vmbus_set_event(channel); } } @@ -99,6 +99,18 @@ hv_vmbus_channel_open( hv_vmbus_channel_open_channel* open_msg; hv_vmbus_channel_msg_info* open_info; + mtx_lock(&new_channel->sc_lock); + if (new_channel->state == HV_CHANNEL_OPEN_STATE) { + new_channel->state = HV_CHANNEL_OPENING_STATE; + } else { + mtx_unlock(&new_channel->sc_lock); + if(bootverbose) + printf("VMBUS: Trying to open channel <%p> which in " + "%d state.\n", new_channel, new_channel->state); + return (EINVAL); + } + mtx_unlock(&new_channel->sc_lock); + new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; @@ -162,7 +174,7 @@ hv_vmbus_channel_open( new_channel->ring_buffer_gpadl_handle; open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size >> PAGE_SHIFT; - open_msg->server_context_area_gpadl_handle = 0; + open_msg->target_vcpu = new_channel->target_vcpu; if (user_data_len) memcpy(open_msg->user_data, user_data, user_data_len); @@ -182,10 +194,14 @@ hv_vmbus_channel_open( ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ - if (ret) + if (ret) { + if(bootverbose) + printf("VMBUS: channel <%p> open timeout.\n", new_channel); goto cleanup; + } if (open_info->response.open_result.status == 0) { + new_channel->state = HV_CHANNEL_OPENED_STATE; if(bootverbose) printf("VMBUS: channel <%p> open success.\n", new_channel); } else { @@ -497,16 +513,20 @@ cleanup: return (ret); } -/** - * @brief Close the specified channel - */ -void -hv_vmbus_channel_close(hv_vmbus_channel *channel) +static void +hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; + channel->state = HV_CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + + /* + * Grab the lock to prevent race condition when a packet received + * and unloading driver is in the process. + */ mtx_lock(&channel->inbound_lock); channel->on_channel_callback = NULL; mtx_unlock(&channel->inbound_lock); @@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel) M_DEVBUF); free(info, M_DEVBUF); +} + +/** + * @brief Close the specified channel + */ +void +hv_vmbus_channel_close(hv_vmbus_channel *channel) +{ + hv_vmbus_channel* sub_channel; + + if (channel->primary_channel != NULL) { + /* + * We only close multi-channels when the primary is + * closed. + */ + return; + } /* - * If we are closing the channel during an error path in - * opening the channel, don't free the channel - * since the caller will free the channel + * Close all multi-channels first. */ - if (channel->state == HV_CHANNEL_OPEN_STATE) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - - hv_vmbus_free_vmbus_channel(channel); + TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, + sc_list_entry) { + if (sub_channel->state != HV_CHANNEL_OPENED_STATE) + continue; + hv_vmbus_channel_close_internal(sub_channel); } - + /* + * Then close the primary channel. + */ + hv_vmbus_channel_close_internal(channel); } /** @@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet( uint32_t packet_len; uint64_t aligned_data; uint32_t packet_len_aligned; + boolean_t need_sig; hv_vmbus_sg_buffer_list buffer_list[3]; packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; @@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 - && !hv_vmbus_get_ring_buffer_interrupt_mask( - &channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer( int ret = 0; int i = 0; + boolean_t need_sig; uint32_t packet_len; uint32_t packetLen_aligned; hv_vmbus_sg_buffer_list buffer_list[3]; @@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer( buffer_list[2].data = &alignedData; buffer_list[2].length = packetLen_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer( int ret = 0; uint32_t desc_size; + boolean_t need_sig; uint32_t packet_len; uint32_t packet_len_aligned; uint32_t pfn_count; @@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index 011e305..783f6bc 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -50,6 +50,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_process_offer(void *context); +struct hv_vmbus_channel* + vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); /** * Channel message dispatch table @@ -233,6 +235,9 @@ hv_vmbus_allocate_channel(void) return (NULL); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); + mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); + + TAILQ_INIT(&channel->sc_list_anchor); channel->control_work_queue = hv_work_queue_create("control"); @@ -262,6 +267,7 @@ ReleaseVmbusChannel(void *context) void hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { + mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); /* * We have to release the channel's workqueue/thread in @@ -279,10 +285,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) static void vmbus_channel_process_offer(void *context) { - int ret; hv_vmbus_channel* new_channel; boolean_t f_new; hv_vmbus_channel* channel; + int ret; new_channel = (hv_vmbus_channel*) context; f_new = TRUE; @@ -291,38 +297,76 @@ vmbus_channel_process_offer(void *context) /* * Make sure this is a new offer */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { - if (!memcmp( - &channel->offer_msg.offer.interface_type, - &new_channel->offer_msg.offer.interface_type, - sizeof(hv_guid)) - && !memcmp( - &channel->offer_msg.offer.interface_instance, + if (memcmp(&channel->offer_msg.offer.interface_type, + &new_channel->offer_msg.offer.interface_type, + sizeof(hv_guid)) == 0 && + memcmp(&channel->offer_msg.offer.interface_instance, &new_channel->offer_msg.offer.interface_instance, - sizeof(hv_guid))) { - f_new = FALSE; - break; - } + sizeof(hv_guid)) == 0) { + f_new = FALSE; + break; + } } if (f_new) { - /* Insert at tail */ - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + /* Insert at tail */ + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + /*XXX add new channel to percpu_list */ if (!f_new) { + /* + * Check if this is a sub channel. + */ + if (new_channel->offer_msg.offer.sub_channel_index != 0) { + /* + * It is a sub channel offer, process it. + */ + new_channel->primary_channel = channel; + mtx_lock(&channel->sc_lock); + TAILQ_INSERT_TAIL( + &channel->sc_list_anchor, + new_channel, + sc_list_entry); + mtx_unlock(&channel->sc_lock); + + /* Insert new channel into channel_anchor. */ + printf("Storvsc get multi-channel offer, rel=%u.\n", + new_channel->offer_msg.child_rel_id); + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + if(bootverbose) + printf("VMBUS: new multi-channel offer <%p>.\n", + new_channel); + + /*XXX add it to percpu_list */ + + new_channel->state = HV_CHANNEL_OPEN_STATE; + if (channel->sc_creation_callback != NULL) { + channel->sc_creation_callback(new_channel); + } + return; + } + hv_vmbus_free_vmbus_channel(new_channel); return; } + new_channel->state = HV_CHANNEL_OPEN_STATE; + /* * Start the process of binding this offer to the driver * (We need to set the device field before calling @@ -333,35 +377,86 @@ vmbus_channel_process_offer(void *context) new_channel->offer_msg.offer.interface_instance, new_channel); /* - * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below - * but in the "open" channel request. The ret != 0 logic below - * doesn't take into account that a channel - * may have been opened successfully - */ - - /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() * method. */ ret = hv_vmbus_child_device_register(new_channel->device); if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - hv_vmbus_free_vmbus_channel(new_channel); - } else { - /* - * This state is used to indicate a successful open - * so that when we do close the channel normally, - * we can clean up properly - */ - new_channel->state = HV_CHANNEL_OPEN_STATE; + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_free_vmbus_channel(new_channel); + } +} + +/** + * Array of device guids that are performance critical. We try to distribute + * the interrupt load for these devices across all online cpus. + */ +static const hv_guid high_perf_devices[] = { + {HV_NIC_GUID, }, + {HV_IDE_GUID, }, + {HV_SCSI_GUID, }, +}; + +enum { + PERF_CHN_NIC = 0, + PERF_CHN_IDE, + PERF_CHN_SCSI, + MAX_PERF_CHN, +}; +/* + * We use this static number to distribute the channel interrupt load. + */ +static uint32_t next_vcpu; + +/** + * Starting with Win8, we can statically distribute the incoming + * channel interrupt load by binding a channel to VCPU. We + * implement here a simple round robin scheme for distributing + * the interrupt load. + * We will bind channels that are not performance critical to cpu 0 and + * performance critical channels (IDE, SCSI and Network) will be uniformly + * distributed across all available CPUs. + */ +static void +vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) +{ + uint32_t current_cpu; + int i; + boolean_t is_perf_channel = FALSE; + + for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { + if (memcmp(guid->data, high_perf_devices[i].data, + sizeof(hv_guid)) == 0) { + is_perf_channel = TRUE; + break; + } + } + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || + (!is_perf_channel)) { + /* Host's view of guest cpu */ + channel->target_vcpu = 0; + /* Guest's own view of cpu */ + channel->target_cpu = 0; + return; } + /* mp_ncpus should have the number cpus currently online */ + current_cpu = (++next_vcpu % mp_ncpus); + channel->target_cpu = current_cpu; + channel->target_vcpu = + hv_vmbus_g_context.hv_vcpu_index[current_cpu]; + if (bootverbose) + printf("VMBUS: Total online cpus %d, assign perf channel %d " + "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, + current_cpu); } /** @@ -391,6 +486,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) if (new_channel == NULL) return; + /* + * By default we setup state to enable batched + * reading. A specific service can choose to + * disable this prior to opening the channel. + */ + new_channel->batched_reading = TRUE; + + new_channel->signal_event_param = + (hv_vmbus_input_signal_event *) + (HV_ALIGN_UP((unsigned long) + &new_channel->signal_event_buffer, + HV_HYPERCALL_PARAM_ALIGN)); + + new_channel->signal_event_param->connection_id.as_uint32_t = 0; + new_channel->signal_event_param->connection_id.u.id = + HV_VMBUS_EVENT_CONNECTION_ID; + new_channel->signal_event_param->flag_number = 0; + new_channel->signal_event_param->rsvd_z = 0; + + if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { + new_channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + new_channel->signal_event_param->connection_id.u.id = + offer->connection_id; + } + + /* + * Bind the channel to a chosen cpu. + */ + vmbus_channel_select_cpu(new_channel, + &offer->offer.interface_type); + memcpy(&new_channel->offer_msg, offer, sizeof(hv_vmbus_channel_offer_channel)); new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; @@ -666,7 +793,7 @@ hv_vmbus_release_unattached_channels(void) { hv_vmbus_channel *channel; - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) { channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor); @@ -676,5 +803,61 @@ hv_vmbus_release_unattached_channels(void) hv_vmbus_child_device_unregister(channel->device); hv_vmbus_free_vmbus_channel(channel); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); +} + +/** + * @brief Select the best outgoing channel + * + * The channel whose vcpu binding is closest to the currect vcpu will + * be selected. + * If no multi-channel, always select primary channel + * + * @param primary - primary channel + */ +struct hv_vmbus_channel * +vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) +{ + hv_vmbus_channel *new_channel = NULL; + hv_vmbus_channel *outgoing_channel = primary; + int old_cpu_distance = 0; + int new_cpu_distance = 0; + int cur_vcpu = 0; + int smp_pro_id = PCPU_GET(cpuid); + + if (TAILQ_EMPTY(&primary->sc_list_anchor)) { + return outgoing_channel; + } + + if (smp_pro_id >= MAXCPU) { + return outgoing_channel; + } + + cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id]; + + TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) { + if (new_channel->state != HV_CHANNEL_OPENED_STATE){ + continue; + } + + if (new_channel->target_vcpu == cur_vcpu){ + return new_channel; + } + + old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ? + (outgoing_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - outgoing_channel->target_vcpu)); + + new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ? + (new_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - new_channel->target_vcpu)); + + if (old_cpu_distance < new_cpu_distance) { + continue; + } + + outgoing_channel = new_channel; + } + + return(outgoing_channel); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index c8e0b48..0300828 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -45,14 +45,113 @@ hv_vmbus_connection hv_vmbus_g_connection = { .connect_state = HV_DISCONNECTED, .next_gpadl_handle = 0xE1E10, }; +uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; + +static uint32_t +hv_vmbus_get_next_version(uint32_t current_ver) +{ + switch (current_ver) { + case (HV_VMBUS_VERSION_WIN7): + return(HV_VMBUS_VERSION_WS2008); + + case (HV_VMBUS_VERSION_WIN8): + return(HV_VMBUS_VERSION_WIN7); + + case (HV_VMBUS_VERSION_WIN8_1): + return(HV_VMBUS_VERSION_WIN8); + + case (HV_VMBUS_VERSION_WS2008): + default: + return(HV_VMBUS_VERSION_INVALID); + } +} + +/** + * Negotiate the highest supported hypervisor version. + */ +static int +hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, + uint32_t version) +{ + int ret = 0; + hv_vmbus_channel_initiate_contact *msg; + + sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); + msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; + msg->vmbus_version_requested = version; + + msg->interrupt_page = hv_get_phys_addr( + hv_vmbus_g_connection.interrupt_page); + + msg->monitor_page_1 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_pages); + + msg->monitor_page_2 = + hv_get_phys_addr( + ((uint8_t *) hv_vmbus_g_connection.monitor_pages + + PAGE_SIZE)); + + /** + * Add to list before we send the request since we may receive the + * response before returning from this routine + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + msg, + sizeof(hv_vmbus_channel_initiate_contact)); + + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + return (ret); + } + + /** + * Wait for the connection response + */ + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + /** + * Check if successful + */ + if (msg_info->response.version_response.version_supported) { + hv_vmbus_g_connection.connect_state = HV_CONNECTED; + } else { + ret = ECONNREFUSED; + } + + return (ret); +} + /** * Send a connect request on the partition service connection */ int hv_vmbus_connect(void) { int ret = 0; + uint32_t version; hv_vmbus_channel_msg_info* msg_info = NULL; - hv_vmbus_channel_initiate_contact* msg; /** * Make sure we are not connecting or connected @@ -74,7 +173,7 @@ hv_vmbus_connect(void) { TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", - NULL, MTX_SPIN); + NULL, MTX_DEF); /** * Setup the vmbus event connection for channel interrupt abstraction @@ -130,71 +229,30 @@ hv_vmbus_connect(void) { goto cleanup; } - sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); - msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; - - msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; - msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; - - msg->interrupt_page = hv_get_phys_addr( - hv_vmbus_g_connection.interrupt_page); - - msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); - - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); - - /** - * Add to list before we send the request since we may receive the - * response before returning from this routine + /* + * Find the highest vmbus version number we can support. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - ret = hv_vmbus_post_message( - msg, - sizeof(hv_vmbus_channel_initiate_contact)); - - if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - goto cleanup; - } + version = HV_VMBUS_VERSION_CURRENT; + + do { + ret = hv_vmbus_negotiate_version(msg_info, version); + if (ret == EWOULDBLOCK) { + /* + * We timed out. + */ + goto cleanup; + } - /** - * Wait for the connection response - */ - ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) + break; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + version = hv_vmbus_get_next_version(version); + } while (version != HV_VMBUS_VERSION_INVALID); - /** - * Check if successful - */ - if (msg_info->response.version_response.version_supported) { - hv_vmbus_g_connection.connect_state = HV_CONNECTED; - } else { - ret = ECONNREFUSED; - goto cleanup; - } + hv_vmbus_protocal_version = version; + if (bootverbose) + printf("VMBUS: Portocal Version: %d.%d\n", + version >> 16, version & 0xFFFF); sema_destroy(&msg_info->wait_sema); free(msg_info, M_DEVBUF); @@ -286,7 +344,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { * and channels are accessed without the need to take this lock or search * the list. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { @@ -295,7 +353,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { break; } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); return (foundChannel); } @@ -306,7 +364,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { static void VmbusProcessChannelEvent(uint32_t relid) { + void* arg; + uint32_t bytes_to_read; hv_vmbus_channel* channel; + boolean_t is_batched_reading; /** * Find the channel based on this relid and invokes @@ -327,31 +388,98 @@ VmbusProcessChannelEvent(uint32_t relid) * callback to NULL. This closes the window. */ - mtx_lock(&channel->inbound_lock); + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. + * -- whu + */ + // mtx_lock(&channel->inbound_lock); if (channel->on_channel_callback != NULL) { - channel->on_channel_callback(channel->channel_callback_context); + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); } - mtx_unlock(&channel->inbound_lock); + // mtx_unlock(&channel->inbound_lock); } +#ifdef HV_DEBUG_INTR +extern uint32_t hv_intr_count; +extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; +#endif + /** * Handler for events */ void hv_vmbus_on_events(void *arg) { - int dword; int bit; + int cpu; + int dword; + void *page_addr; + uint32_t* recv_interrupt_page = NULL; int rel_id; - int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + int maxdword; + hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - /* - * receive size is 1/2 page and divide that by 4 bytes - */ - - uint32_t* recv_interrupt_page = - hv_vmbus_g_connection.recv_interrupt_page; + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " + "cpu out of range!")); + +#ifdef HV_DEBUG_INTR + int i; + hv_vmbus_swintr_event_cpu[cpu]++; + if (hv_intr_count % 10000 == 0) { + printf("VMBUS: Total interrupt %d\n", hv_intr_count); + for (i = 0; i < mp_ncpus; i++) + printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", + i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); + } +#endif + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + /* + * receive size is 1/2 page and divide that by 4 bytes + */ + recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; + } else { + /* + * On Host with Win8 or above, the event page can be + * checked directly to get the id of the channel + * that has the pending interrupt. + */ + maxdword = HV_EVENT_FLAGS_DWORD_COUNT; + page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; + event = (hv_vmbus_synic_event_flags *) + page_addr + HV_VMBUS_MESSAGE_SINT; + recv_interrupt_page = event->flags32; + } /* * Check events @@ -416,16 +544,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) { * Send an event notification to the parent */ int -hv_vmbus_set_event(uint32_t child_rel_id) { +hv_vmbus_set_event(hv_vmbus_channel *channel) { int ret = 0; + uint32_t child_rel_id = channel->offer_msg.child_rel_id; /* Each uint32_t represents 32 channels */ synch_set_bit(child_rel_id & 31, (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + (child_rel_id >> 5)))); - ret = hv_vmbus_signal_event(); + ret = hv_vmbus_signal_event(channel->signal_event_param); return (ret); } - diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index 80a1f42..84e2a5e 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax, hv_vmbus_context hv_vmbus_g_context = { .syn_ic_initialized = FALSE, .hypercall_page = NULL, - .signal_event_param = NULL, - .signal_event_buffer = NULL, }; static struct timecounter hv_timecounter = { @@ -256,28 +254,6 @@ hv_vmbus_init(void) hv_vmbus_g_context.hypercall_page = virt_addr; - /* - * Setup the global signal event param for the signal event hypercall - */ - hv_vmbus_g_context.signal_event_buffer = - malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, - M_ZERO | M_NOWAIT); - KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, - ("Error VMBUS: Failed to allocate signal_event_buffer\n")); - if (hv_vmbus_g_context.signal_event_buffer == NULL) - goto cleanup; - - hv_vmbus_g_context.signal_event_param = - (hv_vmbus_input_signal_event*) - (HV_ALIGN_UP((unsigned long) - hv_vmbus_g_context.signal_event_buffer, - HV_HYPERCALL_PARAM_ALIGN)); - hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; - hv_vmbus_g_context.signal_event_param->connection_id.u.id = - HV_VMBUS_EVENT_CONNECTION_ID; - hv_vmbus_g_context.signal_event_param->flag_number = 0; - hv_vmbus_g_context.signal_event_param->rsvd_z = 0; - tc_init(&hv_timecounter); /* register virtual timecount */ return (0); @@ -303,12 +279,6 @@ hv_vmbus_cleanup(void) { hv_vmbus_x64_msr_hypercall_contents hypercall_msr; - if (hv_vmbus_g_context.signal_event_buffer != NULL) { - free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); - hv_vmbus_g_context.signal_event_buffer = NULL; - hv_vmbus_g_context.signal_event_param = NULL; - } - if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { if (hv_vmbus_g_context.hypercall_page != NULL) { hypercall_msr.as_uint64_t = 0; @@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc( * event IPC. (This involves a hypercall.) */ hv_vmbus_status -hv_vmbus_signal_event() +hv_vmbus_signal_event(void *con_id) { hv_vmbus_status status; status = hv_vmbus_do_hypercall( HV_CALL_SIGNAL_EVENT, - hv_vmbus_g_context.signal_event_param, + con_id, 0) & 0xFFFF; return (status); @@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg) { int cpu; + uint64_t hv_vcpu_index; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; hv_vmbus_synic_scontrol sctrl; @@ -403,23 +374,14 @@ hv_vmbus_synic_init(void *arg) return; /* - * KYS: Looks like we can only initialize on cpu0; don't we support - * SMP guests? - * - * TODO: Need to add SMP support for FreeBSD V9 - */ - - if (cpu != 0) - return; - - /* * TODO: Check the version */ version = rdmsr(HV_X64_MSR_SVERSION); - - hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; - hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; + hv_vmbus_g_context.syn_ic_msg_page[cpu] = + setup_args->page_buffers[2 * cpu]; + hv_vmbus_g_context.syn_ic_event_page[cpu] = + setup_args->page_buffers[2 * cpu + 1]; /* * Setup the Synic's message page @@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg) wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ + shared_sint.as_uint64_t = 0; shared_sint.u.vector = setup_args->vector; shared_sint.u.masked = FALSE; - shared_sint.u.auto_eoi = FALSE; + shared_sint.u.auto_eoi = TRUE; wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); @@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg) hv_vmbus_g_context.syn_ic_initialized = TRUE; + /* + * Set up the cpuid mapping from Hyper-V to FreeBSD. + * The array is indexed using FreeBSD cpuid. + */ + hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); + hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; + return; } @@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg) hv_vmbus_synic_sint shared_sint; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; - int cpu = PCPU_GET(cpuid); if (!hv_vmbus_g_context.syn_ic_initialized) return; - if (cpu != 0) - return; /* TODO: XXXKYS: SMP? */ - shared_sint.as_uint64_t = rdmsr( HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c index f7c1965..5e4f52a 100644 --- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c +++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c @@ -144,6 +144,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info) return (uint64_t) ring_info->ring_buffer->write_index << 32; } +void +hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info* ring_info) +{ + ring_info->ring_buffer->interrupt_mask = 1; + mb(); +} + +uint32_t +hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info* ring_info) +{ + uint32_t read, write; + + ring_info->ring_buffer->interrupt_mask = 0; + mb(); + + /* + * Now check to see if the ring buffer is still empty. + * If it is not, we raced and we need to process new + * incoming messages. + */ + get_ring_buffer_avail_bytes(ring_info, &read, &write); + + return (read); +} + +/* + * When we write to the ring buffer, check if the host needs to + * be signaled. Here is the details of this protocol: + * + * 1. The host guarantees that while it is draining the + * ring buffer, it will set the interrupt_mask to + * indicate it does not need to be interrupted when + * new data is placed. + * + * 2. The host guarantees that it will completely drain + * the ring buffer before exiting the read loop. Further, + * once the ring buffer is empty, it will clear the + * interrupt_mask and re-check to see if new data has + * arrived. + */ +static boolean_t +hv_ring_buffer_needsig_on_write( + uint32_t old_write_location, + hv_vmbus_ring_buffer_info* rbi) +{ + mb(); + if (rbi->ring_buffer->interrupt_mask) + return (FALSE); + + /* Read memory barrier */ + rmb(); + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. + */ + if (old_write_location == rbi->ring_buffer->read_index) + return (TRUE); + + return (FALSE); +} + static uint32_t copy_to_ring_buffer( hv_vmbus_ring_buffer_info* ring_info, uint32_t start_write_offset, @@ -204,11 +267,13 @@ int hv_ring_buffer_write( hv_vmbus_ring_buffer_info* out_ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buffer_count) + uint32_t sg_buffer_count, + boolean_t *need_sig) { int i = 0; uint32_t byte_avail_to_write; uint32_t byte_avail_to_read; + uint32_t old_write_location; uint32_t total_bytes_to_write = 0; volatile uint32_t next_write_location; @@ -242,6 +307,8 @@ hv_ring_buffer_write( */ next_write_location = get_next_write_location(out_ring_info); + old_write_location = next_write_location; + for (i = 0; i < sg_buffer_count; i++) { next_write_location = copy_to_ring_buffer(out_ring_info, next_write_location, (char *) sg_buffers[i].data, @@ -258,9 +325,9 @@ hv_ring_buffer_write( (char *) &prev_indices, sizeof(uint64_t)); /* - * Make sure we flush all writes before updating the writeIndex + * Full memory barrier before upding the write index. */ - wmb(); + mb(); /* * Now, update the write location @@ -269,6 +336,9 @@ hv_ring_buffer_write( mtx_unlock_spin(&out_ring_info->ring_lock); + *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, + out_ring_info); + return (0); } diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index ca28fd5..f9432c8 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$"); #include <machine/stdarg.h> #include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/segments.h> #include <sys/pcpu.h> +#include <x86/apicvar.h> #include "hv_vmbus_priv.h" #define VMBUS_IRQ 0x5 -static struct intr_event *hv_msg_intr_event; -static struct intr_event *hv_event_intr_event; -static void *msg_swintr; -static void *event_swintr; static device_t vmbus_devp; -static void *vmbus_cookiep; -static int vmbus_rid; -struct resource *intr_res; -static int vmbus_irq = VMBUS_IRQ; static int vmbus_inited; static hv_setup_args setup_args; /* only CPU 0 supported at this time */ @@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */ * the hypervisor. */ static void -vmbus_msg_swintr(void *dummy) +vmbus_msg_swintr(void *arg) { int cpu; void* page_addr; hv_vmbus_message* msg; hv_vmbus_message* copied; - cpu = PCPU_GET(cpuid); + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " + "cpu out of range!")); + page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; @@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy) * * The purpose of this routine is to determine the type of VMBUS protocol * message to process - an event or a channel message. - * As this is an interrupt filter routine, the function runs in a very - * restricted envinronment. From the manpage for bus_setup_intr(9) - * - * In this restricted environment, care must be taken to account for all - * races. A careful analysis of races should be done as well. It is gener- - * ally cheaper to take an extra interrupt, for example, than to protect - * variables with spinlocks. Read, modify, write cycles of hardware regis- - * ters need to be carefully analyzed if other threads are accessing the - * same registers. */ -static int +static inline int hv_vmbus_isr(void *unused) { int cpu; @@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused) void* page_addr; cpu = PCPU_GET(cpuid); - /* (Temporary limit) */ - KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); /* * The Windows team has advised that we check for events @@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused) event = (hv_vmbus_synic_event_flags*) page_addr + HV_VMBUS_MESSAGE_SINT; - /* Since we are a child, we only need to check bit 0 */ - if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(event_swintr, 0); + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + /* Since we are a child, we only need to check bit 0 */ + if (synch_test_and_clear_bit(0, &event->flags32[0])) { + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + } + } else { + /* + * On host with Win8 or above, we can directly look at + * the event page. If bit n is set, we have an interrupt + * on the channel with id n. + * Directly schedule the event software interrupt on + * current cpu. + */ + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); } /* Check if there are actual msgs to be process */ @@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused) msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { - swi_sched(msg_swintr, 0); + swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); } return FILTER_HANDLED; } +#ifdef HV_DEBUG_INTR +uint32_t hv_intr_count = 0; +#endif +uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +uint32_t hv_vmbus_intr_cpu[MAXCPU]; + +void +hv_vector_handler(struct trapframe *trap_frame) +{ +#ifdef HV_DEBUG_INTR + int cpu; +#endif + + /* + * Disable preemption. + */ + critical_enter(); + +#ifdef HV_DEBUG_INTR + /* + * Do a little interrupt counting. + */ + cpu = PCPU_GET(cpuid); + hv_vmbus_intr_cpu[cpu]++; + hv_intr_count++; +#endif + + hv_vmbus_isr(NULL); + + /* + * Enable preemption. + */ + critical_exit(); +} + static int vmbus_read_ivar( device_t dev, @@ -316,6 +350,81 @@ vmbus_probe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } +#ifdef HYPERV +extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); + +/** + * @brief Find a free IDT slot and setup the interrupt handler. + */ +static int +vmbus_vector_alloc(void) +{ + int vector; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards form the highest IDT vector available for use + * as vmbus channel callback vector. We install 'hv_vmbus_callback' + * handler at that vector and use it to interrupt vcpus. + */ + vector = APIC_SPURIOUS_INT; + while (--vector >= APIC_IPI_INTS) { + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { +#ifdef __i386__ + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#else + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, + SEL_KPL, 0); +#endif + + return (vector); + } + } + return (0); +} + +/** + * @brief Restore the IDT slot to rsvd. + */ +static void +vmbus_vector_free(int vector) +{ + uintptr_t func; + struct gate_descriptor *ip; + + if (vector == 0) + return; + + KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, + ("invalid vector %d", vector)); + + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), + ("invalid vector %d", vector)); + + setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +#else /* HYPERV */ + +static int +vmbus_vector_alloc(void) +{ + return(0); +} + +static void +vmbus_vector_free(int vector) +{ +} + +#endif /* HYPERV */ + /** * @brief Main vmbus driver initialization routine. * @@ -331,22 +440,7 @@ vmbus_probe(device_t dev) { static int vmbus_bus_init(void) { - struct ioapic_intsrc { - struct intsrc io_intsrc; - u_int io_irq; - u_int io_intpin:8; - u_int io_vector:8; - u_int io_cpu:8; - u_int io_activehi:1; - u_int io_edgetrigger:1; - u_int io_masked:1; - int io_bus:4; - uint32_t io_lowreg; - }; - int i, ret; - unsigned int vector = 0; - struct intsrc *isrc; - struct ioapic_intsrc *intpin; + int i, j, n, ret; if (vmbus_inited) return (0); @@ -361,80 +455,100 @@ vmbus_bus_init(void) return (ret); } - ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, - NULL, SWI_CLOCK, 0, &msg_swintr); - - if (ret) - goto cleanup; - /* - * Message SW interrupt handler checks a per-CPU page and - * thus the thread needs to be bound to CPU-0 - which is where - * all interrupts are processed. + * Find a free IDT slot for vmbus callback. */ - ret = intr_event_bind(hv_msg_intr_event, 0); - - if (ret) - goto cleanup1; + hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); - ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, - NULL, SWI_CLOCK, 0, &event_swintr); - - if (ret) - goto cleanup1; + if (hv_vmbus_g_context.hv_cb_vector == 0) { + if(bootverbose) + printf("Error VMBUS: Cannot find free IDT slot for " + "vmbus callback!\n"); + goto cleanup; + } - intr_res = bus_alloc_resource(vmbus_devp, - SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); + if(bootverbose) + printf("VMBUS: vmbus callback vector %d\n", + hv_vmbus_g_context.hv_cb_vector); - if (intr_res == NULL) { - ret = ENOMEM; /* XXXKYS: Need a better errno */ - goto cleanup2; + /* + * Notify the hypervisor of our vector. + */ + setup_args.vector = hv_vmbus_g_context.hv_cb_vector; + + CPU_FOREACH(j) { + hv_vmbus_intr_cpu[j] = 0; + hv_vmbus_swintr_event_cpu[j] = 0; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.event_swintr[j] = NULL; + hv_vmbus_g_context.msg_swintr[j] = NULL; + + for (i = 0; i < 2; i++) + setup_args.page_buffers[2 * j + i] = NULL; } /* - * Setup interrupt filter handler + * Per cpu setup. */ - ret = bus_setup_intr(vmbus_devp, intr_res, - INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, - NULL, &vmbus_cookiep); - - if (ret != 0) - goto cleanup3; - - ret = bus_bind_intr(vmbus_devp, intr_res, 0); - if (ret != 0) - goto cleanup4; - - isrc = intr_lookup_source(vmbus_irq); - if ((isrc == NULL) || (isrc->is_event == NULL)) { - ret = EINVAL; - goto cleanup4; - } + CPU_FOREACH(j) { + /* + * Setup software interrupt thread and handler for msg handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], + "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, + &hv_vmbus_g_context.msg_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup msg swi for " + "cpu %d\n", j); + goto cleanup1; + } - /* vector = isrc->is_event->ie_vector; */ - intpin = (struct ioapic_intsrc *)isrc; - vector = intpin->io_vector; + /* + * Bind the swi thread to the cpu. + */ + ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], + j); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to bind msg swi thread " + "to cpu %d\n", j); + goto cleanup1; + } - if(bootverbose) - printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); + /* + * Setup software interrupt thread and handler for + * event handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], + "hv_event", hv_vmbus_on_events, (void *)(long)j, + SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup event swi for " + "cpu %d\n", j); + goto cleanup1; + } - /** - * Notify the hypervisor of our irq. - */ - setup_args.vector = vector; - for(i = 0; i < 2; i++) { - setup_args.page_buffers[i] = + /* + * Prepare the per cpu msg and event pages to be called on each cpu. + */ + for(i = 0; i < 2; i++) { + setup_args.page_buffers[2 * j + i] = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[i] == NULL) { - KASSERT(setup_args.page_buffers[i] != NULL, + if (setup_args.page_buffers[2 * j + i] == NULL) { + KASSERT(setup_args.page_buffers[2 * j + i] != NULL, ("Error VMBUS: malloc failed!")); - if (i > 0) - free(setup_args.page_buffers[0], M_DEVBUF); - goto cleanup4; + goto cleanup1; + } } } - /* only CPU #0 supported at this time */ + if (bootverbose) + printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", + smp_started); + smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); /* @@ -443,26 +557,32 @@ vmbus_bus_init(void) ret = hv_vmbus_connect(); if (ret != 0) - goto cleanup4; + goto cleanup1; hv_vmbus_request_channel_offers(); return (ret); - cleanup4: - + cleanup1: /* - * remove swi, bus and intr resource + * Free pages alloc'ed */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + for (n = 0; n < 2 * MAXCPU; n++) + if (setup_args.page_buffers[n] != NULL) + free(setup_args.page_buffers[n], M_DEVBUF); - cleanup3: - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); - - cleanup2: - swi_remove(event_swintr); + /* + * remove swi and vmbus callback vector; + */ + CPU_FOREACH(j) { + if (hv_vmbus_g_context.msg_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[j]); + if (hv_vmbus_g_context.event_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[j]); + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + } - cleanup1: - swi_remove(msg_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); cleanup: hv_vmbus_cleanup(); @@ -515,20 +635,24 @@ vmbus_bus_exit(void) smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); - for(i = 0; i < 2; i++) { + for(i = 0; i < 2 * MAXCPU; i++) { if (setup_args.page_buffers[i] != 0) free(setup_args.page_buffers[i], M_DEVBUF); } hv_vmbus_cleanup(); - /* remove swi, bus and intr resource */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); - - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); + /* remove swi */ + CPU_FOREACH(i) { + if (hv_vmbus_g_context.msg_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[i]); + if (hv_vmbus_g_context.event_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[i]); + hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; + hv_vmbus_g_context.hv_event_intr_event[i] = NULL; + } - swi_remove(msg_swintr); - swi_remove(event_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); return; } @@ -603,6 +727,6 @@ devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); MODULE_VERSION(vmbus,1); -/* TODO: We want to be earlier than SI_SUB_VFS */ -SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); +/* We want to be started after SMP is initialized */ +SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 6bc875d..faa6dec 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -181,49 +181,30 @@ enum { #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) -/* - * Connection identifier type - */ -typedef union { - uint32_t as_uint32_t; - struct { - uint32_t id:24; - uint32_t reserved:8; - } u; - -} __packed hv_vmbus_connection_id; - -/* - * Definition of the hv_vmbus_signal_event hypercall input structure - */ -typedef struct { - hv_vmbus_connection_id connection_id; - uint16_t flag_number; - uint16_t rsvd_z; -} __packed hv_vmbus_input_signal_event; - -typedef struct { - uint64_t align8; - hv_vmbus_input_signal_event event; -} __packed hv_vmbus_input_signal_event_buffer; - typedef struct { uint64_t guest_id; void* hypercall_page; hv_bool_uint8_t syn_ic_initialized; + + hv_vmbus_handle syn_ic_msg_page[MAXCPU]; + hv_vmbus_handle syn_ic_event_page[MAXCPU]; /* - * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. - * The input param is immutable in our usage and - * must be dynamic mem (vs stack or global). + * For FreeBSD cpuid to Hyper-V vcpuid mapping. */ - hv_vmbus_input_signal_event_buffer *signal_event_buffer; + uint32_t hv_vcpu_index[MAXCPU]; /* - * 8-bytes aligned of the buffer above + * Each cpu has its own software interrupt handler for channel + * event and msg handling. */ - hv_vmbus_input_signal_event *signal_event_param; - - hv_vmbus_handle syn_ic_msg_page[MAXCPU]; - hv_vmbus_handle syn_ic_event_page[MAXCPU]; + struct intr_event *hv_event_intr_event[MAXCPU]; + struct intr_event *hv_msg_intr_event[MAXCPU]; + void *event_swintr[MAXCPU]; + void *msg_swintr[MAXCPU]; + /* + * Host use this vector to intrrupt guest for vmbus channel + * event and msg. + */ + unsigned int hv_cb_vector; } hv_vmbus_context; /* @@ -368,7 +349,8 @@ typedef struct { TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** - * List of channels + * List of primary channels. Sub channels will be linked + * under their primary channel. */ TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; struct mtx channel_lock; @@ -560,6 +542,8 @@ typedef union { uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; } hv_vmbus_synic_event_flags; +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX (0x40000002) /* * Define synthetic interrupt controller model specific registers @@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup( int hv_ring_buffer_write( hv_vmbus_ring_buffer_info *ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buff_count); + uint32_t sg_buff_count, + boolean_t *need_sig); int hv_ring_buffer_peek( hv_vmbus_ring_buffer_info *ring_info, @@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info( hv_vmbus_ring_buffer_info *ring_info, char *prefix); +void hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info *ring_info); + +uint32_t hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info *ring_info); + hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); void hv_vmbus_on_channel_message(void *context); @@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc( void *payload, size_t payload_size); -uint16_t hv_vmbus_signal_event(void); +uint16_t hv_vmbus_signal_event(void *con_id); void hv_vmbus_synic_init(void *irq_arg); void hv_vmbus_synic_cleanup(void *arg); int hv_vmbus_query_hypervisor_presence(void); @@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id); int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); -int hv_vmbus_set_event(uint32_t child_rel_id); +int hv_vmbus_set_event(hv_vmbus_channel *channel); void hv_vmbus_on_events(void *); @@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id( typedef struct { unsigned int vector; - void *page_buffers[2]; + void *page_buffers[2 * MAXCPU]; } hv_setup_args; #endif /* __HYPERV_PRIV_H__ */ diff --git a/sys/dev/ichsmb/ichsmb_pci.c b/sys/dev/ichsmb/ichsmb_pci.c index 5c1ef54..ae8b179 100644 --- a/sys/dev/ichsmb/ichsmb_pci.c +++ b/sys/dev/ichsmb/ichsmb_pci.c @@ -88,6 +88,7 @@ __FBSDID("$FreeBSD$"); #define ID_AVOTON 0x1f3c8086 #define ID_COLETOCRK 0x23B08086 #define ID_LPT 0x8c228086 +#define ID_LPTLP 0x9c228086 #define ID_WCPT 0x8ca28086 #define ID_WCPTLP 0x9ca28086 @@ -199,6 +200,9 @@ ichsmb_pci_probe(device_t dev) case ID_LPT: device_set_desc(dev, "Intel Lynx Point SMBus controller"); break; + case ID_LPTLP: + device_set_desc(dev, "Intel Lynx Point-LP SMBus controller"); + break; case ID_WCPT: device_set_desc(dev, "Intel Wildcat Point SMBus controller"); break; diff --git a/sys/dev/iicbus/pcf8563.c b/sys/dev/iicbus/pcf8563.c index 88307a5..556943a 100644 --- a/sys/dev/iicbus/pcf8563.c +++ b/sys/dev/iicbus/pcf8563.c @@ -1,5 +1,6 @@ /*- * Copyright (c) 2012 Marius Strobl <marius@FreeBSD.org> + * Copyright (c) 2015 Juraj Lutter * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +32,8 @@ __FBSDID("$FreeBSD$"); * Driver for NXP PCF8563 real-time clock/calendar */ +#include "opt_platform.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> @@ -41,6 +44,11 @@ __FBSDID("$FreeBSD$"); #include <dev/iicbus/iicbus.h> #include <dev/iicbus/iiconf.h> #include <dev/iicbus/pcf8563reg.h> +#ifdef FDT +#include <dev/ofw/openfirm.h> +#include <dev/ofw/ofw_bus.h> +#include <dev/ofw/ofw_bus_subr.h> +#endif #include "clock_if.h" #include "iicbus_if.h" @@ -48,6 +56,7 @@ __FBSDID("$FreeBSD$"); #define PCF8563_NCLOCKREGS (PCF8563_R_YEAR - PCF8563_R_CS1 + 1) struct pcf8563_softc { + struct intr_config_hook enum_hook; uint32_t sc_flags; #define PCF8563_CPOL (1 << 0) /* PCF8563_R_MONTH_C means 19xx */ uint16_t sc_addr; /* PCF8563 slave address */ @@ -58,30 +67,62 @@ static device_attach_t pcf8563_attach; static device_probe_t pcf8563_probe; static clock_gettime_t pcf8563_gettime; static clock_settime_t pcf8563_settime; +static void pcf8563_start(void *); static int pcf8563_probe(device_t dev) { +#ifdef FDT + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + if (!ofw_bus_is_compatible(dev, "nxp,pcf8563") && + !ofw_bus_is_compatible(dev, "philips,pcf8563") && + !ofw_bus_is_compatible(dev, "pcf8563")) + return (ENXIO); +#endif device_set_desc(dev, "NXP PCF8563 RTC"); - return (BUS_PROBE_NOWILDCARD); + + return (BUS_PROBE_DEFAULT); } static int pcf8563_attach(device_t dev) { + struct pcf8563_softc *sc; + + sc = device_get_softc(dev); + sc->sc_addr = iicbus_get_addr(dev); + if (sc->sc_addr == 0) + sc->sc_addr = PCF8563_ADDR; + sc->sc_year0 = 1900; + sc->enum_hook.ich_func = pcf8563_start; + sc->enum_hook.ich_arg = dev; + + /* + * We have to wait until interrupts are enabled. Sometimes I2C read + * and write only works when the interrupts are available. + */ + if (config_intrhook_establish(&sc->enum_hook) != 0) + return (ENOMEM); + + return (0); +} + +static void +pcf8563_start(void *xdev) +{ + device_t dev; uint8_t reg = PCF8563_R_SECOND, val; struct iic_msg msgs[] = { { 0, IIC_M_WR, sizeof(reg), ® }, { 0, IIC_M_RD, sizeof(val), &val } }; struct pcf8563_softc *sc; - int error; + dev = (device_t)xdev; sc = device_get_softc(dev); - sc->sc_addr = iicbus_get_addr(dev); - if (sc->sc_addr == 0) - sc->sc_addr = PCF8563_ADDR; + config_intrhook_disestablish(&sc->enum_hook); /* * NB: PCF8563_R_SECOND_VL doesn't automatically clear when VDD @@ -94,18 +135,14 @@ pcf8563_attach(device_t dev) * as a side-effect. */ msgs[0].slave = msgs[1].slave = sc->sc_addr; - error = iicbus_transfer(device_get_parent(dev), msgs, sizeof(msgs) / - sizeof(*msgs)); - if (error != 0) { + if (iicbus_transfer(dev, msgs, nitems(msgs)) != 0) { device_printf(dev, "%s: cannot read RTC\n", __func__); - return (error); + return; } if ((val & PCF8563_R_SECOND_VL) != 0) device_printf(dev, "%s: battery low\n", __func__); - sc->sc_year0 = 1900; clock_register(dev, 1000000); /* 1 second resolution */ - return (0); } static int @@ -122,8 +159,7 @@ pcf8563_gettime(device_t dev, struct timespec *ts) sc = device_get_softc(dev); msgs[0].slave = msgs[1].slave = sc->sc_addr; - error = iicbus_transfer(device_get_parent(dev), msgs, sizeof(msgs) / - sizeof(*msgs)); + error = iicbus_transfer(dev, msgs, nitems(msgs)); if (error != 0) { device_printf(dev, "%s: cannot read RTC\n", __func__); return (error); @@ -145,6 +181,7 @@ pcf8563_gettime(device_t dev, struct timespec *ts) sc->sc_flags |= PCF8563_CPOL; } else if (ct.year < 100 + sc->sc_year0) sc->sc_flags |= PCF8563_CPOL; + return (clock_ct_to_ts(&ct, ts)); } @@ -180,10 +217,10 @@ pcf8563_settime(device_t dev, struct timespec *ts) val[PCF8563_R_MONTH] |= PCF8563_R_MONTH_C; msgs[0].slave = sc->sc_addr; - error = iicbus_transfer(device_get_parent(dev), msgs, sizeof(msgs) / - sizeof(*msgs)); + error = iicbus_transfer(dev, msgs, nitems(msgs)); if (error != 0) device_printf(dev, "%s: cannot write RTC\n", __func__); + return (error); } diff --git a/sys/dev/ipmi/ipmi.c b/sys/dev/ipmi/ipmi.c index a1edbf6..8101717 100644 --- a/sys/dev/ipmi/ipmi.c +++ b/sys/dev/ipmi/ipmi.c @@ -756,17 +756,22 @@ ipmi_startup(void *arg) } device_printf(dev, "Number of channels %d\n", i); - /* probe for watchdog */ - IPMI_INIT_DRIVER_REQUEST(req, IPMI_ADDR(IPMI_APP_REQUEST, 0), - IPMI_GET_WDOG, 0, 0); + /* + * Probe for watchdog, but only for backends which support + * polled driver requests. + */ + if (sc->ipmi_driver_requests_polled) { + IPMI_INIT_DRIVER_REQUEST(req, IPMI_ADDR(IPMI_APP_REQUEST, 0), + IPMI_GET_WDOG, 0, 0); - ipmi_submit_driver_request(sc, req, 0); + ipmi_submit_driver_request(sc, req, 0); - if (req->ir_compcode == 0x00) { - device_printf(dev, "Attached watchdog\n"); - /* register the watchdog event handler */ - sc->ipmi_watchdog_tag = EVENTHANDLER_REGISTER(watchdog_list, - ipmi_wd_event, sc, 0); + if (req->ir_compcode == 0x00) { + device_printf(dev, "Attached watchdog\n"); + /* register the watchdog event handler */ + sc->ipmi_watchdog_tag = EVENTHANDLER_REGISTER( + watchdog_list, ipmi_wd_event, sc, 0); + } } sc->ipmi_cdev = make_dev(&ipmi_cdevsw, device_get_unit(dev), diff --git a/sys/dev/ipmi/ipmi_kcs.c b/sys/dev/ipmi/ipmi_kcs.c index 1c58646..864e9a0 100644 --- a/sys/dev/ipmi/ipmi_kcs.c +++ b/sys/dev/ipmi/ipmi_kcs.c @@ -520,6 +520,7 @@ ipmi_kcs_attach(struct ipmi_softc *sc) sc->ipmi_startup = kcs_startup; sc->ipmi_enqueue_request = ipmi_polled_enqueue_request; sc->ipmi_driver_request = kcs_driver_request; + sc->ipmi_driver_requests_polled = 1; /* See if we can talk to the controller. */ status = INB(sc, KCS_CTL_STS); diff --git a/sys/dev/ipmi/ipmi_smic.c b/sys/dev/ipmi/ipmi_smic.c index 4e26553..92cf14e 100644 --- a/sys/dev/ipmi/ipmi_smic.c +++ b/sys/dev/ipmi/ipmi_smic.c @@ -415,6 +415,7 @@ ipmi_smic_attach(struct ipmi_softc *sc) sc->ipmi_startup = smic_startup; sc->ipmi_enqueue_request = ipmi_polled_enqueue_request; sc->ipmi_driver_request = smic_driver_request; + sc->ipmi_driver_requests_polled = 1; /* See if we can talk to the controller. */ flags = INB(sc, SMIC_FLAGS); diff --git a/sys/dev/ipmi/ipmivars.h b/sys/dev/ipmi/ipmivars.h index 9d7dc32..9a0b435 100644 --- a/sys/dev/ipmi/ipmivars.h +++ b/sys/dev/ipmi/ipmivars.h @@ -105,6 +105,7 @@ struct ipmi_softc { int ipmi_opened; struct cdev *ipmi_cdev; TAILQ_HEAD(,ipmi_request) ipmi_pending_requests; + int ipmi_driver_requests_polled; eventhandler_tag ipmi_watchdog_tag; int ipmi_watchdog_active; struct intr_config_hook ipmi_ich; diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index 48ff9d1..b9bba31 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -54,7 +54,7 @@ int ixgbe_display_debug_stats = 0; /********************************************************************* * Driver version *********************************************************************/ -char ixgbe_driver_version[] = "2.7.4"; +char ixgbe_driver_version[] = "2.8.3"; /********************************************************************* * PCI Device ID Table @@ -117,6 +117,8 @@ static int ixgbe_probe(device_t); static int ixgbe_attach(device_t); static int ixgbe_detach(device_t); static int ixgbe_shutdown(device_t); +static int ixgbe_suspend(device_t); +static int ixgbe_resume(device_t); static int ixgbe_ioctl(struct ifnet *, u_long, caddr_t); static void ixgbe_init(void *); static void ixgbe_init_locked(struct adapter *); @@ -136,7 +138,12 @@ static int ixgbe_setup_msix(struct adapter *); static void ixgbe_free_pci_resources(struct adapter *); static void ixgbe_local_timer(void *); static int ixgbe_setup_interface(device_t, struct adapter *); +static void ixgbe_config_dmac(struct adapter *); +static void ixgbe_config_delay_values(struct adapter *); static void ixgbe_config_link(struct adapter *); +static void ixgbe_check_eee_support(struct adapter *); +static void ixgbe_check_wol_support(struct adapter *); +static int ixgbe_setup_low_power_mode(struct adapter *); static void ixgbe_rearm_queues(struct adapter *, u64); static void ixgbe_initialize_transmit_units(struct adapter *); @@ -150,9 +157,6 @@ static void ixgbe_update_stats_counters(struct adapter *); static void ixgbe_set_promisc(struct adapter *); static void ixgbe_set_multi(struct adapter *); static void ixgbe_update_link_status(struct adapter *); -static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS); -static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS); -static int ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS); static void ixgbe_set_ivar(struct adapter *, u8, u8, s8); static void ixgbe_configure_ivars(struct adapter *); static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); @@ -161,7 +165,22 @@ static void ixgbe_setup_vlan_hw_support(struct adapter *); static void ixgbe_register_vlan(void *, struct ifnet *, u16); static void ixgbe_unregister_vlan(void *, struct ifnet *, u16); -static void ixgbe_add_hw_stats(struct adapter *adapter); +static void ixgbe_add_device_sysctls(struct adapter *); +static void ixgbe_add_hw_stats(struct adapter *); + +/* Sysctl handlers */ +static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS); +static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_thermal_test(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_dmac(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_phy_temp(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_phy_overtemp_occurred(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_eee_enable(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_eee_negotiated(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_eee_rx_lpi_status(SYSCTL_HANDLER_ARGS); +static int ixgbe_sysctl_eee_tx_lpi_status(SYSCTL_HANDLER_ARGS); /* Support for pluggable optic modules */ static bool ixgbe_sfp_probe(struct adapter *); @@ -179,15 +198,12 @@ static void ixgbe_handle_que(void *, int); static void ixgbe_handle_link(void *, int); static void ixgbe_handle_msf(void *, int); static void ixgbe_handle_mod(void *, int); +static void ixgbe_handle_phy(void *, int); #ifdef IXGBE_FDIR static void ixgbe_reinit_fdir(void *, int); #endif - -/* Missing shared code prototype */ -extern void ixgbe_stop_mac_link_on_d3_82599(struct ixgbe_hw *hw); - /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ @@ -198,6 +214,8 @@ static device_method_t ix_methods[] = { DEVMETHOD(device_attach, ixgbe_attach), DEVMETHOD(device_detach, ixgbe_detach), DEVMETHOD(device_shutdown, ixgbe_shutdown), + DEVMETHOD(device_suspend, ixgbe_suspend), + DEVMETHOD(device_resume, ixgbe_resume), DEVMETHOD_END }; @@ -404,32 +422,6 @@ ixgbe_attach(device_t dev) /* Core Lock Init*/ IXGBE_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); - /* SYSCTL APIs */ - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), - SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "fc", CTLTYPE_INT | CTLFLAG_RW, - adapter, 0, ixgbe_set_flowcntl, "I", IXGBE_SYSCTL_DESC_SET_FC); - - SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), - SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "enable_aim", CTLFLAG_RW, - &ixgbe_enable_aim, 1, "Interrupt Moderation"); - - /* - ** Allow a kind of speed control by forcing the autoneg - ** advertised speed list to only a certain value, this - ** supports 1G on 82599 devices, and 100Mb on x540. - */ - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), - SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "advertise_speed", CTLTYPE_INT | CTLFLAG_RW, - adapter, 0, ixgbe_set_advertise, "I", IXGBE_SYSCTL_DESC_ADV_SPEED); - - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), - SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "ts", CTLTYPE_INT | CTLFLAG_RW, adapter, - 0, ixgbe_set_thermal_test, "I", "Thermal Test"); - /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); @@ -559,22 +551,26 @@ ixgbe_attach(device_t dev) adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixgbe_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); - /* - ** Check PCIE slot type/speed/width - */ + /* Check PCIE slot type/speed/width */ ixgbe_get_slot_info(hw); /* Set an initial default flow control value */ adapter->fc = ixgbe_fc_full; + /* Check for certain supported features */ + ixgbe_check_wol_support(adapter); + ixgbe_check_eee_support(adapter); + + /* Add sysctls */ + ixgbe_add_device_sysctls(adapter); + ixgbe_add_hw_stats(adapter); + /* let hardware know driver is loaded */ ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); - ixgbe_add_hw_stats(adapter); - #ifdef DEV_NETMAP ixgbe_netmap_attach(adapter); #endif /* DEV_NETMAP */ @@ -618,8 +614,9 @@ ixgbe_detach(device_t dev) return (EBUSY); } + /* Stop the adapter */ IXGBE_CORE_LOCK(adapter); - ixgbe_stop(adapter); + ixgbe_setup_low_power_mode(adapter); IXGBE_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { @@ -637,6 +634,7 @@ ixgbe_detach(device_t dev) taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->mod_task); taskqueue_drain(adapter->tq, &adapter->msf_task); + taskqueue_drain(adapter->tq, &adapter->phy_task); #ifdef IXGBE_FDIR taskqueue_drain(adapter->tq, &adapter->fdir_task); #endif @@ -681,9 +679,77 @@ static int ixgbe_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); + int error = 0; + + INIT_DEBUGOUT("ixgbe_shutdown: begin"); + IXGBE_CORE_LOCK(adapter); - ixgbe_stop(adapter); + error = ixgbe_setup_low_power_mode(adapter); + IXGBE_CORE_UNLOCK(adapter); + + return (error); +} + +/** + * Methods for going from: + * D0 -> D3: ixgbe_suspend + * D3 -> D0: ixgbe_resume + */ +static int +ixgbe_suspend(device_t dev) +{ + struct adapter *adapter = device_get_softc(dev); + int error = 0; + + INIT_DEBUGOUT("ixgbe_suspend: begin"); + + IXGBE_CORE_LOCK(adapter); + + error = ixgbe_setup_low_power_mode(adapter); + + /* Save state and power down */ + pci_save_state(dev); + pci_set_powerstate(dev, PCI_POWERSTATE_D3); + + IXGBE_CORE_UNLOCK(adapter); + + return (error); +} + +static int +ixgbe_resume(device_t dev) +{ + struct adapter *adapter = device_get_softc(dev); + struct ifnet *ifp = adapter->ifp; + struct ixgbe_hw *hw = &adapter->hw; + u32 wus; + + INIT_DEBUGOUT("ixgbe_resume: begin"); + + IXGBE_CORE_LOCK(adapter); + + pci_set_powerstate(dev, PCI_POWERSTATE_D0); + pci_restore_state(dev); + + /* Read & clear WUS register */ + wus = IXGBE_READ_REG(hw, IXGBE_WUS); + if (wus) + device_printf(dev, "Woken up by (WUS): %#010x\n", + IXGBE_READ_REG(hw, IXGBE_WUS)); + IXGBE_WRITE_REG(hw, IXGBE_WUS, 0xffffffff); + /* And clear WUFC until next low-power transition */ + IXGBE_WRITE_REG(hw, IXGBE_WUFC, 0); + + /* + * Required after D3->D0 transition; + * will re-advertise all previous advertised speeds + */ + if (ifp->if_flags & IFF_UP) + ixgbe_init_locked(adapter); + IXGBE_CORE_UNLOCK(adapter); + + INIT_DEBUGOUT("ixgbe_resume: end"); return (0); } @@ -736,13 +802,13 @@ ixgbe_ioctl(struct ifnet * ifp, u_long command, caddr_t data) break; case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); - if (ifr->ifr_mtu > IXGBE_MAX_FRAME_SIZE - ETHER_HDR_LEN) { + if (ifr->ifr_mtu > IXGBE_MAX_MTU) { error = EINVAL; } else { IXGBE_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = - ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; + ifp->if_mtu + IXGBE_MTU_HDR; ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } @@ -891,7 +957,7 @@ ixgbe_init_locked(struct adapter *adapter) /* Prepare transmit descriptors and buffers */ if (ixgbe_setup_transmit_structures(adapter)) { - device_printf(dev,"Could not setup transmit structures\n"); + device_printf(dev, "Could not setup transmit structures\n"); ixgbe_stop(adapter); return; } @@ -917,7 +983,7 @@ ixgbe_init_locked(struct adapter *adapter) /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(adapter)) { - device_printf(dev,"Could not setup receive structures\n"); + device_printf(dev, "Could not setup receive structures\n"); ixgbe_stop(adapter); return; } @@ -932,11 +998,16 @@ ixgbe_init_locked(struct adapter *adapter) /* Add for Module detection */ if (hw->mac.type == ixgbe_mac_82599EB) - gpie |= IXGBE_SDP2_GPIEN_BY_MAC(hw); + gpie |= IXGBE_SDP2_GPIEN; - /* Thermal Failure Detection */ - if (hw->mac.type == ixgbe_mac_X540) - gpie |= IXGBE_SDP0_GPIEN_BY_MAC(hw); + /* + * Thermal Failure Detection (X540) + * Link Detection (X552) + */ + if (hw->mac.type == ixgbe_mac_X540 || + hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP || + hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) + gpie |= IXGBE_SDP0_GPIEN_X540; if (adapter->msix > 1) { /* Enable Enhanced MSIX mode */ @@ -948,6 +1019,7 @@ ixgbe_init_locked(struct adapter *adapter) /* Set MTU size */ if (ifp->if_mtu > ETHERMTU) { + /* aka IXGBE_MAXFRS on 82599 and newer */ mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; @@ -955,7 +1027,6 @@ ixgbe_init_locked(struct adapter *adapter) } /* Now enable all the queues */ - for (int i = 0; i < adapter->num_queues; i++) { txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i)); txdctl |= IXGBE_TXDCTL_ENABLE; @@ -1072,55 +1143,25 @@ ixgbe_init_locked(struct adapter *adapter) /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EITR(adapter->vector), IXGBE_LINK_ITR); + /* Configure Energy Efficient Ethernet for supported devices */ + if (adapter->eee_support) + ixgbe_setup_eee(hw, adapter->eee_enabled); + /* Config/Enable Link */ ixgbe_config_link(adapter); /* Hardware Packet Buffer & Flow Control setup */ - { - u32 rxpb, frame, size, tmp; - - frame = adapter->max_frame_size; + ixgbe_config_delay_values(adapter); - /* Calculate High Water */ - switch (hw->mac.type) { - case ixgbe_mac_X540: - case ixgbe_mac_X550: - case ixgbe_mac_X550EM_a: - case ixgbe_mac_X550EM_x: - tmp = IXGBE_DV_X540(frame, frame); - break; - default: - tmp = IXGBE_DV(frame, frame); - break; - } - size = IXGBE_BT2KB(tmp); - rxpb = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(0)) >> 10; - hw->fc.high_water[0] = rxpb - size; - - /* Now calculate Low Water */ - switch (hw->mac.type) { - case ixgbe_mac_X540: - case ixgbe_mac_X550: - case ixgbe_mac_X550EM_a: - case ixgbe_mac_X550EM_x: - tmp = IXGBE_LOW_DV_X540(frame); - break; - default: - tmp = IXGBE_LOW_DV(frame); - break; - } - hw->fc.low_water[0] = IXGBE_BT2KB(tmp); - - hw->fc.requested_mode = adapter->fc; - hw->fc.pause_time = IXGBE_FC_PAUSE; - hw->fc.send_xon = TRUE; - } /* Initialize the FC settings */ ixgbe_start_hw(hw); /* Set up VLAN support and filter */ ixgbe_setup_vlan_hw_support(adapter); + /* Setup DMA Coalescing */ + ixgbe_config_dmac(adapter); + /* And now turn on interrupts */ ixgbe_enable_intr(adapter); @@ -1141,6 +1182,46 @@ ixgbe_init(void *arg) return; } +static void +ixgbe_config_delay_values(struct adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + u32 rxpb, frame, size, tmp; + + frame = adapter->max_frame_size; + + /* Calculate High Water */ + switch (hw->mac.type) { + case ixgbe_mac_X540: + case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: + tmp = IXGBE_DV_X540(frame, frame); + break; + default: + tmp = IXGBE_DV(frame, frame); + break; + } + size = IXGBE_BT2KB(tmp); + rxpb = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(0)) >> 10; + hw->fc.high_water[0] = rxpb - size; + + /* Now calculate Low Water */ + switch (hw->mac.type) { + case ixgbe_mac_X540: + case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: + tmp = IXGBE_LOW_DV_X540(frame); + break; + default: + tmp = IXGBE_LOW_DV(frame); + break; + } + hw->fc.low_water[0] = IXGBE_BT2KB(tmp); + + hw->fc.requested_mode = adapter->fc; + hw->fc.pause_time = IXGBE_FC_PAUSE; + hw->fc.send_xon = TRUE; +} /* ** @@ -1195,9 +1276,10 @@ ixgbe_handle_que(void *context, int pending) struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; + bool more; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ixgbe_rxeof(que); + more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifndef IXGBE_LEGACY_TX @@ -1270,6 +1352,11 @@ ixgbe_legacy_irq(void *arg) if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); + /* External PHY interrupt */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T && + (reg_eicr & IXGBE_EICR_GPI_SDP0_X540)) + taskqueue_enqueue(adapter->tq, &adapter->phy_task); + if (more) taskqueue_enqueue(que->tq, &que->que_task); else @@ -1378,9 +1465,9 @@ ixgbe_msix_link(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; - u32 reg_eicr; + u32 reg_eicr, mod_mask; - ++adapter->vector_irq; + ++adapter->link_irq; /* First get the cause */ reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICS); @@ -1408,42 +1495,46 @@ ixgbe_msix_link(void *arg) device_printf(adapter->dev, "\nCRITICAL: ECC ERROR!! " "Please Reboot!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC); - } else + } - if (ixgbe_is_sfp(hw)) { - if (reg_eicr & IXGBE_EICR_GPI_SDP1) { - IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1_BY_MAC(hw)); - taskqueue_enqueue(adapter->tq, &adapter->msf_task); - } else if (reg_eicr & IXGBE_EICR_GPI_SDP2) { - IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP2_BY_MAC(hw)); - taskqueue_enqueue(adapter->tq, &adapter->mod_task); - } + /* Check for over temp condition */ + if (reg_eicr & IXGBE_EICR_TS) { + device_printf(adapter->dev, "\nCRITICAL: OVER TEMP!! " + "PHY IS SHUT DOWN!!\n"); + device_printf(adapter->dev, "System shutdown required!\n"); + IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_TS); } - } + } + + /* Pluggable optics-related interrupt */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP) + mod_mask = IXGBE_EICR_GPI_SDP0_X540; + else + mod_mask = IXGBE_EICR_GPI_SDP2_BY_MAC(hw); + + if (ixgbe_is_sfp(hw)) { + if (reg_eicr & IXGBE_EICR_GPI_SDP1_BY_MAC(hw)) { + IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1_BY_MAC(hw)); + taskqueue_enqueue(adapter->tq, &adapter->msf_task); + } else if (reg_eicr & mod_mask) { + IXGBE_WRITE_REG(hw, IXGBE_EICR, mod_mask); + taskqueue_enqueue(adapter->tq, &adapter->mod_task); + } + } /* Check for fan failure */ if ((hw->device_id == IXGBE_DEV_ID_82598AT) && - (reg_eicr & IXGBE_EICR_GPI_SDP1_BY_MAC(hw))) { + (reg_eicr & IXGBE_EICR_GPI_SDP1)) { + IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); - IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1_BY_MAC(hw)); } - /* Check for over temp condition */ - switch (hw->mac.type) { - case ixgbe_mac_X540: - case ixgbe_mac_X550: - case ixgbe_mac_X550EM_a: - if (reg_eicr & IXGBE_EICR_TS) { - device_printf(adapter->dev, "\nCRITICAL: OVER TEMP!! " - "PHY IS SHUT DOWN!!\n"); - device_printf(adapter->dev, "System shutdown required\n"); - IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_TS); - } - break; - default: - /* Other MACs have no thermal sensor interrupt */ - break; + /* External PHY interrupt */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T && + (reg_eicr & IXGBE_EICR_GPI_SDP0_X540)) { + IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP0_X540); + taskqueue_enqueue(adapter->tq, &adapter->phy_task); } IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_OTHER); @@ -1542,20 +1633,26 @@ ixgbe_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: - ifmr->ifm_active |= IFM_10_T | IFM_FDX; + ifmr->ifm_active |= IFM_10G_SR | IFM_FDX; + break; + case IXGBE_LINK_SPEED_2_5GB_FULL: + ifmr->ifm_active |= IFM_2500_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: - ifmr->ifm_active |= IFM_10_5 | IFM_FDX; + ifmr->ifm_active |= IFM_1000_CX | IFM_FDX; break; } - if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4 + else if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4 || layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: - ifmr->ifm_active |= IFM_10_2 | IFM_FDX; + ifmr->ifm_active |= IFM_10G_CX4 | IFM_FDX; + break; + case IXGBE_LINK_SPEED_2_5GB_FULL: + ifmr->ifm_active |= IFM_2500_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: - ifmr->ifm_active |= IFM_10_5 | IFM_FDX; + ifmr->ifm_active |= IFM_1000_CX | IFM_FDX; break; } @@ -1564,10 +1661,12 @@ ixgbe_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) ifmr->ifm_active |= IFM_UNKNOWN; #if __FreeBSD_version >= 900025 - /* Flow control setting */ - if (adapter->fc == ixgbe_fc_rx_pause || adapter->fc == ixgbe_fc_full) + /* Display current flow control setting used on link */ + if (hw->fc.current_mode == ixgbe_fc_rx_pause || + hw->fc.current_mode == ixgbe_fc_full) ifmr->ifm_active |= IFM_ETH_RXPAUSE; - if (adapter->fc == ixgbe_fc_tx_pause || adapter->fc == ixgbe_fc_full) + if (hw->fc.current_mode == ixgbe_fc_tx_pause || + hw->fc.current_mode == ixgbe_fc_full) ifmr->ifm_active |= IFM_ETH_TXPAUSE; #endif @@ -1597,21 +1696,22 @@ ixgbe_media_change(struct ifnet * ifp) if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); + if (hw->phy.media_type == ixgbe_media_type_backplane) + return (EPERM); + /* ** We don't actually need to check against the supported ** media types of the adapter; ifmedia will take care of ** that for us. - ** NOTE: this relies on falling thru the switch - ** to get all the values set, it can be confusing. */ switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: case IFM_10G_T: speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_10G_LRM: - case IFM_10G_SR: /* KR, too */ + case IFM_10G_SR: /* KR, too */ case IFM_10G_LR: - case IFM_10G_CX4: /* KX4 for now */ + case IFM_10G_CX4: /* KX4 */ speed |= IXGBE_LINK_SPEED_1GB_FULL; case IFM_10G_TWINAX: speed |= IXGBE_LINK_SPEED_10GB_FULL; @@ -1620,7 +1720,7 @@ ixgbe_media_change(struct ifnet * ifp) speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_1000_LX: case IFM_1000_SX: - case IFM_1000_CX: /* KX until there's real support */ + case IFM_1000_CX: /* KX */ speed |= IXGBE_LINK_SPEED_1GB_FULL; break; case IFM_100_TX: @@ -1640,7 +1740,7 @@ ixgbe_media_change(struct ifnet * ifp) return (0); invalid: - device_printf(adapter->dev, "Invalid media type\n"); + device_printf(adapter->dev, "Invalid media type!\n"); return (EINVAL); } @@ -1865,7 +1965,6 @@ ixgbe_update_link_status(struct adapter *adapter) struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; - if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) @@ -1875,6 +1974,8 @@ ixgbe_update_link_status(struct adapter *adapter) adapter->link_active = TRUE; /* Update any Flow Control changes */ ixgbe_fc_enable(&adapter->hw); + /* Update DMA coalescing config */ + ixgbe_config_dmac(adapter); if_link_state_change(ifp, LINK_STATE_UP); } } else { /* Link down */ @@ -1961,7 +2062,7 @@ ixgbe_identify_hardware(struct adapter *adapter) /* We need this here to set the num_segs below */ ixgbe_set_mac_type(hw); - /* Pick up the 82599 and VF settings */ + /* Pick up the 82599 settings */ if (hw->mac.type != ixgbe_mac_82598EB) { hw->phy.smart_speed = ixgbe_smart_speed; adapter->num_segs = IXGBE_82599_SCATTER; @@ -2071,6 +2172,7 @@ ixgbe_allocate_legacy(struct adapter *adapter) TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); + TASK_INIT(&adapter->phy_task, 0, ixgbe_handle_phy, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif @@ -2136,8 +2238,6 @@ ixgbe_allocate_msix(struct adapter *adapter) } #endif - - for (int i = 0; i < adapter->num_queues; i++, vector++, que++, txr++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, @@ -2187,14 +2287,11 @@ ixgbe_allocate_msix(struct adapter *adapter) "Bound RSS bucket %d to CPU %d\n", i, cpu_id); #else -#if 0 // This is too noisy - device_printf(dev, - "Bound queue %d to cpu %d\n", - i, cpu_id); -#endif + if (bootverbose) + device_printf(dev, + "Bound queue %d to cpu %d\n", + i, cpu_id); #endif - - #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif @@ -2240,6 +2337,7 @@ ixgbe_allocate_msix(struct adapter *adapter) TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); + TASK_INIT(&adapter->phy_task, 0, ixgbe_handle_phy, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif @@ -2465,6 +2563,12 @@ ixgbe_setup_interface(device_t dev, struct adapter *adapter) #if __FreeBSD_version >= 1100036 if_setgetcounterfn(ifp, ixgbe_get_counter); #endif +#if __FreeBSD_version >= 1100045 + /* TSO parameters */ + ifp->if_hw_tsomax = 65518; + ifp->if_hw_tsomaxsegcount = IXGBE_82599_SCATTER; + ifp->if_hw_tsomaxsegsize = 2048; +#endif #ifndef IXGBE_LEGACY_TX ifp->if_transmit = ixgbe_mq_start; ifp->if_qflush = ixgbe_qflush; @@ -2548,10 +2652,6 @@ ixgbe_add_media_types(struct adapter *adapter) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_CX4, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); -#if 0 - if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_LX) - ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_LX, 0, NULL); -#endif /* ** Other (no matching FreeBSD media type): @@ -2560,25 +2660,24 @@ ixgbe_add_media_types(struct adapter *adapter) */ if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) { device_printf(dev, "Media supported: 10GbaseKR\n"); - device_printf(dev, "10GbaseKR mapped to 10baseT\n"); - ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL); + device_printf(dev, "10GbaseKR mapped to 10GbaseSR\n"); + ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_SR, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4) { device_printf(dev, "Media supported: 10GbaseKX4\n"); - device_printf(dev, "10GbaseKX4 mapped to 10base2\n"); - ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_2, 0, NULL); + device_printf(dev, "10GbaseKX4 mapped to 10GbaseCX4\n"); + ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_CX4, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) { device_printf(dev, "Media supported: 1000baseKX\n"); - device_printf(dev, "1000baseKX mapped to 10base5\n"); - ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_5, 0, NULL); + device_printf(dev, "1000baseKX mapped to 1000baseCX\n"); + ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_CX, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_BX) { /* Someday, someone will care about you... */ device_printf(dev, "Media supported: 1000baseBX\n"); } - /* Very old */ if (hw->device_id == IXGBE_DEV_ID_82598AT) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); @@ -2706,7 +2805,8 @@ ixgbe_initialise_rss_mapping(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; uint32_t reta; - int i, j, queue_id; + int i, j, queue_id, table_size; + int index_mult; uint32_t rss_key[10]; uint32_t mrqc; #ifdef RSS @@ -2724,8 +2824,23 @@ ixgbe_initialise_rss_mapping(struct adapter *adapter) arc4rand(&rss_key, sizeof(rss_key), 0); #endif + /* Set multiplier for RETA setup and table size based on MAC */ + index_mult = 0x1; + table_size = 128; + switch (adapter->hw.mac.type) { + case ixgbe_mac_82598EB: + index_mult = 0x11; + break; + case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: + table_size = 512; + break; + default: + break; + } + /* Set up the redirection table */ - for (i = 0, j = 0; i < 128; i++, j++) { + for (i = 0, j = 0; i < table_size; i++, j++) { if (j == adapter->num_queues) j = 0; #ifdef RSS /* @@ -2736,7 +2851,7 @@ ixgbe_initialise_rss_mapping(struct adapter *adapter) queue_id = rss_get_indirection_to_bucket(i); queue_id = queue_id % adapter->num_queues; #else - queue_id = (j * 0x11); + queue_id = (j * index_mult); #endif /* * The low 8 bits are for hash value (n+0); @@ -2745,7 +2860,10 @@ ixgbe_initialise_rss_mapping(struct adapter *adapter) reta = reta >> 8; reta = reta | ( ((uint32_t) queue_id) << 24); if ((i & 3) == 3) { - IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); + if (i < 128) + IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); + else + IXGBE_WRITE_REG(hw, IXGBE_ERETA((i >> 2) - 32), reta); reta = 0; } } @@ -2834,8 +2952,10 @@ ixgbe_initialize_receive_units(struct adapter *adapter) /* Enable broadcasts */ fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL); fctrl |= IXGBE_FCTRL_BAM; - fctrl |= IXGBE_FCTRL_DPF; - fctrl |= IXGBE_FCTRL_PMCF; + if (adapter->hw.mac.type == ixgbe_mac_82598EB) { + fctrl |= IXGBE_FCTRL_DPF; + fctrl |= IXGBE_FCTRL_PMCF; + } IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl); /* Set for Jumbo Frames? */ @@ -3045,30 +3165,37 @@ ixgbe_enable_intr(struct adapter *adapter) mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); /* Enable Fan Failure detection */ if (hw->device_id == IXGBE_DEV_ID_82598AT) - mask |= IXGBE_EIMS_GPI_SDP1_BY_MAC(hw); + mask |= IXGBE_EIMS_GPI_SDP1; switch (adapter->hw.mac.type) { case ixgbe_mac_82599EB: mask |= IXGBE_EIMS_ECC; /* Temperature sensor on some adapters */ - mask |= IXGBE_EIMS_GPI_SDP0_BY_MAC(hw); + mask |= IXGBE_EIMS_GPI_SDP0; /* SFP+ (RX_LOS_N & MOD_ABS_N) */ - mask |= IXGBE_EIMS_GPI_SDP1_BY_MAC(hw); - mask |= IXGBE_EIMS_GPI_SDP2_BY_MAC(hw); + mask |= IXGBE_EIMS_GPI_SDP1; + mask |= IXGBE_EIMS_GPI_SDP2; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif break; case ixgbe_mac_X540: - case ixgbe_mac_X550: - case ixgbe_mac_X550EM_a: - case ixgbe_mac_X550EM_x: /* Detect if Thermal Sensor is enabled */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM); if (fwsm & IXGBE_FWSM_TS_ENABLED) mask |= IXGBE_EIMS_TS; - /* XXX: Which SFP mode line does this look at? */ - if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP) + mask |= IXGBE_EIMS_ECC; +#ifdef IXGBE_FDIR + mask |= IXGBE_EIMS_FLOW_DIR; +#endif + break; + case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: + /* MAC thermal sensor is automatically enabled */ + mask |= IXGBE_EIMS_TS; + /* Some devices use SDP0 for important information */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP || + hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) mask |= IXGBE_EIMS_GPI_SDP0_BY_MAC(hw); mask |= IXGBE_EIMS_ECC; #ifdef IXGBE_FDIR @@ -3081,7 +3208,7 @@ ixgbe_enable_intr(struct adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); - /* With RSS we use auto clear */ + /* With MSI-X we use auto clear */ if (adapter->msix_mem) { mask = IXGBE_EIMS_ENABLE_MASK; /* Don't autoclear Link */ @@ -3135,10 +3262,12 @@ ixgbe_get_slot_info(struct ixgbe_hw *hw) if (hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) { ixgbe_get_bus_info(hw); /* These devices don't use PCI-E */ - if (hw->mac.type == ixgbe_mac_X550EM_x - || hw->mac.type == ixgbe_mac_X550EM_a) + switch (hw->mac.type) { + case ixgbe_mac_X550EM_x: return; - goto display; + default: + goto display; + } } /* @@ -3260,7 +3389,6 @@ ixgbe_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: - case ixgbe_mac_X550EM_a: case ixgbe_mac_X550EM_x: if (type == -1) { /* MISC IVAR */ index = (entry & 1) * 8; @@ -3289,8 +3417,14 @@ ixgbe_configure_ivars(struct adapter *adapter) if (ixgbe_max_interrupt_rate > 0) newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8; - else + else { + /* + ** Disable DMA coalescing if interrupt moderation is + ** disabled. + */ + adapter->dmac = 0; newitr = 0; + } for (int i = 0; i < adapter->num_queues; i++, que++) { /* First the RX queue entry */ @@ -3350,7 +3484,7 @@ ixgbe_handle_link(void *context, int pending) ixgbe_check_link(&adapter->hw, &adapter->link_speed, &adapter->link_up, 0); - ixgbe_update_link_status(adapter); + ixgbe_update_link_status(adapter); } /* @@ -3410,6 +3544,28 @@ ixgbe_handle_msf(void *context, int pending) return; } +/* +** Tasklet for handling interrupts from an external PHY +*/ +static void +ixgbe_handle_phy(void *context, int pending) +{ + struct adapter *adapter = context; + struct ixgbe_hw *hw = &adapter->hw; + int error; + + error = hw->phy.ops.handle_lasi(hw); + if (error == IXGBE_ERR_OVERTEMP) + device_printf(adapter->dev, + "CRITICAL: EXTERNAL PHY OVER TEMP!! " + " PHY will downshift to lower power state!\n"); + else if (error) + device_printf(adapter->dev, + "Error handling LASI interrupt: %d\n", + error); + return; +} + #ifdef IXGBE_FDIR /* ** Tasklet for reinitializing the Flow Director filter table @@ -3432,6 +3588,127 @@ ixgbe_reinit_fdir(void *context, int pending) } #endif +/********************************************************************* + * + * Configure DMA Coalescing + * + **********************************************************************/ +static void +ixgbe_config_dmac(struct adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_dmac_config *dcfg = &hw->mac.dmac_config; + + if (hw->mac.type < ixgbe_mac_X550 || + !hw->mac.ops.dmac_config) + return; + + if (dcfg->watchdog_timer ^ adapter->dmac || + dcfg->link_speed ^ adapter->link_speed) { + dcfg->watchdog_timer = adapter->dmac; + dcfg->fcoe_en = false; + dcfg->link_speed = adapter->link_speed; + dcfg->num_tcs = 1; + + INIT_DEBUGOUT2("dmac settings: watchdog %d, link speed %d\n", + dcfg->watchdog_timer, dcfg->link_speed); + + hw->mac.ops.dmac_config(hw); + } +} + +/* + * Checks whether the adapter supports Energy Efficient Ethernet + * or not, based on device ID. + */ +static void +ixgbe_check_eee_support(struct adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + adapter->eee_support = adapter->eee_enabled = + (hw->device_id == IXGBE_DEV_ID_X550T || + hw->device_id == IXGBE_DEV_ID_X550EM_X_KR); +} + +/* + * Checks whether the adapter's ports are capable of + * Wake On LAN by reading the adapter's NVM. + * + * Sets each port's hw->wol_enabled value depending + * on the value read here. + */ +static void +ixgbe_check_wol_support(struct adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + u16 dev_caps = 0; + + /* Find out WoL support for port */ + adapter->wol_support = hw->wol_enabled = 0; + ixgbe_get_device_caps(hw, &dev_caps); + if ((dev_caps & IXGBE_DEVICE_CAPS_WOL_PORT0_1) || + ((dev_caps & IXGBE_DEVICE_CAPS_WOL_PORT0) && + hw->bus.func == 0)) + adapter->wol_support = hw->wol_enabled = 1; + + /* Save initial wake up filter configuration */ + adapter->wufc = IXGBE_READ_REG(hw, IXGBE_WUFC); + + return; +} + +/* + * Prepare the adapter/port for LPLU and/or WoL + */ +static int +ixgbe_setup_low_power_mode(struct adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + device_t dev = adapter->dev; + s32 error = 0; + + mtx_assert(&adapter->core_mtx, MA_OWNED); + + /* Limit power management flow to X550EM baseT */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T + && hw->phy.ops.enter_lplu) { + /* Turn off support for APM wakeup. (Using ACPI instead) */ + IXGBE_WRITE_REG(hw, IXGBE_GRC, + IXGBE_READ_REG(hw, IXGBE_GRC) & ~(u32)2); + + /* + * Clear Wake Up Status register to prevent any previous wakeup + * events from waking us up immediately after we suspend. + */ + IXGBE_WRITE_REG(hw, IXGBE_WUS, 0xffffffff); + + /* + * Program the Wakeup Filter Control register with user filter + * settings + */ + IXGBE_WRITE_REG(hw, IXGBE_WUFC, adapter->wufc); + + /* Enable wakeups and power management in Wakeup Control */ + IXGBE_WRITE_REG(hw, IXGBE_WUC, + IXGBE_WUC_WKEN | IXGBE_WUC_PME_EN); + + /* X550EM baseT adapters need a special LPLU flow */ + hw->phy.reset_disable = true; + ixgbe_stop(adapter); + error = hw->phy.ops.enter_lplu(hw); + if (error) + device_printf(dev, + "Error entering LPLU: %d\n", error); + hw->phy.reset_disable = false; + } else { + /* Just stop for other adapters */ + ixgbe_stop(adapter); + } + + return error; +} + /********************************************************************** * * Update the board statistics counters. @@ -3449,42 +3726,6 @@ ixgbe_update_stats_counters(struct adapter *adapter) adapter->stats.pf.errbc += IXGBE_READ_REG(hw, IXGBE_ERRBC); adapter->stats.pf.mspdc += IXGBE_READ_REG(hw, IXGBE_MSPDC); - /* - ** Note: these are for the 8 possible traffic classes, - ** which in current implementation is unused, - ** therefore only 0 should read real data. - */ - for (int i = 0; i < 8; i++) { - u32 mp; - mp = IXGBE_READ_REG(hw, IXGBE_MPC(i)); - /* missed_rx tallies misses for the gprc workaround */ - missed_rx += mp; - /* global total per queue */ - adapter->stats.pf.mpc[i] += mp; - /* total for stats display */ - total_missed_rx += adapter->stats.pf.mpc[i]; - if (hw->mac.type == ixgbe_mac_82598EB) { - adapter->stats.pf.rnbc[i] += - IXGBE_READ_REG(hw, IXGBE_RNBC(i)); - adapter->stats.pf.qbtc[i] += - IXGBE_READ_REG(hw, IXGBE_QBTC(i)); - adapter->stats.pf.qbrc[i] += - IXGBE_READ_REG(hw, IXGBE_QBRC(i)); - adapter->stats.pf.pxonrxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXONRXC(i)); - } else - adapter->stats.pf.pxonrxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXONRXCNT(i)); - adapter->stats.pf.pxontxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXONTXC(i)); - adapter->stats.pf.pxofftxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(i)); - if (hw->mac.type != ixgbe_mac_X550EM_x) - adapter->stats.pf.pxoffrxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(i)); - adapter->stats.pf.pxon2offc[i] += - IXGBE_READ_REG(hw, IXGBE_PXON2OFFCNT(i)); - } for (int i = 0; i < 16; i++) { adapter->stats.pf.qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); adapter->stats.pf.qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); @@ -3592,6 +3833,8 @@ static uint64_t ixgbe_get_counter(struct ifnet *ifp, ift_counter cnt) { struct adapter *adapter; + struct tx_ring *txr; + uint64_t rv; adapter = if_getsoftc(ifp); @@ -3612,6 +3855,12 @@ ixgbe_get_counter(struct ifnet *ifp, ift_counter cnt) return (0); case IFCOUNTER_IQDROPS: return (adapter->iqdrops); + case IFCOUNTER_OQDROPS: + rv = 0; + txr = adapter->tx_rings; + for (int i = 0; i < adapter->num_queues; i++, txr++) + rv += txr->br->br_drops; + return (rv); case IFCOUNTER_IERRORS: return (adapter->ierrors); default: @@ -3720,6 +3969,108 @@ ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) return 0; } +static void +ixgbe_add_device_sysctls(struct adapter *adapter) +{ + device_t dev = adapter->dev; + struct ixgbe_hw *hw = &adapter->hw; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + /* Sysctls for all devices */ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "fc", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_set_flowcntl, "I", IXGBE_SYSCTL_DESC_SET_FC); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "enable_aim", + CTLFLAG_RW, + &ixgbe_enable_aim, 1, "Interrupt Moderation"); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "advertise_speed", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_set_advertise, "I", IXGBE_SYSCTL_DESC_ADV_SPEED); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "thermal_test", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_sysctl_thermal_test, "I", "Thermal Test"); + + /* for X550 devices */ + if (hw->mac.type >= ixgbe_mac_X550) + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "dmac", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_sysctl_dmac, "I", "DMA Coalesce"); + + /* for X550T and X550EM backplane devices */ + if (hw->device_id == IXGBE_DEV_ID_X550T || + hw->device_id == IXGBE_DEV_ID_X550EM_X_KR) { + struct sysctl_oid *eee_node; + struct sysctl_oid_list *eee_list; + + eee_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eee", + CTLFLAG_RD, NULL, + "Energy Efficient Ethernet sysctls"); + eee_list = SYSCTL_CHILDREN(eee_node); + + SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "enable", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_sysctl_eee_enable, "I", + "Enable or Disable EEE"); + + SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "negotiated", + CTLTYPE_INT | CTLFLAG_RD, adapter, 0, + ixgbe_sysctl_eee_negotiated, "I", + "EEE negotiated on link"); + + SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "tx_lpi_status", + CTLTYPE_INT | CTLFLAG_RD, adapter, 0, + ixgbe_sysctl_eee_tx_lpi_status, "I", + "Whether or not TX link is in LPI state"); + + SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "rx_lpi_status", + CTLTYPE_INT | CTLFLAG_RD, adapter, 0, + ixgbe_sysctl_eee_rx_lpi_status, "I", + "Whether or not RX link is in LPI state"); + } + + /* for certain 10GBaseT devices */ + if (hw->device_id == IXGBE_DEV_ID_X550T || + hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "wol_enable", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_sysctl_wol_enable, "I", + "Enable/Disable Wake on LAN"); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "wufc", + CTLTYPE_INT | CTLFLAG_RW, adapter, 0, + ixgbe_sysctl_wufc, "I", + "Enable/Disable Wake Up Filters"); + } + + /* for X550EM 10GBaseT devices */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { + struct sysctl_oid *phy_node; + struct sysctl_oid_list *phy_list; + + phy_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "phy", + CTLFLAG_RD, NULL, + "External PHY sysctls"); + phy_list = SYSCTL_CHILDREN(phy_node); + + SYSCTL_ADD_PROC(ctx, phy_list, OID_AUTO, "temp", + CTLTYPE_INT | CTLFLAG_RD, adapter, 0, + ixgbe_sysctl_phy_temp, "I", + "Current External PHY Temperature (Celsius)"); + + SYSCTL_ADD_PROC(ctx, phy_list, OID_AUTO, "overtemp_occurred", + CTLTYPE_INT | CTLFLAG_RD, adapter, 0, + ixgbe_sysctl_phy_overtemp_occurred, "I", + "External PHY High Temperature Event Occurred"); + } +} + /* * Add sysctl variables, one per statistic, to the system. */ @@ -3753,7 +4104,7 @@ ixgbe_add_hw_stats(struct adapter *adapter) CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", - CTLFLAG_RD, &adapter->vector_irq, + CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); for (int i = 0; i < adapter->num_queues; i++, txr++) { @@ -3790,6 +4141,9 @@ ixgbe_add_hw_stats(struct adapter *adapter) SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); + SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "br_drops", + CTLFLAG_RD, &txr->br->br_drops, + "Packets dropped in buf_ring"); } for (int i = 0; i < adapter->num_queues; i++, rxr++) { @@ -4082,20 +4436,77 @@ ixgbe_set_advertise(SYSCTL_HANDLER_ARGS) } /* -** Thermal Shutdown Trigger -** - cause a Thermal Overtemp IRQ -** - this now requires firmware enabling -*/ + * The following two sysctls are for X550 BaseT devices; + * they deal with the external PHY used in them. + */ static int -ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS) +ixgbe_sysctl_phy_temp(SYSCTL_HANDLER_ARGS) { - int error, fire = 0; struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; + u16 reg; + if (hw->device_id != IXGBE_DEV_ID_X550EM_X_10G_T) { + device_printf(adapter->dev, + "Device has no supported external thermal sensor.\n"); + return (ENODEV); + } - if (hw->mac.type < ixgbe_mac_X540) - return (0); + if (hw->phy.ops.read_reg(hw, IXGBE_PHY_CURRENT_TEMP, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®)) { + device_printf(adapter->dev, + "Error reading from PHY's current temperature register\n"); + return (EAGAIN); + } + + /* Shift temp for output */ + reg = reg >> 8; + + return (sysctl_handle_int(oidp, NULL, reg, req)); +} + +/* + * Reports whether the current PHY temperature is over + * the overtemp threshold. + * - This is reported directly from the PHY + */ +static int +ixgbe_sysctl_phy_overtemp_occurred(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + u16 reg; + + if (hw->device_id != IXGBE_DEV_ID_X550EM_X_10G_T) { + device_printf(adapter->dev, + "Device has no supported external thermal sensor.\n"); + return (ENODEV); + } + + if (hw->phy.ops.read_reg(hw, IXGBE_PHY_OVERTEMP_STATUS, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®)) { + device_printf(adapter->dev, + "Error reading from PHY's temperature status register\n"); + return (EAGAIN); + } + + /* Get occurrence bit */ + reg = !!(reg & 0x4000); + return (sysctl_handle_int(oidp, 0, reg, req)); +} + +/* +** Thermal Shutdown Trigger (internal MAC) +** - Set this to 1 to cause an overtemp event to occur +*/ +static int +ixgbe_sysctl_thermal_test(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + int error, fire = 0; error = sysctl_handle_int(oidp, &fire, 0, req); if ((error) || (req->newptr == NULL)) @@ -4111,6 +4522,223 @@ ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS) } /* +** Manage DMA Coalescing. +** Control values: +** 0/1 - off / on (use default value of 1000) +** +** Legal timer values are: +** 50,100,250,500,1000,2000,5000,10000 +** +** Turning off interrupt moderation will also turn this off. +*/ +static int +ixgbe_sysctl_dmac(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + struct ifnet *ifp = adapter->ifp; + int error; + u16 oldval; + + oldval = adapter->dmac; + error = sysctl_handle_int(oidp, &adapter->dmac, 0, req); + if ((error) || (req->newptr == NULL)) + return (error); + + switch (hw->mac.type) { + case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: + break; + default: + device_printf(adapter->dev, + "DMA Coalescing is only supported on X550 devices\n"); + return (ENODEV); + } + + switch (adapter->dmac) { + case 0: + /* Disabled */ + break; + case 1: /* Enable and use default */ + adapter->dmac = 1000; + break; + case 50: + case 100: + case 250: + case 500: + case 1000: + case 2000: + case 5000: + case 10000: + /* Legal values - allow */ + break; + default: + /* Do nothing, illegal value */ + adapter->dmac = oldval; + return (EINVAL); + } + + /* Re-initialize hardware if it's already running */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + ixgbe_init(adapter); + + return (0); +} + +/* + * Sysctl to enable/disable the WoL capability, if supported by the adapter. + * Values: + * 0 - disabled + * 1 - enabled + */ +static int +ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + int new_wol_enabled; + int error = 0; + + new_wol_enabled = hw->wol_enabled; + error = sysctl_handle_int(oidp, &new_wol_enabled, 0, req); + if ((error) || (req->newptr == NULL)) + return (error); + if (new_wol_enabled == hw->wol_enabled) + return (0); + + if (new_wol_enabled > 0 && !adapter->wol_support) + return (ENODEV); + else + hw->wol_enabled = !!(new_wol_enabled); + + return (0); +} + +/* + * Sysctl to enable/disable the Energy Efficient Ethernet capability, + * if supported by the adapter. + * Values: + * 0 - disabled + * 1 - enabled + */ +static int +ixgbe_sysctl_eee_enable(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ifnet *ifp = adapter->ifp; + int new_eee_enabled, error = 0; + + new_eee_enabled = adapter->eee_enabled; + error = sysctl_handle_int(oidp, &new_eee_enabled, 0, req); + if ((error) || (req->newptr == NULL)) + return (error); + if (new_eee_enabled == adapter->eee_enabled) + return (0); + + if (new_eee_enabled > 0 && !adapter->eee_support) + return (ENODEV); + else + adapter->eee_enabled = !!(new_eee_enabled); + + /* Re-initialize hardware if it's already running */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + ixgbe_init(adapter); + + return (0); +} + +/* + * Read-only sysctl indicating whether EEE support was negotiated + * on the link. + */ +static int +ixgbe_sysctl_eee_negotiated(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + bool status; + + status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & IXGBE_EEE_STAT_NEG); + + return (sysctl_handle_int(oidp, 0, status, req)); +} + +/* + * Read-only sysctl indicating whether RX Link is in LPI state. + */ +static int +ixgbe_sysctl_eee_rx_lpi_status(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + bool status; + + status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & + IXGBE_EEE_RX_LPI_STATUS); + + return (sysctl_handle_int(oidp, 0, status, req)); +} + +/* + * Read-only sysctl indicating whether TX Link is in LPI state. + */ +static int +ixgbe_sysctl_eee_tx_lpi_status(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + struct ixgbe_hw *hw = &adapter->hw; + bool status; + + status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & + IXGBE_EEE_TX_LPI_STATUS); + + return (sysctl_handle_int(oidp, 0, status, req)); +} + +/* + * Sysctl to enable/disable the types of packets that the + * adapter will wake up on upon receipt. + * WUFC - Wake Up Filter Control + * Flags: + * 0x1 - Link Status Change + * 0x2 - Magic Packet + * 0x4 - Direct Exact + * 0x8 - Directed Multicast + * 0x10 - Broadcast + * 0x20 - ARP/IPv4 Request Packet + * 0x40 - Direct IPv4 Packet + * 0x80 - Direct IPv6 Packet + * + * Setting another flag will cause the sysctl to return an + * error. + */ +static int +ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + int error = 0; + u32 new_wufc; + + new_wufc = adapter->wufc; + + error = sysctl_handle_int(oidp, &new_wufc, 0, req); + if ((error) || (req->newptr == NULL)) + return (error); + if (new_wufc == adapter->wufc) + return (0); + + if (new_wufc & 0xffffff00) + return (EINVAL); + else { + new_wufc &= 0xff; + new_wufc |= (0xffffff & adapter->wufc); + adapter->wufc = new_wufc; + } + + return (0); +} + +/* ** Enable the hardware to drop packets when the buffer is ** full. This is useful when multiqueue,so that no single ** queue being full stalls the entire RX engine. We only @@ -4154,6 +4782,7 @@ ixgbe_rearm_queues(struct adapter *adapter, u64 queues) case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: + case ixgbe_mac_X550EM_x: mask = (queues & 0xFFFFFFFF); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EICS_EX(0), mask); mask = (queues >> 32); diff --git a/sys/dev/ixgbe/if_ixv.c b/sys/dev/ixgbe/if_ixv.c index 672077b..a550a85 100644 --- a/sys/dev/ixgbe/if_ixv.c +++ b/sys/dev/ixgbe/if_ixv.c @@ -60,7 +60,6 @@ static ixgbe_vendor_info_t ixv_vendor_info_array[] = {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550_VF, 0, 0, 0}, - {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_A_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_VF, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} @@ -882,7 +881,7 @@ ixv_msix_mbx(void *arg) struct ixgbe_hw *hw = &adapter->hw; u32 reg; - ++adapter->vector_irq; + ++adapter->link_irq; /* First get the cause */ reg = IXGBE_READ_REG(hw, IXGBE_VTEICS); @@ -2034,8 +2033,8 @@ ixv_add_stats_sysctls(struct adapter *adapter) SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &(txr->total_packets), "TX Packets"); - SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_bytes", - CTLFLAG_RD, &(txr->tx_bytes), + SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "tx_bytes", + CTLFLAG_RD, &(txr->bytes), 0, "TX Bytes"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_no_desc", CTLFLAG_RD, &(txr->no_desc_avail), @@ -2083,7 +2082,7 @@ ixv_print_debug_info(struct adapter *adapter) } device_printf(dev,"MBX IRQ Handled: %lu\n", - (long)adapter->vector_irq); + (long)adapter->link_irq); return; } diff --git a/sys/dev/ixgbe/ix_txrx.c b/sys/dev/ixgbe/ix_txrx.c index 6c9c84a..bfe1607 100644 --- a/sys/dev/ixgbe/ix_txrx.c +++ b/sys/dev/ixgbe/ix_txrx.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -210,7 +210,11 @@ ixgbe_mq_start(struct ifnet *ifp, struct mbuf *m) * If everything is setup correctly, it should be the * same bucket that the current CPU we're on is. */ +#if __FreeBSD_version < 1100054 + if (m->m_flags & M_FLOWID) { +#else if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { +#endif #ifdef RSS if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bucket_id) == 0) @@ -276,7 +280,12 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) enqueued++; #if 0 // this is VF-only #if __FreeBSD_version >= 1100036 - if (next->m_flags & M_MCAST) + /* + * Since we're looking at the tx ring, we can check + * to see if we're a VF by examing our tail register + * address. + */ + if (txr->tail < IXGBE_TDT(0) && next->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); #endif #endif @@ -312,8 +321,8 @@ ixgbe_deferred_mq_start(void *arg, int pending) } /* -** Flush all ring buffers -*/ + * Flush all ring buffers + */ void ixgbe_qflush(struct ifnet *ifp) { @@ -387,6 +396,10 @@ retry: /* Try it again? - one try */ if (remap == TRUE) { remap = FALSE; + /* + * XXX: m_defrag will choke on + * non-MCLBYTES-sized clusters + */ m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; @@ -418,9 +431,9 @@ retry: m_head = *m_headp; /* - ** Set up the appropriate offload context - ** this will consume the first descriptor - */ + * Set up the appropriate offload context + * this will consume the first descriptor + */ error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); if (__predict_false(error)) { if (error == ENOBUFS) @@ -439,7 +452,6 @@ retry: } #endif - olinfo_status |= IXGBE_ADVTXD_CC; i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; @@ -466,11 +478,11 @@ retry: txbuf->m_head = m_head; /* - ** Here we swap the map so the last descriptor, - ** which gets the completion interrupt has the - ** real map, and the first descriptor gets the - ** unused map from this descriptor. - */ + * Here we swap the map so the last descriptor, + * which gets the completion interrupt has the + * real map, and the first descriptor gets the + * unused map from this descriptor. + */ txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); @@ -493,7 +505,6 @@ retry: txr->busy = 1; return (0); - } @@ -732,6 +743,7 @@ static int ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { + struct adapter *adapter = txr->adapter; struct ixgbe_adv_tx_context_desc *TXD; struct ether_vlan_header *eh; struct ip *ip; @@ -766,6 +778,8 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } + else if (!IXGBE_IS_X550VF(adapter) && (offload == FALSE)) + return (0); /* * Determine where frame payload starts. @@ -1727,9 +1741,6 @@ ixgbe_rx_discard(struct rx_ring *rxr, int i) * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * - * We loop at most count times if count is > 0, or until done if - * count < 0. - * * Return TRUE for more work, FALSE for all clean. *********************************************************************/ bool @@ -1792,10 +1803,9 @@ ixgbe_rxeof(struct ix_queue *que) /* Make sure bad packets are discarded */ if (eop && (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) != 0) { -#if 0 // VF-only #if __FreeBSD_version >= 1100036 - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); -#endif + if (IXGBE_IS_VF(adapter)) + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); #endif rxr->rx_discarded++; ixgbe_rx_discard(rxr, i); @@ -1906,6 +1916,9 @@ ixgbe_rxeof(struct ix_queue *que) #ifdef RSS sendmp->m_pkthdr.flowid = le32toh(cur->wb.lower.hi_dword.rss); +#if __FreeBSD_version < 1100054 + sendmp->m_flags |= M_FLOWID; +#endif switch (pkt_info & IXGBE_RXDADV_RSSTYPE_MASK) { case IXGBE_RXDADV_RSSTYPE_IPV4_TCP: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_TCP_IPV4); @@ -1939,7 +1952,11 @@ ixgbe_rxeof(struct ix_queue *que) } #else /* RSS */ sendmp->m_pkthdr.flowid = que->msix; +#if __FreeBSD_version >= 1100054 M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE); +#else + sendmp->m_flags |= M_FLOWID; +#endif #endif /* RSS */ #endif /* FreeBSD_version */ } diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h index 813ee4f..44e7c3c 100644 --- a/sys/dev/ixgbe/ixgbe.h +++ b/sys/dev/ixgbe/ixgbe.h @@ -90,8 +90,11 @@ #include <sys/pcpu.h> #include <sys/smp.h> #include <machine/smp.h> +#include <sys/sbuf.h> #include "ixgbe_api.h" +#include "ixgbe_common.h" +#include "ixgbe_phy.h" #include "ixgbe_vf.h" /* Tunables */ @@ -146,7 +149,11 @@ #define IXGBE_TX_CLEANUP_THRESHOLD (adapter->num_tx_desc / 8) #define IXGBE_TX_OP_THRESHOLD (adapter->num_tx_desc / 32) -#define IXGBE_MAX_FRAME_SIZE 0x3F00 +/* These defines are used in MTU calculations */ +#define IXGBE_MAX_FRAME_SIZE 9728 +#define IXGBE_MTU_HDR (ETHER_HDR_LEN + ETHER_CRC_LEN + \ + ETHER_VLAN_ENCAP_LEN) +#define IXGBE_MAX_MTU (IXGBE_MAX_FRAME_SIZE - IXGBE_MTU_HDR) /* Flow control constants */ #define IXGBE_FC_PAUSE 0xFFFF @@ -227,6 +234,17 @@ #define IXGBE_BULK_LATENCY 1200 #define IXGBE_LINK_ITR 2000 +/* MAC type macros */ +#define IXGBE_IS_X550VF(_adapter) \ + ((_adapter->hw.mac.type == ixgbe_mac_X550_vf) || \ + (_adapter->hw.mac.type == ixgbe_mac_X550EM_x_vf)) + +#define IXGBE_IS_VF(_adapter) \ + (IXGBE_IS_X550VF(_adapter) || \ + (_adapter->hw.mac.type == ixgbe_mac_X540_vf) || \ + (_adapter->hw.mac.type == ixgbe_mac_82599_vf)) + + /* ***************************************************************************** * vendor_info_array @@ -323,8 +341,8 @@ struct tx_ring { u32 bytes; /* used for AIM */ u32 packets; /* Soft Stats */ - u64 tx_bytes; unsigned long tso_tx; + unsigned long no_tx_map_avail; unsigned long no_tx_dma_setup; u64 no_desc_avail; u64 total_packets; @@ -419,6 +437,13 @@ struct adapter { u32 link_speed; bool link_up; u32 vector; + u16 dmac; + bool eee_support; + bool eee_enabled; + + /* Power management-related */ + bool wol_support; + u32 wufc; /* Mbuf cluster size */ u32 rx_mbuf_sz; @@ -432,6 +457,7 @@ struct adapter { int fdir_reinit; struct task fdir_task; #endif + struct task phy_task; /* PHY intr tasklet */ struct taskqueue *tq; /* @@ -467,7 +493,7 @@ struct adapter { unsigned long mbuf_header_failed; unsigned long mbuf_packet_failed; unsigned long watchdog_events; - unsigned long vector_irq; + unsigned long link_irq; union { struct ixgbe_hw_stats pf; struct ixgbevf_hw_stats vf; @@ -540,12 +566,17 @@ struct adapter { #define IXGBE_SET_IQDROPS(sc, count) (sc)->ifp->if_iqdrops = (count) #endif +/* External PHY register addresses */ +#define IXGBE_PHY_CURRENT_TEMP 0xC820 +#define IXGBE_PHY_OVERTEMP_STATUS 0xC830 + /* Sysctl help messages; displayed with sysctl -d */ #define IXGBE_SYSCTL_DESC_ADV_SPEED \ "\nControl advertised link speed using these flags:\n" \ "\t0x1 - advertise 100M\n" \ "\t0x2 - advertise 1G\n" \ - "\t0x4 - advertise 10G" + "\t0x4 - advertise 10G\n\n" \ + "\t100M is only supported on certain 10GBaseT adapters.\n" #define IXGBE_SYSCTL_DESC_SET_FC \ "\nSet flow control mode using these values:\n" \ diff --git a/sys/dev/ixgbe/ixgbe_82598.c b/sys/dev/ixgbe/ixgbe_82598.c index d9b8985..46e64c5 100644 --- a/sys/dev/ixgbe/ixgbe_82598.c +++ b/sys/dev/ixgbe/ixgbe_82598.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -260,6 +260,8 @@ s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) DEBUGFUNC("ixgbe_start_hw_82598"); ret_val = ixgbe_start_hw_generic(hw); + if (ret_val) + return ret_val; /* Disable relaxed ordering */ for (i = 0; ((i < hw->mac.max_tx_queues) && @@ -278,8 +280,7 @@ s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) } /* set the completion timeout for interface */ - if (ret_val == IXGBE_SUCCESS) - ixgbe_set_pcie_completion_timeout(hw); + ixgbe_set_pcie_completion_timeout(hw); return ret_val; } diff --git a/sys/dev/ixgbe/ixgbe_82598.h b/sys/dev/ixgbe/ixgbe_82598.h index 621be41..d2241c7 100644 --- a/sys/dev/ixgbe/ixgbe_82598.h +++ b/sys/dev/ixgbe/ixgbe_82598.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_82599.c b/sys/dev/ixgbe/ixgbe_82599.c index d49d851..b38620f 100644 --- a/sys/dev/ixgbe/ixgbe_82599.c +++ b/sys/dev/ixgbe/ixgbe_82599.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -421,6 +421,8 @@ s32 ixgbe_get_link_capabilities_82599(struct ixgbe_hw *hw, /* Check if 1G SFP module. */ if (hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core0 || hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core1 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core1 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core0 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1) { *speed = IXGBE_LINK_SPEED_1GB_FULL; @@ -1803,7 +1805,6 @@ s32 ixgbe_fdir_set_input_mask_82599(struct ixgbe_hw *hw, switch (hw->mac.type) { case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: IXGBE_WRITE_REG(hw, IXGBE_FDIRSCTPM, ~fdirtcpm); break; default: @@ -2465,7 +2466,6 @@ reset_pipeline_out: return ret_val; } - /** * ixgbe_read_i2c_byte_82599 - Reads 8 bit word over I2C * @hw: pointer to hardware structure diff --git a/sys/dev/ixgbe/ixgbe_82599.h b/sys/dev/ixgbe/ixgbe_82599.h index 8c973ac..bcfb043 100644 --- a/sys/dev/ixgbe/ixgbe_82599.h +++ b/sys/dev/ixgbe/ixgbe_82599.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_api.c b/sys/dev/ixgbe/ixgbe_api.c index 2535664..9784e3c 100644 --- a/sys/dev/ixgbe/ixgbe_api.c +++ b/sys/dev/ixgbe/ixgbe_api.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,6 +35,22 @@ #include "ixgbe_api.h" #include "ixgbe_common.h" +static const u32 ixgbe_mvals_base[IXGBE_MVALS_IDX_LIMIT] = { + IXGBE_MVALS_INIT() +}; + +static const u32 ixgbe_mvals_X540[IXGBE_MVALS_IDX_LIMIT] = { + IXGBE_MVALS_INIT(_X540) +}; + +static const u32 ixgbe_mvals_X550[IXGBE_MVALS_IDX_LIMIT] = { + IXGBE_MVALS_INIT(_X550) +}; + +static const u32 ixgbe_mvals_X550EM_x[IXGBE_MVALS_IDX_LIMIT] = { + IXGBE_MVALS_INIT(_X550EM_x) +}; + /** * ixgbe_dcb_get_rtrup2tc - read rtrup2tc reg * @hw: pointer to hardware structure @@ -81,20 +97,16 @@ s32 ixgbe_init_shared_code(struct ixgbe_hw *hw) case ixgbe_mac_X540: status = ixgbe_init_ops_X540(hw); break; -#if 0 //JFV temporary disable case ixgbe_mac_X550: status = ixgbe_init_ops_X550(hw); break; case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: status = ixgbe_init_ops_X550EM(hw); break; -#endif case ixgbe_mac_82599_vf: case ixgbe_mac_X540_vf: case ixgbe_mac_X550_vf: case ixgbe_mac_X550EM_x_vf: - case ixgbe_mac_X550EM_a_vf: status = ixgbe_init_ops_vf(hw); break; default: @@ -124,6 +136,8 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw) return IXGBE_ERR_DEVICE_NOT_SUPPORTED; } + hw->mvals = ixgbe_mvals_base; + switch (hw->device_id) { case IXGBE_DEV_ID_82598: case IXGBE_DEV_ID_82598_BX: @@ -164,14 +178,17 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw) case IXGBE_DEV_ID_X540_VF: case IXGBE_DEV_ID_X540_VF_HV: hw->mac.type = ixgbe_mac_X540_vf; + hw->mvals = ixgbe_mvals_X540; break; case IXGBE_DEV_ID_X540T: case IXGBE_DEV_ID_X540T1: case IXGBE_DEV_ID_X540_BYPASS: hw->mac.type = ixgbe_mac_X540; + hw->mvals = ixgbe_mvals_X540; break; case IXGBE_DEV_ID_X550T: hw->mac.type = ixgbe_mac_X550; + hw->mvals = ixgbe_mvals_X550; break; case IXGBE_DEV_ID_X550EM_X_KX4: case IXGBE_DEV_ID_X550EM_X_KR: @@ -179,21 +196,17 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw) case IXGBE_DEV_ID_X550EM_X_1G_T: case IXGBE_DEV_ID_X550EM_X_SFP: hw->mac.type = ixgbe_mac_X550EM_x; - break; - case IXGBE_DEV_ID_X550EM_A_KR: - hw->mac.type = ixgbe_mac_X550EM_a; + hw->mvals = ixgbe_mvals_X550EM_x; break; case IXGBE_DEV_ID_X550_VF: case IXGBE_DEV_ID_X550_VF_HV: hw->mac.type = ixgbe_mac_X550_vf; + hw->mvals = ixgbe_mvals_X550; break; case IXGBE_DEV_ID_X550EM_X_VF: case IXGBE_DEV_ID_X550EM_X_VF_HV: hw->mac.type = ixgbe_mac_X550EM_x_vf; - break; - case IXGBE_DEV_ID_X550EM_A_VF: - case IXGBE_DEV_ID_X550EM_A_VF_HV: - hw->mac.type = ixgbe_mac_X550EM_a_vf; + hw->mvals = ixgbe_mvals_X550EM_x; break; default: ret_val = IXGBE_ERR_DEVICE_NOT_SUPPORTED; @@ -1283,6 +1296,23 @@ s32 ixgbe_enter_lplu(struct ixgbe_hw *hw) } /** + * ixgbe_handle_lasi - Handle external Base T PHY interrupt + * @hw: pointer to hardware structure + * + * Handle external Base T PHY interrupt. If high temperature + * failure alarm then return error, else if link status change + * then setup internal/external PHY link + * + * Return IXGBE_ERR_OVERTEMP if interrupt is high temperature + * failure alarm, else return PHY access status. + */ +s32 ixgbe_handle_lasi(struct ixgbe_hw *hw) +{ + return ixgbe_call_func(hw, hw->phy.ops.handle_lasi, (hw), + IXGBE_NOT_IMPLEMENTED); +} + +/** * ixgbe_read_analog_reg8 - Reads 8 bit analog register * @hw: pointer to hardware structure * @reg: analog register to read @@ -1340,6 +1370,23 @@ s32 ixgbe_read_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, } /** + * ixgbe_read_i2c_byte_unlocked - Reads 8 bit word via I2C from device address + * @hw: pointer to hardware structure + * @byte_offset: byte offset to read + * @dev_addr: I2C bus address to read from + * @data: value read + * + * Performs byte read operation to SFP module's EEPROM over I2C interface. + **/ +s32 ixgbe_read_i2c_byte_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data) +{ + return ixgbe_call_func(hw, hw->phy.ops.read_i2c_byte_unlocked, + (hw, byte_offset, dev_addr, data), + IXGBE_NOT_IMPLEMENTED); +} + +/** * ixgbe_read_i2c_combined - Perform I2C read combined operation * @hw: pointer to the hardware structure * @addr: I2C bus address to read from @@ -1355,6 +1402,23 @@ s32 ixgbe_read_i2c_combined(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 *val) } /** + * ixgbe_read_i2c_combined_unlocked - Perform I2C read combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to read from + * @reg: I2C device register to read from + * @val: pointer to location to receive read value + * + * Returns an error code on error. + **/ +s32 ixgbe_read_i2c_combined_unlocked(struct ixgbe_hw *hw, u8 addr, u16 reg, + u16 *val) +{ + return ixgbe_call_func(hw, hw->phy.ops.read_i2c_combined_unlocked, + (hw, addr, reg, val), + IXGBE_NOT_IMPLEMENTED); +} + +/** * ixgbe_write_i2c_byte - Writes 8 bit word over I2C * @hw: pointer to hardware structure * @byte_offset: byte offset to write @@ -1372,6 +1436,24 @@ s32 ixgbe_write_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, } /** + * ixgbe_write_i2c_byte_unlocked - Writes 8 bit word over I2C + * @hw: pointer to hardware structure + * @byte_offset: byte offset to write + * @dev_addr: I2C bus address to write to + * @data: value to write + * + * Performs byte write operation to SFP module's EEPROM over I2C interface + * at a specified device address. + **/ +s32 ixgbe_write_i2c_byte_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data) +{ + return ixgbe_call_func(hw, hw->phy.ops.write_i2c_byte_unlocked, + (hw, byte_offset, dev_addr, data), + IXGBE_NOT_IMPLEMENTED); +} + +/** * ixgbe_write_i2c_combined - Perform I2C write combined operation * @hw: pointer to the hardware structure * @addr: I2C bus address to write to @@ -1387,6 +1469,22 @@ s32 ixgbe_write_i2c_combined(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 val) } /** + * ixgbe_write_i2c_combined_unlocked - Perform I2C write combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to write to + * @reg: I2C device register to write to + * @val: value to write + * + * Returns an error code on error. + **/ +s32 ixgbe_write_i2c_combined_unlocked(struct ixgbe_hw *hw, u8 addr, u16 reg, + u16 val) +{ + return ixgbe_call_func(hw, hw->phy.ops.write_i2c_combined_unlocked, + (hw, addr, reg, val), IXGBE_NOT_IMPLEMENTED); +} + +/** * ixgbe_write_i2c_eeprom - Writes 8 bit EEPROM word over I2C interface * @hw: pointer to hardware structure * @byte_offset: EEPROM byte offset to write diff --git a/sys/dev/ixgbe/ixgbe_api.h b/sys/dev/ixgbe/ixgbe_api.h index 650ae67..8c2c4a8 100644 --- a/sys/dev/ixgbe/ixgbe_api.h +++ b/sys/dev/ixgbe/ixgbe_api.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -173,10 +173,18 @@ u32 ixgbe_atr_compute_sig_hash_82599(union ixgbe_atr_hash_dword input, bool ixgbe_verify_lesm_fw_enabled_82599(struct ixgbe_hw *hw); s32 ixgbe_read_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 *data); +s32 ixgbe_read_i2c_byte_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data); s32 ixgbe_read_i2c_combined(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 *val); +s32 ixgbe_read_i2c_combined_unlocked(struct ixgbe_hw *hw, u8 addr, u16 reg, + u16 *val); s32 ixgbe_write_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 data); +s32 ixgbe_write_i2c_byte_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data); s32 ixgbe_write_i2c_combined(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 val); +s32 ixgbe_write_i2c_combined_unlocked(struct ixgbe_hw *hw, u8 addr, u16 reg, + u16 val); s32 ixgbe_write_i2c_eeprom(struct ixgbe_hw *hw, u8 byte_offset, u8 eeprom_data); s32 ixgbe_get_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr); s32 ixgbe_set_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr); @@ -203,6 +211,7 @@ void ixgbe_enable_mdd(struct ixgbe_hw *hw); void ixgbe_mdd_event(struct ixgbe_hw *hw, u32 *vf_bitmap); void ixgbe_restore_mdd_vf(struct ixgbe_hw *hw, u32 vf); s32 ixgbe_enter_lplu(struct ixgbe_hw *hw); +s32 ixgbe_handle_lasi(struct ixgbe_hw *hw); void ixgbe_set_rate_select_speed(struct ixgbe_hw *hw, ixgbe_link_speed speed); void ixgbe_disable_rx(struct ixgbe_hw *hw); void ixgbe_enable_rx(struct ixgbe_hw *hw); diff --git a/sys/dev/ixgbe/ixgbe_common.c b/sys/dev/ixgbe/ixgbe_common.c index 57fe1b5..f0a0776 100644 --- a/sys/dev/ixgbe/ixgbe_common.c +++ b/sys/dev/ixgbe/ixgbe_common.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -188,6 +188,7 @@ bool ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw) case IXGBE_DEV_ID_X540T1: case IXGBE_DEV_ID_X540_BYPASS: case IXGBE_DEV_ID_X550T: + case IXGBE_DEV_ID_X550EM_X_10G_T: supported = TRUE; break; default: @@ -1090,7 +1091,7 @@ s32 ixgbe_stop_adapter_generic(struct ixgbe_hw *hw) msec_delay(2); /* - * Prevent the PCI-E bus from from hanging by disabling PCI-E master + * Prevent the PCI-E bus from hanging by disabling PCI-E master * access and verify no pending requests */ return ixgbe_disable_pcie_master(hw); @@ -3573,7 +3574,6 @@ u16 ixgbe_get_pcie_msix_count_generic(struct ixgbe_hw *hw) case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: pcie_offset = IXGBE_PCIE_MSIX_82599_CAPS; max_msix_count = IXGBE_MAX_MSIX_VECTORS_82599; break; diff --git a/sys/dev/ixgbe/ixgbe_common.h b/sys/dev/ixgbe/ixgbe_common.h index 94c7f97..e685f5b 100644 --- a/sys/dev/ixgbe/ixgbe_common.h +++ b/sys/dev/ixgbe/ixgbe_common.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_dcb.c b/sys/dev/ixgbe/ixgbe_dcb.c index 6f848e7..3659d17 100644 --- a/sys/dev/ixgbe/ixgbe_dcb.c +++ b/sys/dev/ixgbe/ixgbe_dcb.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -396,7 +396,6 @@ s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_get_tc_stats_82599(hw, stats, tc_count); break; @@ -427,7 +426,6 @@ s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_get_pfc_stats_82599(hw, stats, tc_count); break; @@ -469,7 +467,6 @@ s32 ixgbe_dcb_config_rx_arbiter_cee(struct ixgbe_hw *hw, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_rx_arbiter_82599(hw, refill, max, bwgid, tsa, map); @@ -511,7 +508,6 @@ s32 ixgbe_dcb_config_tx_desc_arbiter_cee(struct ixgbe_hw *hw, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_tx_desc_arbiter_82599(hw, refill, max, bwgid, tsa); @@ -555,7 +551,6 @@ s32 ixgbe_dcb_config_tx_data_arbiter_cee(struct ixgbe_hw *hw, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_tx_data_arbiter_82599(hw, refill, max, bwgid, tsa, @@ -593,7 +588,6 @@ s32 ixgbe_dcb_config_pfc_cee(struct ixgbe_hw *hw, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_pfc_82599(hw, pfc_en, map); break; @@ -622,7 +616,6 @@ s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *hw) case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_tc_stats_82599(hw, NULL); break; @@ -670,7 +663,6 @@ s32 ixgbe_dcb_hw_config_cee(struct ixgbe_hw *hw, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ixgbe_dcb_config_82599(hw, dcb_config); ret = ixgbe_dcb_hw_config_82599(hw, dcb_config->link_speed, @@ -705,7 +697,6 @@ s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *hw, u8 pfc_en, u8 *map) case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ret = ixgbe_dcb_config_pfc_82599(hw, pfc_en, map); break; @@ -731,7 +722,6 @@ s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw, u16 *refill, u16 *max, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: #if !defined(NO_82599_SUPPORT) || !defined(NO_X540_SUPPORT) ixgbe_dcb_config_rx_arbiter_82599(hw, refill, max, bwg_id, tsa, map); diff --git a/sys/dev/ixgbe/ixgbe_dcb.h b/sys/dev/ixgbe/ixgbe_dcb.h index 878bbf8..871b784 100644 --- a/sys/dev/ixgbe/ixgbe_dcb.h +++ b/sys/dev/ixgbe/ixgbe_dcb.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_dcb_82598.c b/sys/dev/ixgbe/ixgbe_dcb_82598.c index a5a090d..fb946c9 100644 --- a/sys/dev/ixgbe/ixgbe_dcb_82598.c +++ b/sys/dev/ixgbe/ixgbe_dcb_82598.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_dcb_82598.h b/sys/dev/ixgbe/ixgbe_dcb_82598.h index 47f19f6..35974f7 100644 --- a/sys/dev/ixgbe/ixgbe_dcb_82598.h +++ b/sys/dev/ixgbe/ixgbe_dcb_82598.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_dcb_82599.c b/sys/dev/ixgbe/ixgbe_dcb_82599.c index 0232d3c..4443411 100644 --- a/sys/dev/ixgbe/ixgbe_dcb_82599.c +++ b/sys/dev/ixgbe/ixgbe_dcb_82599.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_dcb_82599.h b/sys/dev/ixgbe/ixgbe_dcb_82599.h index 7702dc9..bab7628 100644 --- a/sys/dev/ixgbe/ixgbe_dcb_82599.h +++ b/sys/dev/ixgbe/ixgbe_dcb_82599.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_mbx.c b/sys/dev/ixgbe/ixgbe_mbx.c index 067bba0..d8ba55a 100644 --- a/sys/dev/ixgbe/ixgbe_mbx.c +++ b/sys/dev/ixgbe/ixgbe_mbx.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -607,7 +607,6 @@ static s32 ixgbe_check_for_rst_pf(struct ixgbe_hw *hw, u16 vf_number) break; case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: - case ixgbe_mac_X550EM_a: case ixgbe_mac_X540: vflre = IXGBE_READ_REG(hw, IXGBE_VFLREC(reg_offset)); break; @@ -745,7 +744,6 @@ void ixgbe_init_mbx_params_pf(struct ixgbe_hw *hw) if (hw->mac.type != ixgbe_mac_82599EB && hw->mac.type != ixgbe_mac_X550 && hw->mac.type != ixgbe_mac_X550EM_x && - hw->mac.type != ixgbe_mac_X550EM_a && hw->mac.type != ixgbe_mac_X540) return; diff --git a/sys/dev/ixgbe/ixgbe_mbx.h b/sys/dev/ixgbe/ixgbe_mbx.h index 2cffb8a..ea75cbe 100644 --- a/sys/dev/ixgbe/ixgbe_mbx.h +++ b/sys/dev/ixgbe/ixgbe_mbx.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_osdep.h b/sys/dev/ixgbe/ixgbe_osdep.h index ebb55f4..95f6ed5c 100644 --- a/sys/dev/ixgbe/ixgbe_osdep.h +++ b/sys/dev/ixgbe/ixgbe_osdep.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -108,13 +108,14 @@ #define UNREFERENCED_3PARAMETER(_p, _q, _r) #define UNREFERENCED_4PARAMETER(_p, _q, _r, _s) - #define IXGBE_NTOHL(_i) ntohl(_i) #define IXGBE_NTOHS(_i) ntohs(_i) /* XXX these need to be revisited */ -#define IXGBE_CPU_TO_LE32 le32toh -#define IXGBE_LE32_TO_CPUS le32dec +#define IXGBE_CPU_TO_LE32 htole32 +#define IXGBE_LE32_TO_CPUS(x) +#define IXGBE_CPU_TO_BE16 htobe16 +#define IXGBE_CPU_TO_BE32 htobe32 typedef uint8_t u8; typedef int8_t s8; diff --git a/sys/dev/ixgbe/ixgbe_phy.c b/sys/dev/ixgbe/ixgbe_phy.c index 5ac719c..88206c7 100644 --- a/sys/dev/ixgbe/ixgbe_phy.c +++ b/sys/dev/ixgbe/ixgbe_phy.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -101,16 +101,17 @@ static u8 ixgbe_ones_comp_byte_add(u8 add1, u8 add2) } /** - * ixgbe_read_i2c_combined_generic - Perform I2C read combined operation + * ixgbe_read_i2c_combined_generic_int - Perform I2C read combined operation * @hw: pointer to the hardware structure * @addr: I2C bus address to read from * @reg: I2C device register to read from * @val: pointer to location to receive read value + * @lock: TRUE if to take and release semaphore * * Returns an error code on error. */ -static s32 ixgbe_read_i2c_combined_generic(struct ixgbe_hw *hw, u8 addr, - u16 reg, u16 *val) +static s32 ixgbe_read_i2c_combined_generic_int(struct ixgbe_hw *hw, u8 addr, + u16 reg, u16 *val, bool lock) { u32 swfw_mask = hw->phy.phy_semaphore_mask; int max_retry = 10; @@ -121,11 +122,13 @@ static s32 ixgbe_read_i2c_combined_generic(struct ixgbe_hw *hw, u8 addr, u8 reg_high; u8 csum; + if (hw->mac.type >= ixgbe_mac_X550) + max_retry = 3; reg_high = ((reg >> 7) & 0xFE) | 1; /* Indicate read combined */ csum = ixgbe_ones_comp_byte_add(reg_high, reg & 0xFF); csum = ~csum; do { - if (hw->mac.ops.acquire_swfw_sync(hw, swfw_mask)) + if (lock && hw->mac.ops.acquire_swfw_sync(hw, swfw_mask)) return IXGBE_ERR_SWFW_SYNC; ixgbe_i2c_start(hw); /* Device Address and write indication */ @@ -158,13 +161,15 @@ static s32 ixgbe_read_i2c_combined_generic(struct ixgbe_hw *hw, u8 addr, if (ixgbe_clock_out_i2c_bit(hw, FALSE)) goto fail; ixgbe_i2c_stop(hw); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); *val = (high_bits << 8) | low_bits; return 0; fail: ixgbe_i2c_bus_clear(hw); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); retry++; if (retry < max_retry) DEBUGOUT("I2C byte read combined error - Retrying.\n"); @@ -176,17 +181,50 @@ fail: } /** - * ixgbe_write_i2c_combined_generic - Perform I2C write combined operation + * ixgbe_read_i2c_combined_generic - Perform I2C read combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to read from + * @reg: I2C device register to read from + * @val: pointer to location to receive read value + * + * Returns an error code on error. + **/ +static s32 ixgbe_read_i2c_combined_generic(struct ixgbe_hw *hw, u8 addr, + u16 reg, u16 *val) +{ + return ixgbe_read_i2c_combined_generic_int(hw, addr, reg, val, TRUE); +} + +/** + * ixgbe_read_i2c_combined_generic_unlocked - Do I2C read combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to read from + * @reg: I2C device register to read from + * @val: pointer to location to receive read value + * + * Returns an error code on error. + **/ +static s32 +ixgbe_read_i2c_combined_generic_unlocked(struct ixgbe_hw *hw, u8 addr, + u16 reg, u16 *val) +{ + return ixgbe_read_i2c_combined_generic_int(hw, addr, reg, val, FALSE); +} + +/** + * ixgbe_write_i2c_combined_generic_int - Perform I2C write combined operation * @hw: pointer to the hardware structure * @addr: I2C bus address to write to * @reg: I2C device register to write to * @val: value to write + * @lock: TRUE if to take and release semaphore * * Returns an error code on error. */ -static s32 ixgbe_write_i2c_combined_generic(struct ixgbe_hw *hw, - u8 addr, u16 reg, u16 val) +static s32 ixgbe_write_i2c_combined_generic_int(struct ixgbe_hw *hw, u8 addr, + u16 reg, u16 val, bool lock) { + u32 swfw_mask = hw->phy.phy_semaphore_mask; int max_retry = 1; int retry = 0; u8 reg_high; @@ -198,6 +236,8 @@ static s32 ixgbe_write_i2c_combined_generic(struct ixgbe_hw *hw, csum = ixgbe_ones_comp_byte_add(csum, val & 0xFF); csum = ~csum; do { + if (lock && hw->mac.ops.acquire_swfw_sync(hw, swfw_mask)) + return IXGBE_ERR_SWFW_SYNC; ixgbe_i2c_start(hw); /* Device Address and write indication */ if (ixgbe_out_i2c_byte_ack(hw, addr)) @@ -218,10 +258,14 @@ static s32 ixgbe_write_i2c_combined_generic(struct ixgbe_hw *hw, if (ixgbe_out_i2c_byte_ack(hw, csum)) goto fail; ixgbe_i2c_stop(hw); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); return 0; fail: ixgbe_i2c_bus_clear(hw); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); retry++; if (retry < max_retry) DEBUGOUT("I2C byte write combined error - Retrying.\n"); @@ -233,6 +277,37 @@ fail: } /** + * ixgbe_write_i2c_combined_generic - Perform I2C write combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to write to + * @reg: I2C device register to write to + * @val: value to write + * + * Returns an error code on error. + **/ +static s32 ixgbe_write_i2c_combined_generic(struct ixgbe_hw *hw, + u8 addr, u16 reg, u16 val) +{ + return ixgbe_write_i2c_combined_generic_int(hw, addr, reg, val, TRUE); +} + +/** + * ixgbe_write_i2c_combined_generic_unlocked - Do I2C write combined operation + * @hw: pointer to the hardware structure + * @addr: I2C bus address to write to + * @reg: I2C device register to write to + * @val: value to write + * + * Returns an error code on error. + **/ +static s32 +ixgbe_write_i2c_combined_generic_unlocked(struct ixgbe_hw *hw, + u8 addr, u16 reg, u16 val) +{ + return ixgbe_write_i2c_combined_generic_int(hw, addr, reg, val, FALSE); +} + +/** * ixgbe_init_phy_ops_generic - Inits PHY function ptrs * @hw: pointer to the hardware structure * @@ -265,6 +340,13 @@ s32 ixgbe_init_phy_ops_generic(struct ixgbe_hw *hw) phy->sfp_type = ixgbe_sfp_type_unknown; phy->ops.read_i2c_combined = ixgbe_read_i2c_combined_generic; phy->ops.write_i2c_combined = ixgbe_write_i2c_combined_generic; + phy->ops.read_i2c_combined_unlocked = + ixgbe_read_i2c_combined_generic_unlocked; + phy->ops.write_i2c_combined_unlocked = + ixgbe_write_i2c_combined_generic_unlocked; + phy->ops.read_i2c_byte_unlocked = ixgbe_read_i2c_byte_generic_unlocked; + phy->ops.write_i2c_byte_unlocked = + ixgbe_write_i2c_byte_generic_unlocked; phy->ops.check_overtemp = ixgbe_tn_check_overtemp; return IXGBE_SUCCESS; } @@ -1363,6 +1445,13 @@ s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw) else hw->phy.sfp_type = ixgbe_sfp_type_1g_sx_core1; + } else if (comp_codes_1g & IXGBE_SFF_1GBASELX_CAPABLE) { + if (hw->bus.lan_id == 0) + hw->phy.sfp_type = + ixgbe_sfp_type_1g_lx_core0; + else + hw->phy.sfp_type = + ixgbe_sfp_type_1g_lx_core1; } else { hw->phy.sfp_type = ixgbe_sfp_type_unknown; } @@ -1450,6 +1539,8 @@ s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw) if (comp_codes_10g == 0 && !(hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core1 || hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core1 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core0 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1)) { hw->phy.type = ixgbe_phy_sfp_unsupported; @@ -1467,6 +1558,8 @@ s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw) if (!(enforce_sfp & IXGBE_DEVICE_CAPS_ALLOW_ANY_SFP) && !(hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core0 || hw->phy.sfp_type == ixgbe_sfp_type_1g_cu_core1 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core1 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core0 || hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1)) { /* Make sure we're a supported PHY type */ @@ -1600,6 +1693,9 @@ s32 ixgbe_identify_qsfp_module_generic(struct ixgbe_hw *hw) goto out; } + /* LAN ID is needed for I2C access */ + hw->mac.ops.set_lan_id(hw); + status = hw->phy.ops.read_i2c_eeprom(hw, IXGBE_SFF_IDENTIFIER, &identifier); @@ -1614,9 +1710,6 @@ s32 ixgbe_identify_qsfp_module_generic(struct ixgbe_hw *hw) hw->phy.id = identifier; - /* LAN ID is needed for sfp_type determination */ - hw->mac.ops.set_lan_id(hw); - status = hw->phy.ops.read_i2c_eeprom(hw, IXGBE_SFF_QSFP_10GBE_COMP, &comp_codes_10g); @@ -1804,10 +1897,12 @@ s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw, * SR modules */ if (sfp_type == ixgbe_sfp_type_da_act_lmt_core0 || + sfp_type == ixgbe_sfp_type_1g_lx_core0 || sfp_type == ixgbe_sfp_type_1g_cu_core0 || sfp_type == ixgbe_sfp_type_1g_sx_core0) sfp_type = ixgbe_sfp_type_srlr_core0; else if (sfp_type == ixgbe_sfp_type_da_act_lmt_core1 || + sfp_type == ixgbe_sfp_type_1g_lx_core1 || sfp_type == ixgbe_sfp_type_1g_cu_core1 || sfp_type == ixgbe_sfp_type_1g_sx_core1) sfp_type = ixgbe_sfp_type_srlr_core1; @@ -1932,16 +2027,17 @@ static bool ixgbe_is_sfp_probe(struct ixgbe_hw *hw, u8 offset, u8 addr) } /** - * ixgbe_read_i2c_byte_generic - Reads 8 bit word over I2C + * ixgbe_read_i2c_byte_generic_int - Reads 8 bit word over I2C * @hw: pointer to hardware structure * @byte_offset: byte offset to read * @data: value read + * @lock: TRUE if to take and release semaphore * * Performs byte read operation to SFP module's EEPROM over I2C interface at * a specified device address. **/ -s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, - u8 dev_addr, u8 *data) +static s32 ixgbe_read_i2c_byte_generic_int(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data, bool lock) { s32 status; u32 max_retry = 10; @@ -1952,11 +2048,13 @@ s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, DEBUGFUNC("ixgbe_read_i2c_byte_generic"); + if (hw->mac.type >= ixgbe_mac_X550) + max_retry = 3; if (ixgbe_is_sfp_probe(hw, byte_offset, dev_addr)) max_retry = IXGBE_SFP_DETECT_RETRIES; do { - if (hw->mac.ops.acquire_swfw_sync(hw, swfw_mask)) + if (lock && hw->mac.ops.acquire_swfw_sync(hw, swfw_mask)) return IXGBE_ERR_SWFW_SYNC; ixgbe_i2c_start(hw); @@ -1998,13 +2096,16 @@ s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, goto fail; ixgbe_i2c_stop(hw); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); return IXGBE_SUCCESS; fail: ixgbe_i2c_bus_clear(hw); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); - msec_delay(100); + if (lock) { + hw->mac.ops.release_swfw_sync(hw, swfw_mask); + msec_delay(100); + } retry++; if (retry < max_retry) DEBUGOUT("I2C byte read error - Retrying.\n"); @@ -2017,28 +2118,60 @@ fail: } /** - * ixgbe_write_i2c_byte_generic - Writes 8 bit word over I2C + * ixgbe_read_i2c_byte_generic - Reads 8 bit word over I2C + * @hw: pointer to hardware structure + * @byte_offset: byte offset to read + * @data: value read + * + * Performs byte read operation to SFP module's EEPROM over I2C interface at + * a specified device address. + **/ +s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data) +{ + return ixgbe_read_i2c_byte_generic_int(hw, byte_offset, dev_addr, + data, TRUE); +} + +/** + * ixgbe_read_i2c_byte_generic_unlocked - Reads 8 bit word over I2C + * @hw: pointer to hardware structure + * @byte_offset: byte offset to read + * @data: value read + * + * Performs byte read operation to SFP module's EEPROM over I2C interface at + * a specified device address. + **/ +s32 ixgbe_read_i2c_byte_generic_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data) +{ + return ixgbe_read_i2c_byte_generic_int(hw, byte_offset, dev_addr, + data, FALSE); +} + +/** + * ixgbe_write_i2c_byte_generic_int - Writes 8 bit word over I2C * @hw: pointer to hardware structure * @byte_offset: byte offset to write * @data: value to write + * @lock: TRUE if to take and release semaphore * * Performs byte write operation to SFP module's EEPROM over I2C interface at * a specified device address. **/ -s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, - u8 dev_addr, u8 data) +static s32 ixgbe_write_i2c_byte_generic_int(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data, bool lock) { - s32 status = IXGBE_SUCCESS; + s32 status; u32 max_retry = 1; u32 retry = 0; u32 swfw_mask = hw->phy.phy_semaphore_mask; DEBUGFUNC("ixgbe_write_i2c_byte_generic"); - if (hw->mac.ops.acquire_swfw_sync(hw, swfw_mask) != IXGBE_SUCCESS) { - status = IXGBE_ERR_SWFW_SYNC; - goto write_byte_out; - } + if (lock && hw->mac.ops.acquire_swfw_sync(hw, swfw_mask) != + IXGBE_SUCCESS) + return IXGBE_ERR_SWFW_SYNC; do { ixgbe_i2c_start(hw); @@ -2068,7 +2201,8 @@ s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, goto fail; ixgbe_i2c_stop(hw); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); return IXGBE_SUCCESS; fail: @@ -2080,13 +2214,45 @@ fail: DEBUGOUT("I2C byte write error.\n"); } while (retry < max_retry); - hw->mac.ops.release_swfw_sync(hw, swfw_mask); + if (lock) + hw->mac.ops.release_swfw_sync(hw, swfw_mask); -write_byte_out: return status; } /** + * ixgbe_write_i2c_byte_generic - Writes 8 bit word over I2C + * @hw: pointer to hardware structure + * @byte_offset: byte offset to write + * @data: value to write + * + * Performs byte write operation to SFP module's EEPROM over I2C interface at + * a specified device address. + **/ +s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data) +{ + return ixgbe_write_i2c_byte_generic_int(hw, byte_offset, dev_addr, + data, TRUE); +} + +/** + * ixgbe_write_i2c_byte_generic_unlocked - Writes 8 bit word over I2C + * @hw: pointer to hardware structure + * @byte_offset: byte offset to write + * @data: value to write + * + * Performs byte write operation to SFP module's EEPROM over I2C interface at + * a specified device address. + **/ +s32 ixgbe_write_i2c_byte_generic_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data) +{ + return ixgbe_write_i2c_byte_generic_int(hw, byte_offset, dev_addr, + data, FALSE); +} + +/** * ixgbe_i2c_start - Sets I2C start condition * @hw: pointer to hardware structure * diff --git a/sys/dev/ixgbe/ixgbe_phy.h b/sys/dev/ixgbe/ixgbe_phy.h index 021d5f0..fad31bd 100644 --- a/sys/dev/ixgbe/ixgbe_phy.h +++ b/sys/dev/ixgbe/ixgbe_phy.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -89,9 +89,24 @@ #define IXGBE_I2C_EEPROM_STATUS_IN_PROGRESS 0x3 #define IXGBE_CS4227 0xBE /* CS4227 address */ -#define IXGBE_CS4227_SPARE24_LSB 0x12B0 /* Reg to program EDC */ +#define IXGBE_CS4227_GLOBAL_ID_LSB 0 +#define IXGBE_CS4227_SCRATCH 2 +#define IXGBE_CS4227_GLOBAL_ID_VALUE 0x03E5 +#define IXGBE_CS4227_SCRATCH_VALUE 0x5aa5 +#define IXGBE_CS4227_RETRIES 5 +#define IXGBE_CS4227_LINE_SPARE22_MSB 0x12AD /* Reg to program speed */ +#define IXGBE_CS4227_LINE_SPARE24_LSB 0x12B0 /* Reg to program EDC */ +#define IXGBE_CS4227_HOST_SPARE22_MSB 0x1AAD /* Reg to program speed */ +#define IXGBE_CS4227_HOST_SPARE24_LSB 0x1AB0 /* Reg to program EDC */ #define IXGBE_CS4227_EDC_MODE_CX1 0x0002 #define IXGBE_CS4227_EDC_MODE_SR 0x0004 +#define IXGBE_CS4227_RESET_HOLD 500 /* microseconds */ +#define IXGBE_CS4227_RESET_DELAY 500 /* milliseconds */ +#define IXGBE_CS4227_CHECK_DELAY 30 /* milliseconds */ +#define IXGBE_PE 0xE0 /* Port expander address */ +#define IXGBE_PE_OUTPUT 1 /* Output register offset */ +#define IXGBE_PE_CONFIG 3 /* Config register offset */ +#define IXGBE_PE_BIT1 (1 << 1) /* Flow control defines */ #define IXGBE_TAF_SYM_PAUSE 0x400 @@ -175,8 +190,12 @@ s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw, s32 ixgbe_tn_check_overtemp(struct ixgbe_hw *hw); s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 *data); +s32 ixgbe_read_i2c_byte_generic_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 *data); s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 data); +s32 ixgbe_write_i2c_byte_generic_unlocked(struct ixgbe_hw *hw, u8 byte_offset, + u8 dev_addr, u8 data); s32 ixgbe_read_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset, u8 *eeprom_data); s32 ixgbe_write_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset, diff --git a/sys/dev/ixgbe/ixgbe_type.h b/sys/dev/ixgbe/ixgbe_type.h index 24ba046..2a53952 100644 --- a/sys/dev/ixgbe/ixgbe_type.h +++ b/sys/dev/ixgbe/ixgbe_type.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -111,6 +111,7 @@ #define IXGBE_SUBDEV_ID_82599_LOM_SNAP6 0x2159 #define IXGBE_SUBDEV_ID_82599_SFP_1OCP 0x000D #define IXGBE_SUBDEV_ID_82599_SFP_2OCP 0x0008 +#define IXGBE_SUBDEV_ID_82599_SFP_LOM 0x06EE #define IXGBE_DEV_ID_82599_BACKPLANE_FCOE 0x152A #define IXGBE_DEV_ID_82599_SFP_FCOE 0x1529 #define IXGBE_DEV_ID_82599_SFP_EM 0x1507 @@ -130,8 +131,6 @@ #define IXGBE_DEV_ID_X540_BYPASS 0x155C #define IXGBE_DEV_ID_X540T1 0x1560 #define IXGBE_DEV_ID_X550T 0x1563 -/* Placeholder value, pending official value. */ -#define IXGBE_DEV_ID_X550EM_A_KR 0xABCD #define IXGBE_DEV_ID_X550EM_X_KX4 0x15AA #define IXGBE_DEV_ID_X550EM_X_KR 0x15AB #define IXGBE_DEV_ID_X550EM_X_SFP 0x15AC @@ -139,11 +138,13 @@ #define IXGBE_DEV_ID_X550EM_X_1G_T 0x15AE #define IXGBE_DEV_ID_X550_VF_HV 0x1564 #define IXGBE_DEV_ID_X550_VF 0x1565 -#define IXGBE_DEV_ID_X550EM_A_VF 0x15B3 -#define IXGBE_DEV_ID_X550EM_A_VF_HV 0x15B4 #define IXGBE_DEV_ID_X550EM_X_VF 0x15A8 #define IXGBE_DEV_ID_X550EM_X_VF_HV 0x15A9 +#define IXGBE_CAT(r,m) IXGBE_##r##m + +#define IXGBE_BY_MAC(_hw, r) ((_hw)->mvals[IXGBE_CAT(r, _IDX)]) + /* General Registers */ #define IXGBE_CTRL 0x00000 #define IXGBE_STATUS 0x00008 @@ -151,9 +152,11 @@ #define IXGBE_ESDP 0x00020 #define IXGBE_EODSDP 0x00028 #define IXGBE_I2CCTL_82599 0x00028 +#define IXGBE_I2CCTL IXGBE_I2CCTL_82599 +#define IXGBE_I2CCTL_X540 IXGBE_I2CCTL_82599 #define IXGBE_I2CCTL_X550 0x15F5C -#define IXGBE_I2CCTL_BY_MAC(_hw) ((((_hw)->mac.type >= ixgbe_mac_X550) ? \ - IXGBE_I2CCTL_X550 : IXGBE_I2CCTL_82599)) +#define IXGBE_I2CCTL_X550EM_x IXGBE_I2CCTL_X550 +#define IXGBE_I2CCTL_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2CCTL) #define IXGBE_PHY_GPIO 0x00028 #define IXGBE_MAC_GPIO 0x00030 #define IXGBE_PHYINT_STATUS0 0x00100 @@ -166,18 +169,40 @@ #define IXGBE_EXVET 0x05078 /* NVM Registers */ -#define IXGBE_EEC 0x10010 -#define IXGBE_EERD 0x10014 -#define IXGBE_EEWR 0x10018 -#define IXGBE_FLA 0x1001C +#define IXGBE_EEC 0x10010 +#define IXGBE_EEC_X540 IXGBE_EEC +#define IXGBE_EEC_X550 IXGBE_EEC +#define IXGBE_EEC_X550EM_x IXGBE_EEC +#define IXGBE_EEC_BY_MAC(_hw) IXGBE_EEC + +#define IXGBE_EERD 0x10014 +#define IXGBE_EEWR 0x10018 + +#define IXGBE_FLA 0x1001C +#define IXGBE_FLA_X540 IXGBE_FLA +#define IXGBE_FLA_X550 IXGBE_FLA +#define IXGBE_FLA_X550EM_x IXGBE_FLA +#define IXGBE_FLA_BY_MAC(_hw) IXGBE_FLA + #define IXGBE_EEMNGCTL 0x10110 #define IXGBE_EEMNGDATA 0x10114 #define IXGBE_FLMNGCTL 0x10118 #define IXGBE_FLMNGDATA 0x1011C #define IXGBE_FLMNGCNT 0x10120 #define IXGBE_FLOP 0x1013C -#define IXGBE_GRC 0x10200 -#define IXGBE_SRAMREL 0x10210 + +#define IXGBE_GRC 0x10200 +#define IXGBE_GRC_X540 IXGBE_GRC +#define IXGBE_GRC_X550 IXGBE_GRC +#define IXGBE_GRC_X550EM_x IXGBE_GRC +#define IXGBE_GRC_BY_MAC(_hw) IXGBE_GRC + +#define IXGBE_SRAMREL 0x10210 +#define IXGBE_SRAMREL_X540 IXGBE_SRAMREL +#define IXGBE_SRAMREL_X550 IXGBE_SRAMREL +#define IXGBE_SRAMREL_X550EM_x IXGBE_SRAMREL +#define IXGBE_SRAMREL_BY_MAC(_hw) IXGBE_SRAMREL + #define IXGBE_PHYDBG 0x10218 /* General Receive Control */ @@ -188,20 +213,48 @@ #define IXGBE_VPDDIAG1 0x10208 /* I2CCTL Bit Masks */ -#define IXGBE_I2C_CLK_IN_BY_MAC(_hw)(((_hw)->mac.type) >= ixgbe_mac_X550 ? \ - 0x00004000 : 0x00000001) -#define IXGBE_I2C_CLK_OUT_BY_MAC(_hw)(((_hw)->mac.type) >= ixgbe_mac_X550 ? \ - 0x00000200 : 0x00000002) -#define IXGBE_I2C_DATA_IN_BY_MAC(_hw)(((_hw)->mac.type) >= ixgbe_mac_X550 ? \ - 0x00001000 : 0x00000004) -#define IXGBE_I2C_DATA_OUT_BY_MAC(_hw)(((_hw)->mac.type) >= ixgbe_mac_X550 ? \ - 0x00000400 : 0x00000008) -#define IXGBE_I2C_BB_EN_BY_MAC(hw) ((hw)->mac.type >= ixgbe_mac_X550 ? \ - 0x00000100 : 0) -#define IXGBE_I2C_DATA_OE_N_EN_BY_MAC(hw) ((hw)->mac.type >= ixgbe_mac_X550 ? \ - 0x00000800 : 0) -#define IXGBE_I2C_CLK_OE_N_EN_BY_MAC(hw) ((hw)->mac.type >= ixgbe_mac_X550 ? \ - 0x00002000 : 0) +#define IXGBE_I2C_CLK_IN 0x00000001 +#define IXGBE_I2C_CLK_IN_X540 IXGBE_I2C_CLK_IN +#define IXGBE_I2C_CLK_IN_X550 0x00004000 +#define IXGBE_I2C_CLK_IN_X550EM_x IXGBE_I2C_CLK_IN_X550 +#define IXGBE_I2C_CLK_IN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_CLK_IN) + +#define IXGBE_I2C_CLK_OUT 0x00000002 +#define IXGBE_I2C_CLK_OUT_X540 IXGBE_I2C_CLK_OUT +#define IXGBE_I2C_CLK_OUT_X550 0x00000200 +#define IXGBE_I2C_CLK_OUT_X550EM_x IXGBE_I2C_CLK_OUT_X550 +#define IXGBE_I2C_CLK_OUT_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_CLK_OUT) + +#define IXGBE_I2C_DATA_IN 0x00000004 +#define IXGBE_I2C_DATA_IN_X540 IXGBE_I2C_DATA_IN +#define IXGBE_I2C_DATA_IN_X550 0x00001000 +#define IXGBE_I2C_DATA_IN_X550EM_x IXGBE_I2C_DATA_IN_X550 +#define IXGBE_I2C_DATA_IN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_DATA_IN) + +#define IXGBE_I2C_DATA_OUT 0x00000008 +#define IXGBE_I2C_DATA_OUT_X540 IXGBE_I2C_DATA_OUT +#define IXGBE_I2C_DATA_OUT_X550 0x00000400 +#define IXGBE_I2C_DATA_OUT_X550EM_x IXGBE_I2C_DATA_OUT_X550 +#define IXGBE_I2C_DATA_OUT_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_DATA_OUT) + +#define IXGBE_I2C_DATA_OE_N_EN 0 +#define IXGBE_I2C_DATA_OE_N_EN_X540 IXGBE_I2C_DATA_OE_N_EN +#define IXGBE_I2C_DATA_OE_N_EN_X550 0x00000800 +#define IXGBE_I2C_DATA_OE_N_EN_X550EM_x IXGBE_I2C_DATA_OE_N_EN_X550 +#define IXGBE_I2C_DATA_OE_N_EN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_DATA_OE_N_EN) + +#define IXGBE_I2C_BB_EN 0 +#define IXGBE_I2C_BB_EN_X540 IXGBE_I2C_BB_EN +#define IXGBE_I2C_BB_EN_X550 0x00000100 +#define IXGBE_I2C_BB_EN_X550EM_x IXGBE_I2C_BB_EN_X550 + +#define IXGBE_I2C_BB_EN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_BB_EN) + +#define IXGBE_I2C_CLK_OE_N_EN 0 +#define IXGBE_I2C_CLK_OE_N_EN_X540 IXGBE_I2C_CLK_OE_N_EN +#define IXGBE_I2C_CLK_OE_N_EN_X550 0x00002000 +#define IXGBE_I2C_CLK_OE_N_EN_X550EM_x IXGBE_I2C_CLK_OE_N_EN_X550 +#define IXGBE_I2C_CLK_OE_N_EN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), I2C_CLK_OE_N_EN) #define IXGBE_I2C_CLOCK_STRETCHING_TIMEOUT 500 @@ -612,6 +665,7 @@ struct ixgbe_dmac_config { #define IXGBE_EEER 0x043A0 /* EEE register */ #define IXGBE_EEE_STAT 0x04398 /* EEE Status */ #define IXGBE_EEE_SU 0x04380 /* EEE Set up */ +#define IXGBE_EEE_SU_TEEE_DLY_SHIFT 26 #define IXGBE_TLPIC 0x041F4 /* EEE Tx LPI count */ #define IXGBE_RLPIC 0x041F8 /* EEE Rx LPI count */ @@ -989,14 +1043,34 @@ struct ixgbe_dmac_config { #define IXGBE_GSCN_2 0x11028 #define IXGBE_GSCN_3 0x1102C #define IXGBE_FACTPS 0x10150 +#define IXGBE_FACTPS_X540 IXGBE_FACTPS +#define IXGBE_FACTPS_X550 IXGBE_FACTPS +#define IXGBE_FACTPS_X550EM_x IXGBE_FACTPS +#define IXGBE_FACTPS_BY_MAC(_hw) IXGBE_FACTPS + #define IXGBE_PCIEANACTL 0x11040 #define IXGBE_SWSM 0x10140 +#define IXGBE_SWSM_X540 IXGBE_SWSM +#define IXGBE_SWSM_X550 IXGBE_SWSM +#define IXGBE_SWSM_X550EM_x IXGBE_SWSM +#define IXGBE_SWSM_BY_MAC(_hw) IXGBE_SWSM + #define IXGBE_FWSM 0x10148 +#define IXGBE_FWSM_X540 IXGBE_FWSM +#define IXGBE_FWSM_X550 IXGBE_FWSM +#define IXGBE_FWSM_X550EM_x IXGBE_FWSM +#define IXGBE_FWSM_BY_MAC(_hw) IXGBE_FWSM + +#define IXGBE_SWFW_SYNC IXGBE_GSSR +#define IXGBE_SWFW_SYNC_X540 IXGBE_SWFW_SYNC +#define IXGBE_SWFW_SYNC_X550 IXGBE_SWFW_SYNC +#define IXGBE_SWFW_SYNC_X550EM_x IXGBE_SWFW_SYNC +#define IXGBE_SWFW_SYNC_BY_MAC(_hw) IXGBE_SWFW_SYNC + #define IXGBE_GSSR 0x10160 #define IXGBE_MREVID 0x11064 #define IXGBE_DCA_ID 0x11070 #define IXGBE_DCA_CTRL 0x11074 -#define IXGBE_SWFW_SYNC IXGBE_GSSR /* PCI-E registers 82599-Specific */ #define IXGBE_GCR_EXT 0x11050 @@ -1008,14 +1082,18 @@ struct ixgbe_dmac_config { #define IXGBE_PHYDAT_82599 0x11044 #define IXGBE_PHYCTL_82599 0x11048 #define IXGBE_PBACLR_82599 0x11068 -#define IXGBE_CIAA_82599 0x11088 -#define IXGBE_CIAD_82599 0x1108C +#define IXGBE_CIAA 0x11088 +#define IXGBE_CIAD 0x1108C +#define IXGBE_CIAA_82599 IXGBE_CIAA +#define IXGBE_CIAD_82599 IXGBE_CIAD +#define IXGBE_CIAA_X540 IXGBE_CIAA +#define IXGBE_CIAD_X540 IXGBE_CIAD #define IXGBE_CIAA_X550 0x11508 #define IXGBE_CIAD_X550 0x11510 -#define IXGBE_CIAA_BY_MAC(_hw) ((((_hw)->mac.type >= ixgbe_mac_X550) ? \ - IXGBE_CIAA_X550 : IXGBE_CIAA_82599)) -#define IXGBE_CIAD_BY_MAC(_hw) ((((_hw)->mac.type >= ixgbe_mac_X550) ? \ - IXGBE_CIAD_X550 : IXGBE_CIAD_82599)) +#define IXGBE_CIAA_X550EM_x IXGBE_CIAA_X550 +#define IXGBE_CIAD_X550EM_x IXGBE_CIAD_X550 +#define IXGBE_CIAA_BY_MAC(_hw) IXGBE_BY_MAC((_hw), CIAA) +#define IXGBE_CIAD_BY_MAC(_hw) IXGBE_BY_MAC((_hw), CIAD) #define IXGBE_PICAUSE 0x110B0 #define IXGBE_PIENA 0x110B8 #define IXGBE_CDQ_MBR_82599 0x110B4 @@ -1365,6 +1443,8 @@ struct ixgbe_dmac_config { #define IXGBE_MDIO_AUTO_NEG_STATUS 0x1 /* AUTO_NEG Status Reg */ #define IXGBE_MDIO_AUTO_NEG_VENDOR_STAT 0xC800 /* AUTO_NEG Vendor Status Reg */ #define IXGBE_MDIO_AUTO_NEG_VENDOR_TX_ALARM 0xCC00 /* AUTO_NEG Vendor TX Reg */ +#define IXGBE_MDIO_AUTO_NEG_VENDOR_TX_ALARM2 0xCC01 /* AUTO_NEG Vendor Tx Reg */ +#define IXGBE_MDIO_AUTO_NEG_VEN_LSC 0x1 /* AUTO_NEG Vendor Tx LSC */ #define IXGBE_MDIO_AUTO_NEG_ADVT 0x10 /* AUTO_NEG Advt Reg */ #define IXGBE_MDIO_AUTO_NEG_LP 0x13 /* AUTO_NEG LP Status Reg */ #define IXGBE_MDIO_AUTO_NEG_EEE_ADVT 0x3C /* AUTO_NEG EEE Advt Reg */ @@ -1393,11 +1473,24 @@ struct ixgbe_dmac_config { #define IXGBE_MDIO_TX_VENDOR_ALARMS_3_RST_MASK 0x3 /* PHY Reset Complete Mask */ #define IXGBE_MDIO_GLOBAL_RES_PR_10 0xC479 /* Global Resv Provisioning 10 Reg */ #define IXGBE_MDIO_POWER_UP_STALL 0x8000 /* Power Up Stall */ - +#define IXGBE_MDIO_GLOBAL_INT_CHIP_STD_MASK 0xFF00 /* int std mask */ +#define IXGBE_MDIO_GLOBAL_CHIP_STD_INT_FLAG 0xFC00 /* chip std int flag */ +#define IXGBE_MDIO_GLOBAL_INT_CHIP_VEN_MASK 0xFF01 /* int chip-wide mask */ +#define IXGBE_MDIO_GLOBAL_INT_CHIP_VEN_FLAG 0xFC01 /* int chip-wide mask */ +#define IXGBE_MDIO_GLOBAL_ALARM_1 0xCC00 /* Global alarm 1 */ +#define IXGBE_MDIO_GLOBAL_ALM_1_HI_TMP_FAIL 0x4000 /* high temp failure */ +#define IXGBE_MDIO_GLOBAL_INT_MASK 0xD400 /* Global int mask */ +#define IXGBE_MDIO_GLOBAL_AN_VEN_ALM_INT_EN 0x1000 /* autoneg vendor alarm int enable */ +#define IXGBE_MDIO_GLOBAL_ALARM_1_INT 0x4 /* int in Global alarm 1 */ +#define IXGBE_MDIO_GLOBAL_VEN_ALM_INT_EN 0x1 /* vendor alarm int enable */ +#define IXGBE_MDIO_GLOBAL_STD_ALM2_INT 0x200 /* vendor alarm2 int mask */ +#define IXGBE_MDIO_GLOBAL_INT_HI_TEMP_EN 0x4000 /* int high temp enable */ #define IXGBE_MDIO_PMA_PMD_CONTROL_ADDR 0x0000 /* PMA/PMD Control Reg */ #define IXGBE_MDIO_PMA_PMD_SDA_SCL_ADDR 0xC30A /* PHY_XS SDA/SCL Addr Reg */ #define IXGBE_MDIO_PMA_PMD_SDA_SCL_DATA 0xC30B /* PHY_XS SDA/SCL Data Reg */ #define IXGBE_MDIO_PMA_PMD_SDA_SCL_STAT 0xC30C /* PHY_XS SDA/SCL Status Reg */ +#define IXGBE_MDIO_PMA_TX_VEN_LASI_INT_MASK 0xD401 /* PHY TX Vendor LASI */ +#define IXGBE_MDIO_PMA_TX_VEN_LASI_INT_EN 0x1 /* PHY TX Vendor LASI enable */ #define IXGBE_MDIO_PMD_STD_TX_DISABLE_CNTR 0x9 /* Standard Transmit Dis Reg */ #define IXGBE_MDIO_PMD_GLOBAL_TX_DISABLE 0x0001 /* PMD Global Transmit Dis */ @@ -1479,12 +1572,16 @@ struct ixgbe_dmac_config { #define IXGBE_SDP0_GPIEN_X540 0x00000002 /* SDP0 on X540 and X550 */ #define IXGBE_SDP1_GPIEN_X540 0x00000004 /* SDP1 on X540 and X550 */ #define IXGBE_SDP2_GPIEN_X540 0x00000008 /* SDP2 on X540 and X550 */ -#define IXGBE_SDP0_GPIEN_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_SDP0_GPIEN_X540 : IXGBE_SDP0_GPIEN) -#define IXGBE_SDP1_GPIEN_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_SDP1_GPIEN_X540 : IXGBE_SDP1_GPIEN) -#define IXGBE_SDP2_GPIEN_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_SDP2_GPIEN_X540 : IXGBE_SDP2_GPIEN) +#define IXGBE_SDP0_GPIEN_X550 IXGBE_SDP0_GPIEN_X540 +#define IXGBE_SDP1_GPIEN_X550 IXGBE_SDP1_GPIEN_X540 +#define IXGBE_SDP2_GPIEN_X550 IXGBE_SDP2_GPIEN_X540 +#define IXGBE_SDP0_GPIEN_X550EM_x IXGBE_SDP0_GPIEN_X540 +#define IXGBE_SDP1_GPIEN_X550EM_x IXGBE_SDP1_GPIEN_X540 +#define IXGBE_SDP2_GPIEN_X550EM_x IXGBE_SDP2_GPIEN_X540 +#define IXGBE_SDP0_GPIEN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), SDP0_GPIEN) +#define IXGBE_SDP1_GPIEN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), SDP1_GPIEN) +#define IXGBE_SDP2_GPIEN_BY_MAC(_hw) IXGBE_BY_MAC((_hw), SDP2_GPIEN) + #define IXGBE_GPIE_MSIX_MODE 0x00000010 /* MSI-X mode */ #define IXGBE_GPIE_OCD 0x00000020 /* Other Clear Disable */ #define IXGBE_GPIE_EIMEN 0x00000040 /* Immediate Interrupt Enable */ @@ -1665,15 +1762,16 @@ enum { #define IXGBE_EICR_GPI_SDP0_X540 0x02000000 /* Gen Purpose Interrupt on SDP0 */ #define IXGBE_EICR_GPI_SDP1_X540 0x04000000 /* Gen Purpose Interrupt on SDP1 */ #define IXGBE_EICR_GPI_SDP2_X540 0x08000000 /* Gen Purpose Interrupt on SDP2 */ -#define IXGBE_EICR_GPI_SDP0_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_EICR_GPI_SDP0_X540 : \ - IXGBE_EICR_GPI_SDP0) -#define IXGBE_EICR_GPI_SDP1_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_EICR_GPI_SDP1_X540 : \ - IXGBE_EICR_GPI_SDP1) -#define IXGBE_EICR_GPI_SDP2_BY_MAC(_hw) ((_hw)->mac.type >= ixgbe_mac_X540 ? \ - IXGBE_EICR_GPI_SDP2_X540 : \ - IXGBE_EICR_GPI_SDP2) +#define IXGBE_EICR_GPI_SDP0_X550 IXGBE_EICR_GPI_SDP0_X540 +#define IXGBE_EICR_GPI_SDP1_X550 IXGBE_EICR_GPI_SDP1_X540 +#define IXGBE_EICR_GPI_SDP2_X550 IXGBE_EICR_GPI_SDP2_X540 +#define IXGBE_EICR_GPI_SDP0_X550EM_x IXGBE_EICR_GPI_SDP0_X540 +#define IXGBE_EICR_GPI_SDP1_X550EM_x IXGBE_EICR_GPI_SDP1_X540 +#define IXGBE_EICR_GPI_SDP2_X550EM_x IXGBE_EICR_GPI_SDP2_X540 +#define IXGBE_EICR_GPI_SDP0_BY_MAC(_hw) IXGBE_BY_MAC((_hw), EICR_GPI_SDP0) +#define IXGBE_EICR_GPI_SDP1_BY_MAC(_hw) IXGBE_BY_MAC((_hw), EICR_GPI_SDP1) +#define IXGBE_EICR_GPI_SDP2_BY_MAC(_hw) IXGBE_BY_MAC((_hw), EICR_GPI_SDP2) + #define IXGBE_EICR_PBUR 0x10000000 /* Packet Buffer Handler Error */ #define IXGBE_EICR_DHER 0x20000000 /* Descriptor Handler Error */ #define IXGBE_EICR_TCP_TIMER 0x40000000 /* TCP Timer */ @@ -1901,6 +1999,9 @@ enum { #define IXGBE_LED_IVRT(_i) IXGBE_LED_OFFSET(IXGBE_LED_IVRT_BASE, _i) #define IXGBE_LED_BLINK(_i) IXGBE_LED_OFFSET(IXGBE_LED_BLINK_BASE, _i) #define IXGBE_LED_MODE_MASK(_i) IXGBE_LED_OFFSET(IXGBE_LED_MODE_MASK_BASE, _i) +#define IXGBE_X557_LED_MANUAL_SET_MASK (1 << 8) +#define IXGBE_X557_MAX_LED_INDEX 3 +#define IXGBE_X557_LED_PROVISIONING 0xC430 /* LED modes */ #define IXGBE_LED_LINK_UP 0x0 @@ -2784,6 +2885,7 @@ enum ixgbe_fdir_pballoc_type { #define IXGBE_HI_FLASH_ERASE_TIMEOUT 1000 /* Process Erase command limit */ #define IXGBE_HI_FLASH_UPDATE_TIMEOUT 5000 /* Process Update command limit */ #define IXGBE_HI_FLASH_APPLY_TIMEOUT 0 /* Process Apply command limit */ +#define IXGBE_HI_PHY_MGMT_REQ_TIMEOUT 2000 /* Wait up to 2 seconds */ /* CEM Support */ #define FW_CEM_HDR_LEN 0x4 @@ -2804,6 +2906,7 @@ enum ixgbe_fdir_pballoc_type { #define FW_MAX_READ_BUFFER_SIZE 1024 #define FW_DISABLE_RXEN_CMD 0xDE #define FW_DISABLE_RXEN_LEN 0x1 +#define FW_PHY_MGMT_REQ_CMD 0x20 /* Host Interface Command Structures */ struct ixgbe_hic_hdr { @@ -3200,6 +3303,36 @@ union ixgbe_atr_hash_dword { }; +#define IXGBE_MVALS_INIT(m) \ + IXGBE_CAT(EEC, m), \ + IXGBE_CAT(FLA, m), \ + IXGBE_CAT(GRC, m), \ + IXGBE_CAT(SRAMREL, m), \ + IXGBE_CAT(FACTPS, m), \ + IXGBE_CAT(SWSM, m), \ + IXGBE_CAT(FWSM, m), \ + IXGBE_CAT(SDP0_GPIEN, m), \ + IXGBE_CAT(SDP1_GPIEN, m), \ + IXGBE_CAT(SDP2_GPIEN, m), \ + IXGBE_CAT(EICR_GPI_SDP0, m), \ + IXGBE_CAT(EICR_GPI_SDP1, m), \ + IXGBE_CAT(EICR_GPI_SDP2, m), \ + IXGBE_CAT(CIAA, m), \ + IXGBE_CAT(CIAD, m), \ + IXGBE_CAT(I2C_CLK_IN, m), \ + IXGBE_CAT(I2C_CLK_OUT, m), \ + IXGBE_CAT(I2C_DATA_IN, m), \ + IXGBE_CAT(I2C_DATA_OUT, m), \ + IXGBE_CAT(I2C_DATA_OE_N_EN, m), \ + IXGBE_CAT(I2C_BB_EN, m), \ + IXGBE_CAT(I2C_CLK_OE_N_EN, m), \ + IXGBE_CAT(I2CCTL, m) + +enum ixgbe_mvals { + IXGBE_MVALS_INIT(_IDX), + IXGBE_MVALS_IDX_LIMIT +}; + /* * Unavailable: The FCoE Boot Option ROM is not present in the flash. * Disabled: Present; boot order is not set for any targets on the port. @@ -3225,17 +3358,10 @@ enum ixgbe_mac_type { ixgbe_mac_82599_vf, ixgbe_mac_X540, ixgbe_mac_X540_vf, - /* - * X550EM MAC type decoder: - * ixgbe_mac_X550EM_x: "x" = Xeon - * ixgbe_mac_X550EM_a: "a" = Atom - */ ixgbe_mac_X550, ixgbe_mac_X550EM_x, - ixgbe_mac_X550EM_a, ixgbe_mac_X550_vf, ixgbe_mac_X550EM_x_vf, - ixgbe_mac_X550EM_a_vf, ixgbe_num_macs }; @@ -3294,6 +3420,8 @@ enum ixgbe_sfp_type { ixgbe_sfp_type_1g_cu_core1 = 10, ixgbe_sfp_type_1g_sx_core0 = 11, ixgbe_sfp_type_1g_sx_core1 = 12, + ixgbe_sfp_type_1g_lx_core0 = 13, + ixgbe_sfp_type_1g_lx_core1 = 14, ixgbe_sfp_type_not_present = 0xFFFE, ixgbe_sfp_type_unknown = 0xFFFF }; @@ -3611,6 +3739,15 @@ struct ixgbe_phy_operations { s32 (*check_overtemp)(struct ixgbe_hw *); s32 (*set_phy_power)(struct ixgbe_hw *, bool on); s32 (*enter_lplu)(struct ixgbe_hw *); + s32 (*handle_lasi)(struct ixgbe_hw *hw); + s32 (*read_i2c_combined_unlocked)(struct ixgbe_hw *, u8 addr, u16 reg, + u16 *value); + s32 (*write_i2c_combined_unlocked)(struct ixgbe_hw *, u8 addr, u16 reg, + u16 value); + s32 (*read_i2c_byte_unlocked)(struct ixgbe_hw *, u8 offset, u8 addr, + u8 *value); + s32 (*write_i2c_byte_unlocked)(struct ixgbe_hw *, u8 offset, u8 addr, + u8 value); }; struct ixgbe_eeprom_info { @@ -3674,6 +3811,7 @@ struct ixgbe_phy_info { bool multispeed_fiber; bool reset_if_overtemp; bool qsfp_shared_i2c_bus; + u32 nw_mng_if_sel; }; #include "ixgbe_mbx.h" @@ -3717,6 +3855,7 @@ struct ixgbe_hw { struct ixgbe_eeprom_info eeprom; struct ixgbe_bus_info bus; struct ixgbe_mbx_info mbx; + const u32 *mvals; u16 device_id; u16 vendor_id; u16 subsystem_device_id; @@ -3775,6 +3914,10 @@ struct ixgbe_hw { #define IXGBE_NOT_IMPLEMENTED 0x7FFFFFFF +#define IXGBE_FUSES0_GROUP(_i) (0x11158 + ((_i) * 4)) +#define IXGBE_FUSES0_300MHZ (1 << 5) +#define IXGBE_FUSES0_REV1 (1 << 6) + #define IXGBE_KRM_PORT_CAR_GEN_CTRL(P) ((P == 0) ? (0x4010) : (0x8010)) #define IXGBE_KRM_LINK_CTRL_1(P) ((P == 0) ? (0x420C) : (0x820C)) #define IXGBE_KRM_AN_CNTL_1(P) ((P == 0) ? (0x422C) : (0x822C)) @@ -3842,8 +3985,10 @@ struct ixgbe_hw { #define IXGBE_SB_IOSF_CTRL_BUSY_SHIFT 31 #define IXGBE_SB_IOSF_CTRL_BUSY (1 << IXGBE_SB_IOSF_CTRL_BUSY_SHIFT) #define IXGBE_SB_IOSF_TARGET_KR_PHY 0 -#define IXGBE_SB_IOSF_TARGET_KX4_UNIPHY 1 -#define IXGBE_SB_IOSF_TARGET_KX4_PCS0 2 -#define IXGBE_SB_IOSF_TARGET_KX4_PCS1 3 +#define IXGBE_SB_IOSF_TARGET_KX4_PHY 1 +#define IXGBE_SB_IOSF_TARGET_KX4_PCS 2 + +#define IXGBE_NW_MNG_IF_SEL 0x00011178 +#define IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE (1 << 24) #endif /* _IXGBE_TYPE_H_ */ diff --git a/sys/dev/ixgbe/ixgbe_vf.c b/sys/dev/ixgbe/ixgbe_vf.c index 964514c..c010cf4 100644 --- a/sys/dev/ixgbe/ixgbe_vf.c +++ b/sys/dev/ixgbe/ixgbe_vf.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -669,4 +669,3 @@ int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, UNREFERENCED_3PARAMETER(hw, num_tcs, default_tc); return IXGBE_SUCCESS; } - diff --git a/sys/dev/ixgbe/ixgbe_vf.h b/sys/dev/ixgbe/ixgbe_vf.h index 8f4d46e..edc8013 100644 --- a/sys/dev/ixgbe/ixgbe_vf.h +++ b/sys/dev/ixgbe/ixgbe_vf.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_x540.c b/sys/dev/ixgbe/ixgbe_x540.c index 93c6ca4..ddf0674 100644 --- a/sys/dev/ixgbe/ixgbe_x540.c +++ b/sys/dev/ixgbe/ixgbe_x540.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -932,14 +932,14 @@ static void ixgbe_release_swfw_sync_semaphore(struct ixgbe_hw *hw) /* Release both semaphores by writing 0 to the bits REGSMP and SMBI */ - swsm = IXGBE_READ_REG(hw, IXGBE_SWSM); - swsm &= ~IXGBE_SWSM_SMBI; - IXGBE_WRITE_REG(hw, IXGBE_SWSM, swsm); - swsm = IXGBE_READ_REG(hw, IXGBE_SWFW_SYNC); swsm &= ~IXGBE_SWFW_REGSMP; IXGBE_WRITE_REG(hw, IXGBE_SWFW_SYNC, swsm); + swsm = IXGBE_READ_REG(hw, IXGBE_SWSM); + swsm &= ~IXGBE_SWSM_SMBI; + IXGBE_WRITE_REG(hw, IXGBE_SWSM, swsm); + IXGBE_WRITE_FLUSH(hw); } @@ -1011,5 +1011,3 @@ s32 ixgbe_blink_led_stop_X540(struct ixgbe_hw *hw, u32 index) return IXGBE_SUCCESS; } - - diff --git a/sys/dev/ixgbe/ixgbe_x540.h b/sys/dev/ixgbe/ixgbe_x540.h index 12da827..efd0d41 100644 --- a/sys/dev/ixgbe/ixgbe_x540.h +++ b/sys/dev/ixgbe/ixgbe_x540.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/ixgbe/ixgbe_x550.c b/sys/dev/ixgbe/ixgbe_x550.c new file mode 100644 index 0000000..65daa17 --- /dev/null +++ b/sys/dev/ixgbe/ixgbe_x550.c @@ -0,0 +1,3191 @@ +/****************************************************************************** + + Copyright (c) 2001-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD$*/ + +#include "ixgbe_x550.h" +#include "ixgbe_x540.h" +#include "ixgbe_type.h" +#include "ixgbe_api.h" +#include "ixgbe_common.h" +#include "ixgbe_phy.h" + +static s32 ixgbe_setup_ixfi_x550em(struct ixgbe_hw *hw, ixgbe_link_speed *speed); + +/** + * ixgbe_init_ops_X550 - Inits func ptrs and MAC type + * @hw: pointer to hardware structure + * + * Initialize the function pointers and assign the MAC type for X550. + * Does not touch the hardware. + **/ +s32 ixgbe_init_ops_X550(struct ixgbe_hw *hw) +{ + struct ixgbe_mac_info *mac = &hw->mac; + struct ixgbe_eeprom_info *eeprom = &hw->eeprom; + s32 ret_val; + + DEBUGFUNC("ixgbe_init_ops_X550"); + + ret_val = ixgbe_init_ops_X540(hw); + mac->ops.dmac_config = ixgbe_dmac_config_X550; + mac->ops.dmac_config_tcs = ixgbe_dmac_config_tcs_X550; + mac->ops.dmac_update_tcs = ixgbe_dmac_update_tcs_X550; + mac->ops.setup_eee = ixgbe_setup_eee_X550; + mac->ops.set_source_address_pruning = + ixgbe_set_source_address_pruning_X550; + mac->ops.set_ethertype_anti_spoofing = + ixgbe_set_ethertype_anti_spoofing_X550; + + mac->ops.get_rtrup2tc = ixgbe_dcb_get_rtrup2tc_generic; + eeprom->ops.init_params = ixgbe_init_eeprom_params_X550; + eeprom->ops.calc_checksum = ixgbe_calc_eeprom_checksum_X550; + eeprom->ops.read = ixgbe_read_ee_hostif_X550; + eeprom->ops.read_buffer = ixgbe_read_ee_hostif_buffer_X550; + eeprom->ops.write = ixgbe_write_ee_hostif_X550; + eeprom->ops.write_buffer = ixgbe_write_ee_hostif_buffer_X550; + eeprom->ops.update_checksum = ixgbe_update_eeprom_checksum_X550; + eeprom->ops.validate_checksum = ixgbe_validate_eeprom_checksum_X550; + + mac->ops.disable_mdd = ixgbe_disable_mdd_X550; + mac->ops.enable_mdd = ixgbe_enable_mdd_X550; + mac->ops.mdd_event = ixgbe_mdd_event_X550; + mac->ops.restore_mdd_vf = ixgbe_restore_mdd_vf_X550; + mac->ops.disable_rx = ixgbe_disable_rx_x550; + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { + hw->mac.ops.led_on = ixgbe_led_on_t_X550em; + hw->mac.ops.led_off = ixgbe_led_off_t_X550em; + } + return ret_val; +} + +/** + * ixgbe_read_cs4227 - Read CS4227 register + * @hw: pointer to hardware structure + * @reg: register number to write + * @value: pointer to receive value read + * + * Returns status code + **/ +static s32 ixgbe_read_cs4227(struct ixgbe_hw *hw, u16 reg, u16 *value) +{ + return ixgbe_read_i2c_combined_unlocked(hw, IXGBE_CS4227, reg, value); +} + +/** + * ixgbe_write_cs4227 - Write CS4227 register + * @hw: pointer to hardware structure + * @reg: register number to write + * @value: value to write to register + * + * Returns status code + **/ +static s32 ixgbe_write_cs4227(struct ixgbe_hw *hw, u16 reg, u16 value) +{ + return ixgbe_write_i2c_combined_unlocked(hw, IXGBE_CS4227, reg, value); +} + +/** + * ixgbe_get_cs4227_status - Return CS4227 status + * @hw: pointer to hardware structure + * + * Returns error if CS4227 not successfully initialized + **/ +static s32 ixgbe_get_cs4227_status(struct ixgbe_hw *hw) +{ + s32 status; + u16 value = 0; + u16 reg_slice, reg_val; + u8 retry; + + for (retry = 0; retry < IXGBE_CS4227_RETRIES; ++retry) { + status = ixgbe_read_cs4227(hw, IXGBE_CS4227_GLOBAL_ID_LSB, + &value); + if (status != IXGBE_SUCCESS) + return status; + if (value == IXGBE_CS4227_GLOBAL_ID_VALUE) + break; + msec_delay(IXGBE_CS4227_CHECK_DELAY); + } + if (value != IXGBE_CS4227_GLOBAL_ID_VALUE) + return IXGBE_ERR_PHY; + + status = ixgbe_read_cs4227(hw, IXGBE_CS4227_SCRATCH, &value); + if (status != IXGBE_SUCCESS) + return status; + + /* If this is the first time after power-on, check the ucode. + * Otherwise, this will disrupt link on all ports. Because we + * can only do this the first time, we must check all ports, + * not just our own. + */ + if (value != IXGBE_CS4227_SCRATCH_VALUE) { + reg_slice = IXGBE_CS4227_LINE_SPARE24_LSB; + reg_val = (IXGBE_CS4227_EDC_MODE_CX1 << 1) | 0x1; + status = ixgbe_write_cs4227(hw, reg_slice, + reg_val); + if (status != IXGBE_SUCCESS) + return status; + + reg_slice = IXGBE_CS4227_HOST_SPARE24_LSB; + reg_val = (IXGBE_CS4227_EDC_MODE_CX1 << 1) | 0x1; + status = ixgbe_write_cs4227(hw, reg_slice, + reg_val); + if (status != IXGBE_SUCCESS) + return status; + + reg_slice = IXGBE_CS4227_LINE_SPARE24_LSB + (1 << 12); + reg_val = (IXGBE_CS4227_EDC_MODE_SR << 1) | 0x1; + status = ixgbe_write_cs4227(hw, reg_slice, + reg_val); + if (status != IXGBE_SUCCESS) + return status; + + reg_slice = IXGBE_CS4227_HOST_SPARE24_LSB + (1 << 12); + reg_val = (IXGBE_CS4227_EDC_MODE_SR << 1) | 0x1; + status = ixgbe_write_cs4227(hw, reg_slice, + reg_val); + if (status != IXGBE_SUCCESS) + return status; + + msec_delay(10); + } + + /* Verify that the ucode is operational on all ports. */ + reg_slice = IXGBE_CS4227_LINE_SPARE24_LSB; + reg_val = 0xFFFF; + status = ixgbe_read_cs4227(hw, reg_slice, ®_val); + if (status != IXGBE_SUCCESS) + return status; + if (reg_val != 0) + return IXGBE_ERR_PHY; + + reg_slice = IXGBE_CS4227_HOST_SPARE24_LSB; + reg_val = 0xFFFF; + status = ixgbe_read_cs4227(hw, reg_slice, ®_val); + if (status != IXGBE_SUCCESS) + return status; + if (reg_val != 0) + return IXGBE_ERR_PHY; + + reg_slice = IXGBE_CS4227_LINE_SPARE24_LSB + (1 << 12); + reg_val = 0xFFFF; + status = ixgbe_read_cs4227(hw, reg_slice, ®_val); + if (status != IXGBE_SUCCESS) + return status; + if (reg_val != 0) + return IXGBE_ERR_PHY; + + reg_slice = IXGBE_CS4227_HOST_SPARE24_LSB + (1 << 12); + reg_val = 0xFFFF; + status = ixgbe_read_cs4227(hw, reg_slice, ®_val); + if (status != IXGBE_SUCCESS) + return status; + if (reg_val != 0) + return IXGBE_ERR_PHY; + + /* Set scratch for next time. */ + status = ixgbe_write_cs4227(hw, IXGBE_CS4227_SCRATCH, + IXGBE_CS4227_SCRATCH_VALUE); + if (status != IXGBE_SUCCESS) + return status; + status = ixgbe_read_cs4227(hw, IXGBE_CS4227_SCRATCH, &value); + if (status != IXGBE_SUCCESS) + return status; + if (value != IXGBE_CS4227_SCRATCH_VALUE) + return IXGBE_ERR_PHY; + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_read_pe - Read register from port expander + * @hw: pointer to hardware structure + * @reg: register number to read + * @value: pointer to receive read value + * + * Returns status code + **/ +static s32 ixgbe_read_pe(struct ixgbe_hw *hw, u8 reg, u8 *value) +{ + s32 status; + + status = ixgbe_read_i2c_byte_unlocked(hw, reg, IXGBE_PE, value); + if (status != IXGBE_SUCCESS) + ERROR_REPORT2(IXGBE_ERROR_CAUTION, + "port expander access failed with %d\n", status); + return status; +} + +/** + * ixgbe_write_pe - Write register to port expander + * @hw: pointer to hardware structure + * @reg: register number to write + * @value: value to write + * + * Returns status code + **/ +static s32 ixgbe_write_pe(struct ixgbe_hw *hw, u8 reg, u8 value) +{ + s32 status; + + status = ixgbe_write_i2c_byte_unlocked(hw, reg, IXGBE_PE, value); + if (status != IXGBE_SUCCESS) + ERROR_REPORT2(IXGBE_ERROR_CAUTION, + "port expander access failed with %d\n", status); + return status; +} + +/** + * ixgbe_reset_cs4227 - Reset CS4227 using port expander + * @hw: pointer to hardware structure + * + * Returns error code + **/ +static s32 ixgbe_reset_cs4227(struct ixgbe_hw *hw) +{ + s32 status; + u8 reg; + + status = ixgbe_read_pe(hw, IXGBE_PE_OUTPUT, ®); + if (status != IXGBE_SUCCESS) + return status; + reg |= IXGBE_PE_BIT1; + status = ixgbe_write_pe(hw, IXGBE_PE_OUTPUT, reg); + if (status != IXGBE_SUCCESS) + return status; + + status = ixgbe_read_pe(hw, IXGBE_PE_CONFIG, ®); + if (status != IXGBE_SUCCESS) + return status; + reg &= ~IXGBE_PE_BIT1; + status = ixgbe_write_pe(hw, IXGBE_PE_CONFIG, reg); + if (status != IXGBE_SUCCESS) + return status; + + status = ixgbe_read_pe(hw, IXGBE_PE_OUTPUT, ®); + if (status != IXGBE_SUCCESS) + return status; + reg &= ~IXGBE_PE_BIT1; + status = ixgbe_write_pe(hw, IXGBE_PE_OUTPUT, reg); + if (status != IXGBE_SUCCESS) + return status; + + usec_delay(IXGBE_CS4227_RESET_HOLD); + + status = ixgbe_read_pe(hw, IXGBE_PE_OUTPUT, ®); + if (status != IXGBE_SUCCESS) + return status; + reg |= IXGBE_PE_BIT1; + status = ixgbe_write_pe(hw, IXGBE_PE_OUTPUT, reg); + if (status != IXGBE_SUCCESS) + return status; + + msec_delay(IXGBE_CS4227_RESET_DELAY); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_check_cs4227 - Check CS4227 and reset as needed + * @hw: pointer to hardware structure + **/ +static void ixgbe_check_cs4227(struct ixgbe_hw *hw) +{ + u32 swfw_mask = hw->phy.phy_semaphore_mask; + s32 status; + u8 retry; + + for (retry = 0; retry < IXGBE_CS4227_RETRIES; retry++) { + status = hw->mac.ops.acquire_swfw_sync(hw, swfw_mask); + if (status != IXGBE_SUCCESS) { + ERROR_REPORT2(IXGBE_ERROR_CAUTION, + "semaphore failed with %d\n", status); + return; + } + status = ixgbe_get_cs4227_status(hw); + if (status == IXGBE_SUCCESS) { + hw->mac.ops.release_swfw_sync(hw, swfw_mask); + msec_delay(hw->eeprom.semaphore_delay); + return; + } + ixgbe_reset_cs4227(hw); + hw->mac.ops.release_swfw_sync(hw, swfw_mask); + msec_delay(hw->eeprom.semaphore_delay); + } + ERROR_REPORT2(IXGBE_ERROR_CAUTION, + "Unable to initialize CS4227, err=%d\n", status); +} + +/** + * ixgbe_setup_mux_ctl - Setup ESDP register for I2C mux control + * @hw: pointer to hardware structure + **/ +static void ixgbe_setup_mux_ctl(struct ixgbe_hw *hw) +{ + u32 esdp = IXGBE_READ_REG(hw, IXGBE_ESDP); + + if (hw->bus.lan_id) { + esdp &= ~(IXGBE_ESDP_SDP1_NATIVE | IXGBE_ESDP_SDP1); + esdp |= IXGBE_ESDP_SDP1_DIR; + } + esdp &= ~(IXGBE_ESDP_SDP0_NATIVE | IXGBE_ESDP_SDP0_DIR); + IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp); + IXGBE_WRITE_FLUSH(hw); +} + +/** + * ixgbe_identify_phy_x550em - Get PHY type based on device id + * @hw: pointer to hardware structure + * + * Returns error code + */ +static s32 ixgbe_identify_phy_x550em(struct ixgbe_hw *hw) +{ + switch (hw->device_id) { + case IXGBE_DEV_ID_X550EM_X_SFP: + /* set up for CS4227 usage */ + hw->phy.phy_semaphore_mask = IXGBE_GSSR_SHARED_I2C_SM; + ixgbe_setup_mux_ctl(hw); + ixgbe_check_cs4227(hw); + + return ixgbe_identify_module_generic(hw); + break; + case IXGBE_DEV_ID_X550EM_X_KX4: + hw->phy.type = ixgbe_phy_x550em_kx4; + break; + case IXGBE_DEV_ID_X550EM_X_KR: + hw->phy.type = ixgbe_phy_x550em_kr; + break; + case IXGBE_DEV_ID_X550EM_X_1G_T: + case IXGBE_DEV_ID_X550EM_X_10G_T: + return ixgbe_identify_phy_generic(hw); + default: + break; + } + return IXGBE_SUCCESS; +} + +static s32 ixgbe_read_phy_reg_x550em(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u16 *phy_data) +{ + UNREFERENCED_4PARAMETER(*hw, reg_addr, device_type, *phy_data); + return IXGBE_NOT_IMPLEMENTED; +} + +static s32 ixgbe_write_phy_reg_x550em(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u16 phy_data) +{ + UNREFERENCED_4PARAMETER(*hw, reg_addr, device_type, phy_data); + return IXGBE_NOT_IMPLEMENTED; +} + +/** +* ixgbe_init_ops_X550EM - Inits func ptrs and MAC type +* @hw: pointer to hardware structure +* +* Initialize the function pointers and for MAC type X550EM. +* Does not touch the hardware. +**/ +s32 ixgbe_init_ops_X550EM(struct ixgbe_hw *hw) +{ + struct ixgbe_mac_info *mac = &hw->mac; + struct ixgbe_eeprom_info *eeprom = &hw->eeprom; + struct ixgbe_phy_info *phy = &hw->phy; + s32 ret_val; + + DEBUGFUNC("ixgbe_init_ops_X550EM"); + + /* Similar to X550 so start there. */ + ret_val = ixgbe_init_ops_X550(hw); + + /* Since this function eventually calls + * ixgbe_init_ops_540 by design, we are setting + * the pointers to NULL explicitly here to overwrite + * the values being set in the x540 function. + */ + + /* FCOE not supported in x550EM */ + mac->ops.get_san_mac_addr = NULL; + mac->ops.set_san_mac_addr = NULL; + mac->ops.get_wwn_prefix = NULL; + mac->ops.get_fcoe_boot_status = NULL; + + /* IPsec not supported in x550EM */ + mac->ops.disable_sec_rx_path = NULL; + mac->ops.enable_sec_rx_path = NULL; + + /* AUTOC register is not present in x550EM. */ + mac->ops.prot_autoc_read = NULL; + mac->ops.prot_autoc_write = NULL; + + /* X550EM bus type is internal*/ + hw->bus.type = ixgbe_bus_type_internal; + mac->ops.get_bus_info = ixgbe_get_bus_info_X550em; + + mac->ops.read_iosf_sb_reg = ixgbe_read_iosf_sb_reg_x550; + mac->ops.write_iosf_sb_reg = ixgbe_write_iosf_sb_reg_x550; + mac->ops.get_media_type = ixgbe_get_media_type_X550em; + mac->ops.setup_sfp = ixgbe_setup_sfp_modules_X550em; + mac->ops.get_link_capabilities = ixgbe_get_link_capabilities_X550em; + mac->ops.reset_hw = ixgbe_reset_hw_X550em; + mac->ops.get_supported_physical_layer = + ixgbe_get_supported_physical_layer_X550em; + + if (mac->ops.get_media_type(hw) == ixgbe_media_type_copper) + mac->ops.setup_fc = ixgbe_setup_fc_generic; + else + mac->ops.setup_fc = ixgbe_setup_fc_X550em; + + mac->ops.acquire_swfw_sync = ixgbe_acquire_swfw_sync_X550em; + mac->ops.release_swfw_sync = ixgbe_release_swfw_sync_X550em; + + if (hw->device_id != IXGBE_DEV_ID_X550EM_X_KR) + mac->ops.setup_eee = NULL; + + /* PHY */ + phy->ops.init = ixgbe_init_phy_ops_X550em; + phy->ops.identify = ixgbe_identify_phy_x550em; + if (mac->ops.get_media_type(hw) != ixgbe_media_type_copper) + phy->ops.set_phy_power = NULL; + + + /* EEPROM */ + eeprom->ops.init_params = ixgbe_init_eeprom_params_X540; + eeprom->ops.read = ixgbe_read_ee_hostif_X550; + eeprom->ops.read_buffer = ixgbe_read_ee_hostif_buffer_X550; + eeprom->ops.write = ixgbe_write_ee_hostif_X550; + eeprom->ops.write_buffer = ixgbe_write_ee_hostif_buffer_X550; + eeprom->ops.update_checksum = ixgbe_update_eeprom_checksum_X550; + eeprom->ops.validate_checksum = ixgbe_validate_eeprom_checksum_X550; + eeprom->ops.calc_checksum = ixgbe_calc_eeprom_checksum_X550; + + return ret_val; +} + +/** + * ixgbe_dmac_config_X550 + * @hw: pointer to hardware structure + * + * Configure DMA coalescing. If enabling dmac, dmac is activated. + * When disabling dmac, dmac enable dmac bit is cleared. + **/ +s32 ixgbe_dmac_config_X550(struct ixgbe_hw *hw) +{ + u32 reg, high_pri_tc; + + DEBUGFUNC("ixgbe_dmac_config_X550"); + + /* Disable DMA coalescing before configuring */ + reg = IXGBE_READ_REG(hw, IXGBE_DMACR); + reg &= ~IXGBE_DMACR_DMAC_EN; + IXGBE_WRITE_REG(hw, IXGBE_DMACR, reg); + + /* Disable DMA Coalescing if the watchdog timer is 0 */ + if (!hw->mac.dmac_config.watchdog_timer) + goto out; + + ixgbe_dmac_config_tcs_X550(hw); + + /* Configure DMA Coalescing Control Register */ + reg = IXGBE_READ_REG(hw, IXGBE_DMACR); + + /* Set the watchdog timer in units of 40.96 usec */ + reg &= ~IXGBE_DMACR_DMACWT_MASK; + reg |= (hw->mac.dmac_config.watchdog_timer * 100) / 4096; + + reg &= ~IXGBE_DMACR_HIGH_PRI_TC_MASK; + /* If fcoe is enabled, set high priority traffic class */ + if (hw->mac.dmac_config.fcoe_en) { + high_pri_tc = 1 << hw->mac.dmac_config.fcoe_tc; + reg |= ((high_pri_tc << IXGBE_DMACR_HIGH_PRI_TC_SHIFT) & + IXGBE_DMACR_HIGH_PRI_TC_MASK); + } + reg |= IXGBE_DMACR_EN_MNG_IND; + + /* Enable DMA coalescing after configuration */ + reg |= IXGBE_DMACR_DMAC_EN; + IXGBE_WRITE_REG(hw, IXGBE_DMACR, reg); + +out: + return IXGBE_SUCCESS; +} + +/** + * ixgbe_dmac_config_tcs_X550 + * @hw: pointer to hardware structure + * + * Configure DMA coalescing threshold per TC. The dmac enable bit must + * be cleared before configuring. + **/ +s32 ixgbe_dmac_config_tcs_X550(struct ixgbe_hw *hw) +{ + u32 tc, reg, pb_headroom, rx_pb_size, maxframe_size_kb; + + DEBUGFUNC("ixgbe_dmac_config_tcs_X550"); + + /* Configure DMA coalescing enabled */ + switch (hw->mac.dmac_config.link_speed) { + case IXGBE_LINK_SPEED_100_FULL: + pb_headroom = IXGBE_DMACRXT_100M; + break; + case IXGBE_LINK_SPEED_1GB_FULL: + pb_headroom = IXGBE_DMACRXT_1G; + break; + default: + pb_headroom = IXGBE_DMACRXT_10G; + break; + } + + maxframe_size_kb = ((IXGBE_READ_REG(hw, IXGBE_MAXFRS) >> + IXGBE_MHADD_MFS_SHIFT) / 1024); + + /* Set the per Rx packet buffer receive threshold */ + for (tc = 0; tc < IXGBE_DCB_MAX_TRAFFIC_CLASS; tc++) { + reg = IXGBE_READ_REG(hw, IXGBE_DMCTH(tc)); + reg &= ~IXGBE_DMCTH_DMACRXT_MASK; + + if (tc < hw->mac.dmac_config.num_tcs) { + /* Get Rx PB size */ + rx_pb_size = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(tc)); + rx_pb_size = (rx_pb_size & IXGBE_RXPBSIZE_MASK) >> + IXGBE_RXPBSIZE_SHIFT; + + /* Calculate receive buffer threshold in kilobytes */ + if (rx_pb_size > pb_headroom) + rx_pb_size = rx_pb_size - pb_headroom; + else + rx_pb_size = 0; + + /* Minimum of MFS shall be set for DMCTH */ + reg |= (rx_pb_size > maxframe_size_kb) ? + rx_pb_size : maxframe_size_kb; + } + IXGBE_WRITE_REG(hw, IXGBE_DMCTH(tc), reg); + } + return IXGBE_SUCCESS; +} + +/** + * ixgbe_dmac_update_tcs_X550 + * @hw: pointer to hardware structure + * + * Disables dmac, updates per TC settings, and then enables dmac. + **/ +s32 ixgbe_dmac_update_tcs_X550(struct ixgbe_hw *hw) +{ + u32 reg; + + DEBUGFUNC("ixgbe_dmac_update_tcs_X550"); + + /* Disable DMA coalescing before configuring */ + reg = IXGBE_READ_REG(hw, IXGBE_DMACR); + reg &= ~IXGBE_DMACR_DMAC_EN; + IXGBE_WRITE_REG(hw, IXGBE_DMACR, reg); + + ixgbe_dmac_config_tcs_X550(hw); + + /* Enable DMA coalescing after configuration */ + reg = IXGBE_READ_REG(hw, IXGBE_DMACR); + reg |= IXGBE_DMACR_DMAC_EN; + IXGBE_WRITE_REG(hw, IXGBE_DMACR, reg); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_init_eeprom_params_X550 - Initialize EEPROM params + * @hw: pointer to hardware structure + * + * Initializes the EEPROM parameters ixgbe_eeprom_info within the + * ixgbe_hw struct in order to set up EEPROM access. + **/ +s32 ixgbe_init_eeprom_params_X550(struct ixgbe_hw *hw) +{ + struct ixgbe_eeprom_info *eeprom = &hw->eeprom; + u32 eec; + u16 eeprom_size; + + DEBUGFUNC("ixgbe_init_eeprom_params_X550"); + + if (eeprom->type == ixgbe_eeprom_uninitialized) { + eeprom->semaphore_delay = 10; + eeprom->type = ixgbe_flash; + + eec = IXGBE_READ_REG(hw, IXGBE_EEC); + eeprom_size = (u16)((eec & IXGBE_EEC_SIZE) >> + IXGBE_EEC_SIZE_SHIFT); + eeprom->word_size = 1 << (eeprom_size + + IXGBE_EEPROM_WORD_SIZE_SHIFT); + + DEBUGOUT2("Eeprom params: type = %d, size = %d\n", + eeprom->type, eeprom->word_size); + } + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_setup_eee_X550 - Enable/disable EEE support + * @hw: pointer to the HW structure + * @enable_eee: boolean flag to enable EEE + * + * Enable/disable EEE based on enable_eee flag. + * Auto-negotiation must be started after BASE-T EEE bits in PHY register 7.3C + * are modified. + * + **/ +s32 ixgbe_setup_eee_X550(struct ixgbe_hw *hw, bool enable_eee) +{ + u32 eeer; + u16 autoneg_eee_reg; + u32 link_reg; + s32 status; + u32 fuse; + + DEBUGFUNC("ixgbe_setup_eee_X550"); + + eeer = IXGBE_READ_REG(hw, IXGBE_EEER); + /* Enable or disable EEE per flag */ + if (enable_eee) { + eeer |= (IXGBE_EEER_TX_LPI_EN | IXGBE_EEER_RX_LPI_EN); + + if (hw->device_id == IXGBE_DEV_ID_X550T) { + /* Advertise EEE capability */ + hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_EEE_ADVT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, &autoneg_eee_reg); + + autoneg_eee_reg |= (IXGBE_AUTO_NEG_10GBASE_EEE_ADVT | + IXGBE_AUTO_NEG_1000BASE_EEE_ADVT | + IXGBE_AUTO_NEG_100BASE_EEE_ADVT); + + hw->phy.ops.write_reg(hw, IXGBE_MDIO_AUTO_NEG_EEE_ADVT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, autoneg_eee_reg); + } else if (hw->device_id == IXGBE_DEV_ID_X550EM_X_KR) { + /* Not supported on first revision. */ + fuse = IXGBE_READ_REG(hw, IXGBE_FUSES0_GROUP(0)); + if (!(fuse & IXGBE_FUSES0_REV1)) + return IXGBE_SUCCESS; + + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, &link_reg); + if (status != IXGBE_SUCCESS) + return status; + + link_reg |= IXGBE_KRM_LINK_CTRL_1_TETH_EEE_CAP_KR | + IXGBE_KRM_LINK_CTRL_1_TETH_EEE_CAP_KX; + + /* Don't advertise FEC capability when EEE enabled. */ + link_reg &= ~IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_FEC; + + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, link_reg); + if (status != IXGBE_SUCCESS) + return status; + } + } else { + eeer &= ~(IXGBE_EEER_TX_LPI_EN | IXGBE_EEER_RX_LPI_EN); + + if (hw->device_id == IXGBE_DEV_ID_X550T) { + /* Disable advertised EEE capability */ + hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_EEE_ADVT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, &autoneg_eee_reg); + + autoneg_eee_reg &= ~(IXGBE_AUTO_NEG_10GBASE_EEE_ADVT | + IXGBE_AUTO_NEG_1000BASE_EEE_ADVT | + IXGBE_AUTO_NEG_100BASE_EEE_ADVT); + + hw->phy.ops.write_reg(hw, IXGBE_MDIO_AUTO_NEG_EEE_ADVT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, autoneg_eee_reg); + } else if (hw->device_id == IXGBE_DEV_ID_X550EM_X_KR) { + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, &link_reg); + if (status != IXGBE_SUCCESS) + return status; + + link_reg &= ~(IXGBE_KRM_LINK_CTRL_1_TETH_EEE_CAP_KR | + IXGBE_KRM_LINK_CTRL_1_TETH_EEE_CAP_KX); + + /* Advertise FEC capability when EEE is disabled. */ + link_reg |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_FEC; + + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, link_reg); + if (status != IXGBE_SUCCESS) + return status; + } + } + IXGBE_WRITE_REG(hw, IXGBE_EEER, eeer); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_set_source_address_pruning_X550 - Enable/Disbale source address pruning + * @hw: pointer to hardware structure + * @enable: enable or disable source address pruning + * @pool: Rx pool to set source address pruning for + **/ +void ixgbe_set_source_address_pruning_X550(struct ixgbe_hw *hw, bool enable, + unsigned int pool) +{ + u64 pfflp; + + /* max rx pool is 63 */ + if (pool > 63) + return; + + pfflp = (u64)IXGBE_READ_REG(hw, IXGBE_PFFLPL); + pfflp |= (u64)IXGBE_READ_REG(hw, IXGBE_PFFLPH) << 32; + + if (enable) + pfflp |= (1ULL << pool); + else + pfflp &= ~(1ULL << pool); + + IXGBE_WRITE_REG(hw, IXGBE_PFFLPL, (u32)pfflp); + IXGBE_WRITE_REG(hw, IXGBE_PFFLPH, (u32)(pfflp >> 32)); +} + +/** + * ixgbe_set_ethertype_anti_spoofing_X550 - Enable/Disable Ethertype anti-spoofing + * @hw: pointer to hardware structure + * @enable: enable or disable switch for Ethertype anti-spoofing + * @vf: Virtual Function pool - VF Pool to set for Ethertype anti-spoofing + * + **/ +void ixgbe_set_ethertype_anti_spoofing_X550(struct ixgbe_hw *hw, + bool enable, int vf) +{ + int vf_target_reg = vf >> 3; + int vf_target_shift = vf % 8 + IXGBE_SPOOF_ETHERTYPEAS_SHIFT; + u32 pfvfspoof; + + DEBUGFUNC("ixgbe_set_ethertype_anti_spoofing_X550"); + + pfvfspoof = IXGBE_READ_REG(hw, IXGBE_PFVFSPOOF(vf_target_reg)); + if (enable) + pfvfspoof |= (1 << vf_target_shift); + else + pfvfspoof &= ~(1 << vf_target_shift); + + IXGBE_WRITE_REG(hw, IXGBE_PFVFSPOOF(vf_target_reg), pfvfspoof); +} + +/** + * ixgbe_iosf_wait - Wait for IOSF command completion + * @hw: pointer to hardware structure + * @ctrl: pointer to location to receive final IOSF control value + * + * Returns failing status on timeout + * + * Note: ctrl can be NULL if the IOSF control register value is not needed + **/ +static s32 ixgbe_iosf_wait(struct ixgbe_hw *hw, u32 *ctrl) +{ + u32 i, command; + + /* Check every 10 usec to see if the address cycle completed. + * The SB IOSF BUSY bit will clear when the operation is + * complete + */ + for (i = 0; i < IXGBE_MDIO_COMMAND_TIMEOUT; i++) { + command = IXGBE_READ_REG(hw, IXGBE_SB_IOSF_INDIRECT_CTRL); + if ((command & IXGBE_SB_IOSF_CTRL_BUSY) == 0) + break; + usec_delay(10); + } + if (ctrl) + *ctrl = command; + if (i == IXGBE_MDIO_COMMAND_TIMEOUT) { + ERROR_REPORT1(IXGBE_ERROR_POLLING, "Wait timed out\n"); + return IXGBE_ERR_PHY; + } + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_write_iosf_sb_reg_x550 - Writes a value to specified register of the IOSF + * device + * @hw: pointer to hardware structure + * @reg_addr: 32 bit PHY register to write + * @device_type: 3 bit device type + * @data: Data to write to the register + **/ +s32 ixgbe_write_iosf_sb_reg_x550(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u32 data) +{ + u32 gssr = IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_PHY0_SM; + u32 command, error; + s32 ret; + + ret = ixgbe_acquire_swfw_semaphore(hw, gssr); + if (ret != IXGBE_SUCCESS) + return ret; + + ret = ixgbe_iosf_wait(hw, NULL); + if (ret != IXGBE_SUCCESS) + goto out; + + command = ((reg_addr << IXGBE_SB_IOSF_CTRL_ADDR_SHIFT) | + (device_type << IXGBE_SB_IOSF_CTRL_TARGET_SELECT_SHIFT)); + + /* Write IOSF control register */ + IXGBE_WRITE_REG(hw, IXGBE_SB_IOSF_INDIRECT_CTRL, command); + + /* Write IOSF data register */ + IXGBE_WRITE_REG(hw, IXGBE_SB_IOSF_INDIRECT_DATA, data); + + ret = ixgbe_iosf_wait(hw, &command); + + if ((command & IXGBE_SB_IOSF_CTRL_RESP_STAT_MASK) != 0) { + error = (command & IXGBE_SB_IOSF_CTRL_CMPL_ERR_MASK) >> + IXGBE_SB_IOSF_CTRL_CMPL_ERR_SHIFT; + ERROR_REPORT2(IXGBE_ERROR_POLLING, + "Failed to write, error %x\n", error); + ret = IXGBE_ERR_PHY; + } + +out: + ixgbe_release_swfw_semaphore(hw, gssr); + return ret; +} + +/** + * ixgbe_read_iosf_sb_reg_x550 - Writes a value to specified register of the IOSF + * device + * @hw: pointer to hardware structure + * @reg_addr: 32 bit PHY register to write + * @device_type: 3 bit device type + * @phy_data: Pointer to read data from the register + **/ +s32 ixgbe_read_iosf_sb_reg_x550(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u32 *data) +{ + u32 gssr = IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_PHY0_SM; + u32 command, error; + s32 ret; + + ret = ixgbe_acquire_swfw_semaphore(hw, gssr); + if (ret != IXGBE_SUCCESS) + return ret; + + ret = ixgbe_iosf_wait(hw, NULL); + if (ret != IXGBE_SUCCESS) + goto out; + + command = ((reg_addr << IXGBE_SB_IOSF_CTRL_ADDR_SHIFT) | + (device_type << IXGBE_SB_IOSF_CTRL_TARGET_SELECT_SHIFT)); + + /* Write IOSF control register */ + IXGBE_WRITE_REG(hw, IXGBE_SB_IOSF_INDIRECT_CTRL, command); + + ret = ixgbe_iosf_wait(hw, &command); + + if ((command & IXGBE_SB_IOSF_CTRL_RESP_STAT_MASK) != 0) { + error = (command & IXGBE_SB_IOSF_CTRL_CMPL_ERR_MASK) >> + IXGBE_SB_IOSF_CTRL_CMPL_ERR_SHIFT; + ERROR_REPORT2(IXGBE_ERROR_POLLING, + "Failed to read, error %x\n", error); + ret = IXGBE_ERR_PHY; + } + + if (ret == IXGBE_SUCCESS) + *data = IXGBE_READ_REG(hw, IXGBE_SB_IOSF_INDIRECT_DATA); + +out: + ixgbe_release_swfw_semaphore(hw, gssr); + return ret; +} + +/** + * ixgbe_disable_mdd_X550 + * @hw: pointer to hardware structure + * + * Disable malicious driver detection + **/ +void ixgbe_disable_mdd_X550(struct ixgbe_hw *hw) +{ + u32 reg; + + DEBUGFUNC("ixgbe_disable_mdd_X550"); + + /* Disable MDD for TX DMA and interrupt */ + reg = IXGBE_READ_REG(hw, IXGBE_DMATXCTL); + reg &= ~(IXGBE_DMATXCTL_MDP_EN | IXGBE_DMATXCTL_MBINTEN); + IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, reg); + + /* Disable MDD for RX and interrupt */ + reg = IXGBE_READ_REG(hw, IXGBE_RDRXCTL); + reg &= ~(IXGBE_RDRXCTL_MDP_EN | IXGBE_RDRXCTL_MBINTEN); + IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg); +} + +/** + * ixgbe_enable_mdd_X550 + * @hw: pointer to hardware structure + * + * Enable malicious driver detection + **/ +void ixgbe_enable_mdd_X550(struct ixgbe_hw *hw) +{ + u32 reg; + + DEBUGFUNC("ixgbe_enable_mdd_X550"); + + /* Enable MDD for TX DMA and interrupt */ + reg = IXGBE_READ_REG(hw, IXGBE_DMATXCTL); + reg |= (IXGBE_DMATXCTL_MDP_EN | IXGBE_DMATXCTL_MBINTEN); + IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, reg); + + /* Enable MDD for RX and interrupt */ + reg = IXGBE_READ_REG(hw, IXGBE_RDRXCTL); + reg |= (IXGBE_RDRXCTL_MDP_EN | IXGBE_RDRXCTL_MBINTEN); + IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg); +} + +/** + * ixgbe_restore_mdd_vf_X550 + * @hw: pointer to hardware structure + * @vf: vf index + * + * Restore VF that was disabled during malicious driver detection event + **/ +void ixgbe_restore_mdd_vf_X550(struct ixgbe_hw *hw, u32 vf) +{ + u32 idx, reg, num_qs, start_q, bitmask; + + DEBUGFUNC("ixgbe_restore_mdd_vf_X550"); + + /* Map VF to queues */ + reg = IXGBE_READ_REG(hw, IXGBE_MRQC); + switch (reg & IXGBE_MRQC_MRQE_MASK) { + case IXGBE_MRQC_VMDQRT8TCEN: + num_qs = 8; /* 16 VFs / pools */ + bitmask = 0x000000FF; + break; + case IXGBE_MRQC_VMDQRSS32EN: + case IXGBE_MRQC_VMDQRT4TCEN: + num_qs = 4; /* 32 VFs / pools */ + bitmask = 0x0000000F; + break; + default: /* 64 VFs / pools */ + num_qs = 2; + bitmask = 0x00000003; + break; + } + start_q = vf * num_qs; + + /* Release vf's queues by clearing WQBR_TX and WQBR_RX (RW1C) */ + idx = start_q / 32; + reg = 0; + reg |= (bitmask << (start_q % 32)); + IXGBE_WRITE_REG(hw, IXGBE_WQBR_TX(idx), reg); + IXGBE_WRITE_REG(hw, IXGBE_WQBR_RX(idx), reg); +} + +/** + * ixgbe_mdd_event_X550 + * @hw: pointer to hardware structure + * @vf_bitmap: vf bitmap of malicious vfs + * + * Handle malicious driver detection event. + **/ +void ixgbe_mdd_event_X550(struct ixgbe_hw *hw, u32 *vf_bitmap) +{ + u32 wqbr; + u32 i, j, reg, q, shift, vf, idx; + + DEBUGFUNC("ixgbe_mdd_event_X550"); + + /* figure out pool size for mapping to vf's */ + reg = IXGBE_READ_REG(hw, IXGBE_MRQC); + switch (reg & IXGBE_MRQC_MRQE_MASK) { + case IXGBE_MRQC_VMDQRT8TCEN: + shift = 3; /* 16 VFs / pools */ + break; + case IXGBE_MRQC_VMDQRSS32EN: + case IXGBE_MRQC_VMDQRT4TCEN: + shift = 2; /* 32 VFs / pools */ + break; + default: + shift = 1; /* 64 VFs / pools */ + break; + } + + /* Read WQBR_TX and WQBR_RX and check for malicious queues */ + for (i = 0; i < 4; i++) { + wqbr = IXGBE_READ_REG(hw, IXGBE_WQBR_TX(i)); + wqbr |= IXGBE_READ_REG(hw, IXGBE_WQBR_RX(i)); + + if (!wqbr) + continue; + + /* Get malicious queue */ + for (j = 0; j < 32 && wqbr; j++) { + + if (!(wqbr & (1 << j))) + continue; + + /* Get queue from bitmask */ + q = j + (i * 32); + + /* Map queue to vf */ + vf = (q >> shift); + + /* Set vf bit in vf_bitmap */ + idx = vf / 32; + vf_bitmap[idx] |= (1 << (vf % 32)); + wqbr &= ~(1 << j); + } + } +} + +/** + * ixgbe_get_media_type_X550em - Get media type + * @hw: pointer to hardware structure + * + * Returns the media type (fiber, copper, backplane) + */ +enum ixgbe_media_type ixgbe_get_media_type_X550em(struct ixgbe_hw *hw) +{ + enum ixgbe_media_type media_type; + + DEBUGFUNC("ixgbe_get_media_type_X550em"); + + /* Detect if there is a copper PHY attached. */ + switch (hw->device_id) { + case IXGBE_DEV_ID_X550EM_X_KR: + case IXGBE_DEV_ID_X550EM_X_KX4: + media_type = ixgbe_media_type_backplane; + break; + case IXGBE_DEV_ID_X550EM_X_SFP: + media_type = ixgbe_media_type_fiber; + break; + case IXGBE_DEV_ID_X550EM_X_1G_T: + case IXGBE_DEV_ID_X550EM_X_10G_T: + media_type = ixgbe_media_type_copper; + break; + default: + media_type = ixgbe_media_type_unknown; + break; + } + return media_type; +} + +/** + * ixgbe_supported_sfp_modules_X550em - Check if SFP module type is supported + * @hw: pointer to hardware structure + * @linear: TRUE if SFP module is linear + */ +static s32 ixgbe_supported_sfp_modules_X550em(struct ixgbe_hw *hw, bool *linear) +{ + DEBUGFUNC("ixgbe_supported_sfp_modules_X550em"); + + switch (hw->phy.sfp_type) { + case ixgbe_sfp_type_not_present: + return IXGBE_ERR_SFP_NOT_PRESENT; + case ixgbe_sfp_type_da_cu_core0: + case ixgbe_sfp_type_da_cu_core1: + *linear = TRUE; + break; + case ixgbe_sfp_type_srlr_core0: + case ixgbe_sfp_type_srlr_core1: + case ixgbe_sfp_type_da_act_lmt_core0: + case ixgbe_sfp_type_da_act_lmt_core1: + case ixgbe_sfp_type_1g_sx_core0: + case ixgbe_sfp_type_1g_sx_core1: + case ixgbe_sfp_type_1g_lx_core0: + case ixgbe_sfp_type_1g_lx_core1: + *linear = FALSE; + break; + case ixgbe_sfp_type_unknown: + case ixgbe_sfp_type_1g_cu_core0: + case ixgbe_sfp_type_1g_cu_core1: + default: + return IXGBE_ERR_SFP_NOT_SUPPORTED; + } + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_identify_sfp_module_X550em - Identifies SFP modules + * @hw: pointer to hardware structure + * + * Searches for and identifies the SFP module and assigns appropriate PHY type. + **/ +s32 ixgbe_identify_sfp_module_X550em(struct ixgbe_hw *hw) +{ + s32 status; + bool linear; + + DEBUGFUNC("ixgbe_identify_sfp_module_X550em"); + + status = ixgbe_identify_module_generic(hw); + + if (status != IXGBE_SUCCESS) + return status; + + /* Check if SFP module is supported */ + status = ixgbe_supported_sfp_modules_X550em(hw, &linear); + + return status; +} + +/** + * ixgbe_setup_sfp_modules_X550em - Setup MAC link ops + * @hw: pointer to hardware structure + */ +s32 ixgbe_setup_sfp_modules_X550em(struct ixgbe_hw *hw) +{ + s32 status; + bool linear; + + DEBUGFUNC("ixgbe_setup_sfp_modules_X550em"); + + /* Check if SFP module is supported */ + status = ixgbe_supported_sfp_modules_X550em(hw, &linear); + + if (status != IXGBE_SUCCESS) + return status; + + ixgbe_init_mac_link_ops_X550em(hw); + hw->phy.ops.reset = NULL; + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_init_mac_link_ops_X550em - init mac link function pointers + * @hw: pointer to hardware structure + */ +void ixgbe_init_mac_link_ops_X550em(struct ixgbe_hw *hw) +{ + struct ixgbe_mac_info *mac = &hw->mac; + + DEBUGFUNC("ixgbe_init_mac_link_ops_X550em"); + + switch (hw->mac.ops.get_media_type(hw)) { + case ixgbe_media_type_fiber: + /* CS4227 does not support autoneg, so disable the laser control + * functions for SFP+ fiber + */ + mac->ops.disable_tx_laser = NULL; + mac->ops.enable_tx_laser = NULL; + mac->ops.flap_tx_laser = NULL; + mac->ops.setup_link = ixgbe_setup_mac_link_multispeed_fiber; + mac->ops.setup_mac_link = ixgbe_setup_mac_link_sfp_x550em; + mac->ops.set_rate_select_speed = + ixgbe_set_soft_rate_select_speed; + break; + case ixgbe_media_type_copper: + mac->ops.setup_link = ixgbe_setup_mac_link_t_X550em; + mac->ops.check_link = ixgbe_check_link_t_X550em; + break; + default: + break; + } +} + +/** + * ixgbe_get_link_capabilities_x550em - Determines link capabilities + * @hw: pointer to hardware structure + * @speed: pointer to link speed + * @autoneg: TRUE when autoneg or autotry is enabled + */ +s32 ixgbe_get_link_capabilities_X550em(struct ixgbe_hw *hw, + ixgbe_link_speed *speed, + bool *autoneg) +{ + DEBUGFUNC("ixgbe_get_link_capabilities_X550em"); + + /* SFP */ + if (hw->phy.media_type == ixgbe_media_type_fiber) { + + /* CS4227 SFP must not enable auto-negotiation */ + *autoneg = FALSE; + + /* Check if 1G SFP module. */ + if (hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1 + || hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core1) { + *speed = IXGBE_LINK_SPEED_1GB_FULL; + return IXGBE_SUCCESS; + } + + /* Link capabilities are based on SFP */ + if (hw->phy.multispeed_fiber) + *speed = IXGBE_LINK_SPEED_10GB_FULL | + IXGBE_LINK_SPEED_1GB_FULL; + else + *speed = IXGBE_LINK_SPEED_10GB_FULL; + } else { + *speed = IXGBE_LINK_SPEED_10GB_FULL | + IXGBE_LINK_SPEED_1GB_FULL; + *autoneg = TRUE; + } + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_get_lasi_ext_t_x550em - Determime external Base T PHY interrupt cause + * @hw: pointer to hardware structure + * @lsc: pointer to boolean flag which indicates whether external Base T + * PHY interrupt is lsc + * + * Determime if external Base T PHY interrupt cause is high temperature + * failure alarm or link status change. + * + * Return IXGBE_ERR_OVERTEMP if interrupt is high temperature + * failure alarm, else return PHY access status. + */ +static s32 ixgbe_get_lasi_ext_t_x550em(struct ixgbe_hw *hw, bool *lsc) +{ + u32 status; + u16 reg; + + *lsc = FALSE; + + /* Vendor alarm triggered */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_CHIP_STD_INT_FLAG, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS || + !(reg & IXGBE_MDIO_GLOBAL_VEN_ALM_INT_EN)) + return status; + + /* Vendor Auto-Neg alarm triggered or Global alarm 1 triggered */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_INT_CHIP_VEN_FLAG, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS || + !(reg & (IXGBE_MDIO_GLOBAL_AN_VEN_ALM_INT_EN | + IXGBE_MDIO_GLOBAL_ALARM_1_INT))) + return status; + + /* High temperature failure alarm triggered */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_ALARM_1, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + /* If high temperature failure, then return over temp error and exit */ + if (reg & IXGBE_MDIO_GLOBAL_ALM_1_HI_TMP_FAIL) { + /* power down the PHY in case the PHY FW didn't already */ + ixgbe_set_copper_phy_power(hw, FALSE); + return IXGBE_ERR_OVERTEMP; + } + + /* Vendor alarm 2 triggered */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_CHIP_STD_INT_FLAG, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, ®); + + if (status != IXGBE_SUCCESS || + !(reg & IXGBE_MDIO_GLOBAL_STD_ALM2_INT)) + return status; + + /* link connect/disconnect event occurred */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_VENDOR_TX_ALARM2, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, ®); + + if (status != IXGBE_SUCCESS) + return status; + + /* Indicate LSC */ + if (reg & IXGBE_MDIO_AUTO_NEG_VEN_LSC) + *lsc = TRUE; + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_enable_lasi_ext_t_x550em - Enable external Base T PHY interrupts + * @hw: pointer to hardware structure + * + * Enable link status change and temperature failure alarm for the external + * Base T PHY + * + * Returns PHY access status + */ +static s32 ixgbe_enable_lasi_ext_t_x550em(struct ixgbe_hw *hw) +{ + u32 status; + u16 reg; + bool lsc; + + /* Clear interrupt flags */ + status = ixgbe_get_lasi_ext_t_x550em(hw, &lsc); + + /* Enable link status change alarm */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_PMA_TX_VEN_LASI_INT_MASK, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, ®); + + if (status != IXGBE_SUCCESS) + return status; + + reg |= IXGBE_MDIO_PMA_TX_VEN_LASI_INT_EN; + + status = hw->phy.ops.write_reg(hw, IXGBE_MDIO_PMA_TX_VEN_LASI_INT_MASK, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, reg); + + if (status != IXGBE_SUCCESS) + return status; + + /* Enables high temperature failure alarm */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_INT_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + reg |= IXGBE_MDIO_GLOBAL_INT_HI_TEMP_EN; + + status = hw->phy.ops.write_reg(hw, IXGBE_MDIO_GLOBAL_INT_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + reg); + + if (status != IXGBE_SUCCESS) + return status; + + /* Enable vendor Auto-Neg alarm and Global Interrupt Mask 1 alarm */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_INT_CHIP_VEN_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + reg |= (IXGBE_MDIO_GLOBAL_AN_VEN_ALM_INT_EN | + IXGBE_MDIO_GLOBAL_ALARM_1_INT); + + status = hw->phy.ops.write_reg(hw, IXGBE_MDIO_GLOBAL_INT_CHIP_VEN_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + reg); + + if (status != IXGBE_SUCCESS) + return status; + + /* Enable chip-wide vendor alarm */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_GLOBAL_INT_CHIP_STD_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + reg |= IXGBE_MDIO_GLOBAL_VEN_ALM_INT_EN; + + status = hw->phy.ops.write_reg(hw, IXGBE_MDIO_GLOBAL_INT_CHIP_STD_MASK, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + reg); + + return status; +} + +/** + * ixgbe_setup_kr_speed_x550em - Configure the KR PHY for link speed. + * @hw: pointer to hardware structure + * @speed: link speed + * + * Configures the integrated KR PHY. + **/ +static s32 ixgbe_setup_kr_speed_x550em(struct ixgbe_hw *hw, + ixgbe_link_speed speed) +{ + s32 status; + u32 reg_val; + + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status) + return status; + + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_ENABLE; + reg_val &= ~(IXGBE_KRM_LINK_CTRL_1_TETH_AN_FEC_REQ | + IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_FEC); + reg_val &= ~(IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_KR | + IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_KX); + + /* Advertise 10G support. */ + if (speed & IXGBE_LINK_SPEED_10GB_FULL) + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_KR; + + /* Advertise 1G support. */ + if (speed & IXGBE_LINK_SPEED_1GB_FULL) + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_CAP_KX; + + /* Restart auto-negotiation. */ + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_RESTART; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + + return status; +} + +/** + * ixgbe_init_phy_ops_X550em - PHY/SFP specific init + * @hw: pointer to hardware structure + * + * Initialize any function pointers that were not able to be + * set during init_shared_code because the PHY/SFP type was + * not known. Perform the SFP init if necessary. + */ +s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw *hw) +{ + struct ixgbe_phy_info *phy = &hw->phy; + ixgbe_link_speed speed; + s32 ret_val; + + DEBUGFUNC("ixgbe_init_phy_ops_X550em"); + + hw->mac.ops.set_lan_id(hw); + + if (hw->mac.ops.get_media_type(hw) == ixgbe_media_type_fiber) { + phy->phy_semaphore_mask = IXGBE_GSSR_SHARED_I2C_SM; + ixgbe_setup_mux_ctl(hw); + + /* Save NW management interface connected on board. This is used + * to determine internal PHY mode. + */ + phy->nw_mng_if_sel = IXGBE_READ_REG(hw, IXGBE_NW_MNG_IF_SEL); + + /* If internal PHY mode is KR, then initialize KR link */ + if (phy->nw_mng_if_sel & IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE) { + speed = IXGBE_LINK_SPEED_10GB_FULL | + IXGBE_LINK_SPEED_1GB_FULL; + ret_val = ixgbe_setup_kr_speed_x550em(hw, speed); + } + + phy->ops.identify_sfp = ixgbe_identify_sfp_module_X550em; + } + + /* Identify the PHY or SFP module */ + ret_val = phy->ops.identify(hw); + if (ret_val == IXGBE_ERR_SFP_NOT_SUPPORTED) + return ret_val; + + /* Setup function pointers based on detected hardware */ + ixgbe_init_mac_link_ops_X550em(hw); + if (phy->sfp_type != ixgbe_sfp_type_unknown) + phy->ops.reset = NULL; + + /* Set functions pointers based on phy type */ + switch (hw->phy.type) { + case ixgbe_phy_x550em_kx4: + phy->ops.setup_link = ixgbe_setup_kx4_x550em; + phy->ops.read_reg = ixgbe_read_phy_reg_x550em; + phy->ops.write_reg = ixgbe_write_phy_reg_x550em; + break; + case ixgbe_phy_x550em_kr: + phy->ops.setup_link = ixgbe_setup_kr_x550em; + phy->ops.read_reg = ixgbe_read_phy_reg_x550em; + phy->ops.write_reg = ixgbe_write_phy_reg_x550em; + break; + case ixgbe_phy_x550em_ext_t: + /* Save NW management interface connected on board. This is used + * to determine internal PHY mode + */ + phy->nw_mng_if_sel = IXGBE_READ_REG(hw, IXGBE_NW_MNG_IF_SEL); + + /* If internal link mode is XFI, then setup iXFI internal link, + * else setup KR now. + */ + if (!(phy->nw_mng_if_sel & IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE)) { + phy->ops.setup_internal_link = + ixgbe_setup_internal_phy_t_x550em; + } else { + speed = IXGBE_LINK_SPEED_10GB_FULL | + IXGBE_LINK_SPEED_1GB_FULL; + ret_val = ixgbe_setup_kr_speed_x550em(hw, speed); + } + + phy->ops.enter_lplu = ixgbe_enter_lplu_t_x550em; + phy->ops.handle_lasi = ixgbe_handle_lasi_ext_t_x550em; + phy->ops.reset = ixgbe_reset_phy_t_X550em; + break; + default: + break; + } + return ret_val; +} + +/** + * ixgbe_reset_hw_X550em - Perform hardware reset + * @hw: pointer to hardware structure + * + * Resets the hardware by resetting the transmit and receive units, masks + * and clears all interrupts, perform a PHY reset, and perform a link (MAC) + * reset. + */ +s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw) +{ + ixgbe_link_speed link_speed; + s32 status; + u32 ctrl = 0; + u32 i; + u32 hlreg0; + bool link_up = FALSE; + + DEBUGFUNC("ixgbe_reset_hw_X550em"); + + /* Call adapter stop to disable Tx/Rx and clear interrupts */ + status = hw->mac.ops.stop_adapter(hw); + if (status != IXGBE_SUCCESS) + return status; + + /* flush pending Tx transactions */ + ixgbe_clear_tx_pending(hw); + + /* PHY ops must be identified and initialized prior to reset */ + + /* Identify PHY and related function pointers */ + status = hw->phy.ops.init(hw); + + if (status == IXGBE_ERR_SFP_NOT_SUPPORTED) + return status; + + /* start the external PHY */ + if (hw->phy.type == ixgbe_phy_x550em_ext_t) { + status = ixgbe_init_ext_t_x550em(hw); + if (status) + return status; + } + + /* Setup SFP module if there is one present. */ + if (hw->phy.sfp_setup_needed) { + status = hw->mac.ops.setup_sfp(hw); + hw->phy.sfp_setup_needed = FALSE; + } + + if (status == IXGBE_ERR_SFP_NOT_SUPPORTED) + return status; + + /* Reset PHY */ + if (!hw->phy.reset_disable && hw->phy.ops.reset) + hw->phy.ops.reset(hw); + +mac_reset_top: + /* Issue global reset to the MAC. Needs to be SW reset if link is up. + * If link reset is used when link is up, it might reset the PHY when + * mng is using it. If link is down or the flag to force full link + * reset is set, then perform link reset. + */ + ctrl = IXGBE_CTRL_LNK_RST; + if (!hw->force_full_reset) { + hw->mac.ops.check_link(hw, &link_speed, &link_up, FALSE); + if (link_up) + ctrl = IXGBE_CTRL_RST; + } + + ctrl |= IXGBE_READ_REG(hw, IXGBE_CTRL); + IXGBE_WRITE_REG(hw, IXGBE_CTRL, ctrl); + IXGBE_WRITE_FLUSH(hw); + + /* Poll for reset bit to self-clear meaning reset is complete */ + for (i = 0; i < 10; i++) { + usec_delay(1); + ctrl = IXGBE_READ_REG(hw, IXGBE_CTRL); + if (!(ctrl & IXGBE_CTRL_RST_MASK)) + break; + } + + if (ctrl & IXGBE_CTRL_RST_MASK) { + status = IXGBE_ERR_RESET_FAILED; + DEBUGOUT("Reset polling failed to complete.\n"); + } + + msec_delay(50); + + /* Double resets are required for recovery from certain error + * conditions. Between resets, it is necessary to stall to + * allow time for any pending HW events to complete. + */ + if (hw->mac.flags & IXGBE_FLAGS_DOUBLE_RESET_REQUIRED) { + hw->mac.flags &= ~IXGBE_FLAGS_DOUBLE_RESET_REQUIRED; + goto mac_reset_top; + } + + /* Store the permanent mac address */ + hw->mac.ops.get_mac_addr(hw, hw->mac.perm_addr); + + /* Store MAC address from RAR0, clear receive address registers, and + * clear the multicast table. Also reset num_rar_entries to 128, + * since we modify this value when programming the SAN MAC address. + */ + hw->mac.num_rar_entries = 128; + hw->mac.ops.init_rx_addrs(hw); + + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { + /* Config MDIO clock speed. */ + hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0); + hlreg0 &= ~IXGBE_HLREG0_MDCSPD; + IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0); + } + + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP) + ixgbe_setup_mux_ctl(hw); + + return status; +} + +/** + * ixgbe_init_ext_t_x550em - Start (unstall) the external Base T PHY. + * @hw: pointer to hardware structure + */ +s32 ixgbe_init_ext_t_x550em(struct ixgbe_hw *hw) +{ + u32 status; + u16 reg; + + status = hw->phy.ops.read_reg(hw, + IXGBE_MDIO_TX_VENDOR_ALARMS_3, + IXGBE_MDIO_PMA_PMD_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + /* If PHY FW reset completed bit is set then this is the first + * SW instance after a power on so the PHY FW must be un-stalled. + */ + if (reg & IXGBE_MDIO_TX_VENDOR_ALARMS_3_RST_MASK) { + status = hw->phy.ops.read_reg(hw, + IXGBE_MDIO_GLOBAL_RES_PR_10, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + ®); + + if (status != IXGBE_SUCCESS) + return status; + + reg &= ~IXGBE_MDIO_POWER_UP_STALL; + + status = hw->phy.ops.write_reg(hw, + IXGBE_MDIO_GLOBAL_RES_PR_10, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, + reg); + + if (status != IXGBE_SUCCESS) + return status; + } + + return status; +} + +/** + * ixgbe_setup_kr_x550em - Configure the KR PHY. + * @hw: pointer to hardware structure + * + * Configures the integrated KR PHY. + **/ +s32 ixgbe_setup_kr_x550em(struct ixgbe_hw *hw) +{ + return ixgbe_setup_kr_speed_x550em(hw, hw->phy.autoneg_advertised); +} + +/** + * ixgbe_setup_kx4_x550em - Configure the KX4 PHY. + * @hw: pointer to hardware structure + * + * Configures the integrated KX4 PHY. + **/ +s32 ixgbe_setup_kx4_x550em(struct ixgbe_hw *hw) +{ + s32 status; + u32 reg_val; + + status = ixgbe_read_iosf_sb_reg_x550(hw, IXGBE_KX4_LINK_CNTL_1, + IXGBE_SB_IOSF_TARGET_KX4_PCS, ®_val); + if (status) + return status; + + reg_val &= ~(IXGBE_KX4_LINK_CNTL_1_TETH_AN_CAP_KX4 | + IXGBE_KX4_LINK_CNTL_1_TETH_AN_CAP_KX); + + reg_val |= IXGBE_KX4_LINK_CNTL_1_TETH_AN_ENABLE; + + /* Advertise 10G support. */ + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_10GB_FULL) + reg_val |= IXGBE_KX4_LINK_CNTL_1_TETH_AN_CAP_KX4; + + /* Advertise 1G support. */ + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_1GB_FULL) + reg_val |= IXGBE_KX4_LINK_CNTL_1_TETH_AN_CAP_KX; + + /* Restart auto-negotiation. */ + reg_val |= IXGBE_KX4_LINK_CNTL_1_TETH_AN_RESTART; + status = ixgbe_write_iosf_sb_reg_x550(hw, IXGBE_KX4_LINK_CNTL_1, + IXGBE_SB_IOSF_TARGET_KX4_PCS, reg_val); + + return status; +} + +/** + * ixgbe_setup_mac_link_sfp_x550em - Setup internal/external the PHY for SFP + * @hw: pointer to hardware structure + * + * Configure the external PHY and the integrated KR PHY for SFP support. + **/ +s32 ixgbe_setup_mac_link_sfp_x550em(struct ixgbe_hw *hw, + ixgbe_link_speed speed, + bool autoneg_wait_to_complete) +{ + s32 ret_val; + u16 reg_slice, reg_val; + bool setup_linear = FALSE; + UNREFERENCED_1PARAMETER(autoneg_wait_to_complete); + + /* Check if SFP module is supported and linear */ + ret_val = ixgbe_supported_sfp_modules_X550em(hw, &setup_linear); + + /* If no SFP module present, then return success. Return success since + * there is no reason to configure CS4227 and SFP not present error is + * not excepted in the setup MAC link flow. + */ + if (ret_val == IXGBE_ERR_SFP_NOT_PRESENT) + return IXGBE_SUCCESS; + + if (ret_val != IXGBE_SUCCESS) + return ret_val; + + /* Configure CS4227 for LINE connection rate then type. */ + reg_slice = IXGBE_CS4227_LINE_SPARE22_MSB + (hw->bus.lan_id << 12); + reg_val = (speed & IXGBE_LINK_SPEED_10GB_FULL) ? 0 : 0x8000; + ret_val = ixgbe_write_i2c_combined(hw, IXGBE_CS4227, reg_slice, + reg_val); + + reg_slice = IXGBE_CS4227_LINE_SPARE24_LSB + (hw->bus.lan_id << 12); + if (setup_linear) + reg_val = (IXGBE_CS4227_EDC_MODE_CX1 << 1) | 0x1; + else + reg_val = (IXGBE_CS4227_EDC_MODE_SR << 1) | 0x1; + ret_val = ixgbe_write_i2c_combined(hw, IXGBE_CS4227, reg_slice, + reg_val); + + /* Configure CS4227 for HOST connection rate then type. */ + reg_slice = IXGBE_CS4227_HOST_SPARE22_MSB + (hw->bus.lan_id << 12); + reg_val = (speed & IXGBE_LINK_SPEED_10GB_FULL) ? 0 : 0x8000; + ret_val = ixgbe_write_i2c_combined(hw, IXGBE_CS4227, reg_slice, + reg_val); + + reg_slice = IXGBE_CS4227_HOST_SPARE24_LSB + (hw->bus.lan_id << 12); + if (setup_linear) + reg_val = (IXGBE_CS4227_EDC_MODE_CX1 << 1) | 0x1; + else + reg_val = (IXGBE_CS4227_EDC_MODE_SR << 1) | 0x1; + ret_val = ixgbe_write_i2c_combined(hw, IXGBE_CS4227, reg_slice, + reg_val); + + /* If internal link mode is XFI, then setup XFI internal link. */ + if (!(hw->phy.nw_mng_if_sel & IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE)) + ret_val = ixgbe_setup_ixfi_x550em(hw, &speed); + + return ret_val; +} + +/** + * ixgbe_setup_ixfi_x550em - Configure the KR PHY for iXFI mode. + * @hw: pointer to hardware structure + * @speed: the link speed to force + * + * Configures the integrated KR PHY to use iXFI mode. Used to connect an + * internal and external PHY at a specific speed, without autonegotiation. + **/ +static s32 ixgbe_setup_ixfi_x550em(struct ixgbe_hw *hw, ixgbe_link_speed *speed) +{ + s32 status; + u32 reg_val; + + /* Disable AN and force speed to 10G Serial. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + + reg_val &= ~IXGBE_KRM_LINK_CTRL_1_TETH_AN_ENABLE; + reg_val &= ~IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_MASK; + + /* Select forced link speed for internal PHY. */ + switch (*speed) { + case IXGBE_LINK_SPEED_10GB_FULL: + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_10G; + break; + case IXGBE_LINK_SPEED_1GB_FULL: + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_1G; + break; + default: + /* Other link speeds are not supported by internal KR PHY. */ + return IXGBE_ERR_LINK_SETUP; + } + + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Disable training protocol FSM. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_RX_TRN_LINKUP_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_RX_TRN_LINKUP_CTRL_CONV_WO_PROTOCOL; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_RX_TRN_LINKUP_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Disable Flex from training TXFFE. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_DSP_TXFFE_STATE_4(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_C0_EN; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_CP1_CN1_EN; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_CO_ADAPT_EN; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_DSP_TXFFE_STATE_4(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_DSP_TXFFE_STATE_5(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_C0_EN; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_CP1_CN1_EN; + reg_val &= ~IXGBE_KRM_DSP_TXFFE_STATE_CO_ADAPT_EN; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_DSP_TXFFE_STATE_5(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Enable override for coefficients. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_TX_COEFF_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_TX_COEFF_CTRL_1_OVRRD_EN; + reg_val |= IXGBE_KRM_TX_COEFF_CTRL_1_CZERO_EN; + reg_val |= IXGBE_KRM_TX_COEFF_CTRL_1_CPLUS1_OVRRD_EN; + reg_val |= IXGBE_KRM_TX_COEFF_CTRL_1_CMINUS1_OVRRD_EN; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_TX_COEFF_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Toggle port SW reset by AN reset. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_AN_RESTART; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + + return status; +} + +/** + * ixgbe_ext_phy_t_x550em_get_link - Get ext phy link status + * @hw: address of hardware structure + * @link_up: address of boolean to indicate link status + * + * Returns error code if unable to get link status. + */ +static s32 ixgbe_ext_phy_t_x550em_get_link(struct ixgbe_hw *hw, bool *link_up) +{ + u32 ret; + u16 autoneg_status; + + *link_up = FALSE; + + /* read this twice back to back to indicate current status */ + ret = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_STATUS, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_status); + if (ret != IXGBE_SUCCESS) + return ret; + + ret = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_STATUS, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_status); + if (ret != IXGBE_SUCCESS) + return ret; + + *link_up = !!(autoneg_status & IXGBE_MDIO_AUTO_NEG_LINK_STATUS); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_setup_internal_phy_t_x550em - Configure KR PHY to X557 link + * @hw: point to hardware structure + * + * Configures the link between the integrated KR PHY and the external X557 PHY + * The driver will call this function when it gets a link status change + * interrupt from the X557 PHY. This function configures the link speed + * between the PHYs to match the link speed of the BASE-T link. + * + * A return of a non-zero value indicates an error, and the base driver should + * not report link up. + */ +s32 ixgbe_setup_internal_phy_t_x550em(struct ixgbe_hw *hw) +{ + ixgbe_link_speed force_speed; + bool link_up; + u32 status; + u16 speed; + + if (hw->mac.ops.get_media_type(hw) != ixgbe_media_type_copper) + return IXGBE_ERR_CONFIG; + + /* If link is not up, then there is no setup necessary so return */ + status = ixgbe_ext_phy_t_x550em_get_link(hw, &link_up); + if (status != IXGBE_SUCCESS) + return status; + + if (!link_up) + return IXGBE_SUCCESS; + + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_VENDOR_STAT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &speed); + if (status != IXGBE_SUCCESS) + return status; + + /* If link is not still up, then no setup is necessary so return */ + status = ixgbe_ext_phy_t_x550em_get_link(hw, &link_up); + if (status != IXGBE_SUCCESS) + return status; + if (!link_up) + return IXGBE_SUCCESS; + + /* clear everything but the speed and duplex bits */ + speed &= IXGBE_MDIO_AUTO_NEG_VENDOR_STATUS_MASK; + + switch (speed) { + case IXGBE_MDIO_AUTO_NEG_VENDOR_STATUS_10GB_FULL: + force_speed = IXGBE_LINK_SPEED_10GB_FULL; + break; + case IXGBE_MDIO_AUTO_NEG_VENDOR_STATUS_1GB_FULL: + force_speed = IXGBE_LINK_SPEED_1GB_FULL; + break; + default: + /* Internal PHY does not support anything else */ + return IXGBE_ERR_INVALID_LINK_SETTINGS; + } + + return ixgbe_setup_ixfi_x550em(hw, &force_speed); +} + +/** + * ixgbe_setup_phy_loopback_x550em - Configure the KR PHY for loopback. + * @hw: pointer to hardware structure + * + * Configures the integrated KR PHY to use internal loopback mode. + **/ +s32 ixgbe_setup_phy_loopback_x550em(struct ixgbe_hw *hw) +{ + s32 status; + u32 reg_val; + + /* Disable AN and force speed to 10G Serial. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val &= ~IXGBE_KRM_LINK_CTRL_1_TETH_AN_ENABLE; + reg_val &= ~IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_MASK; + reg_val |= IXGBE_KRM_LINK_CTRL_1_TETH_FORCE_SPEED_10G; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_LINK_CTRL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Set near-end loopback clocks. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_PORT_CAR_GEN_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_PORT_CAR_GEN_CTRL_NELB_32B; + reg_val |= IXGBE_KRM_PORT_CAR_GEN_CTRL_NELB_KRPCS; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_PORT_CAR_GEN_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Set loopback enable. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_PMD_DFX_BURNIN(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_PMD_DFX_BURNIN_TX_RX_KR_LB_MASK; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_PMD_DFX_BURNIN(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + if (status != IXGBE_SUCCESS) + return status; + + /* Training bypass. */ + status = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_RX_TRN_LINKUP_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (status != IXGBE_SUCCESS) + return status; + reg_val |= IXGBE_KRM_RX_TRN_LINKUP_CTRL_PROTOCOL_BYPASS; + status = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_RX_TRN_LINKUP_CTRL(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + + return status; +} + +/** + * ixgbe_read_ee_hostif_X550 - Read EEPROM word using a host interface command + * assuming that the semaphore is already obtained. + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to read + * @data: word read from the EEPROM + * + * Reads a 16 bit word from the EEPROM using the hostif. + **/ +s32 ixgbe_read_ee_hostif_data_X550(struct ixgbe_hw *hw, u16 offset, + u16 *data) +{ + s32 status; + struct ixgbe_hic_read_shadow_ram buffer; + + DEBUGFUNC("ixgbe_read_ee_hostif_data_X550"); + buffer.hdr.req.cmd = FW_READ_SHADOW_RAM_CMD; + buffer.hdr.req.buf_lenh = 0; + buffer.hdr.req.buf_lenl = FW_READ_SHADOW_RAM_LEN; + buffer.hdr.req.checksum = FW_DEFAULT_CHECKSUM; + + /* convert offset from words to bytes */ + buffer.address = IXGBE_CPU_TO_BE32(offset * 2); + /* one word */ + buffer.length = IXGBE_CPU_TO_BE16(sizeof(u16)); + + status = ixgbe_host_interface_command(hw, (u32 *)&buffer, + sizeof(buffer), + IXGBE_HI_COMMAND_TIMEOUT, FALSE); + + if (status) + return status; + + *data = (u16)IXGBE_READ_REG_ARRAY(hw, IXGBE_FLEX_MNG, + FW_NVM_DATA_OFFSET); + + return 0; +} + +/** + * ixgbe_read_ee_hostif_X550 - Read EEPROM word using a host interface command + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to read + * @data: word read from the EEPROM + * + * Reads a 16 bit word from the EEPROM using the hostif. + **/ +s32 ixgbe_read_ee_hostif_X550(struct ixgbe_hw *hw, u16 offset, + u16 *data) +{ + s32 status = IXGBE_SUCCESS; + + DEBUGFUNC("ixgbe_read_ee_hostif_X550"); + + if (hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_EEP_SM) == + IXGBE_SUCCESS) { + status = ixgbe_read_ee_hostif_data_X550(hw, offset, data); + hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_EEP_SM); + } else { + status = IXGBE_ERR_SWFW_SYNC; + } + + return status; +} + +/** + * ixgbe_read_ee_hostif_buffer_X550- Read EEPROM word(s) using hostif + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to read + * @words: number of words + * @data: word(s) read from the EEPROM + * + * Reads a 16 bit word(s) from the EEPROM using the hostif. + **/ +s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw, + u16 offset, u16 words, u16 *data) +{ + struct ixgbe_hic_read_shadow_ram buffer; + u32 current_word = 0; + u16 words_to_read; + s32 status; + u32 i; + + DEBUGFUNC("ixgbe_read_ee_hostif_buffer_X550"); + + /* Take semaphore for the entire operation. */ + status = hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_EEP_SM); + if (status) { + DEBUGOUT("EEPROM read buffer - semaphore failed\n"); + return status; + } + while (words) { + if (words > FW_MAX_READ_BUFFER_SIZE / 2) + words_to_read = FW_MAX_READ_BUFFER_SIZE / 2; + else + words_to_read = words; + + buffer.hdr.req.cmd = FW_READ_SHADOW_RAM_CMD; + buffer.hdr.req.buf_lenh = 0; + buffer.hdr.req.buf_lenl = FW_READ_SHADOW_RAM_LEN; + buffer.hdr.req.checksum = FW_DEFAULT_CHECKSUM; + + /* convert offset from words to bytes */ + buffer.address = IXGBE_CPU_TO_BE32((offset + current_word) * 2); + buffer.length = IXGBE_CPU_TO_BE16(words_to_read * 2); + + status = ixgbe_host_interface_command(hw, (u32 *)&buffer, + sizeof(buffer), + IXGBE_HI_COMMAND_TIMEOUT, + FALSE); + + if (status) { + DEBUGOUT("Host interface command failed\n"); + goto out; + } + + for (i = 0; i < words_to_read; i++) { + u32 reg = IXGBE_FLEX_MNG + (FW_NVM_DATA_OFFSET << 2) + + 2 * i; + u32 value = IXGBE_READ_REG(hw, reg); + + data[current_word] = (u16)(value & 0xffff); + current_word++; + i++; + if (i < words_to_read) { + value >>= 16; + data[current_word] = (u16)(value & 0xffff); + current_word++; + } + } + words -= words_to_read; + } + +out: + hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_EEP_SM); + return status; +} + +/** + * ixgbe_write_ee_hostif_X550 - Write EEPROM word using hostif + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to write + * @data: word write to the EEPROM + * + * Write a 16 bit word to the EEPROM using the hostif. + **/ +s32 ixgbe_write_ee_hostif_data_X550(struct ixgbe_hw *hw, u16 offset, + u16 data) +{ + s32 status; + struct ixgbe_hic_write_shadow_ram buffer; + + DEBUGFUNC("ixgbe_write_ee_hostif_data_X550"); + + buffer.hdr.req.cmd = FW_WRITE_SHADOW_RAM_CMD; + buffer.hdr.req.buf_lenh = 0; + buffer.hdr.req.buf_lenl = FW_WRITE_SHADOW_RAM_LEN; + buffer.hdr.req.checksum = FW_DEFAULT_CHECKSUM; + + /* one word */ + buffer.length = IXGBE_CPU_TO_BE16(sizeof(u16)); + buffer.data = data; + buffer.address = IXGBE_CPU_TO_BE32(offset * 2); + + status = ixgbe_host_interface_command(hw, (u32 *)&buffer, + sizeof(buffer), + IXGBE_HI_COMMAND_TIMEOUT, FALSE); + + return status; +} + +/** + * ixgbe_write_ee_hostif_X550 - Write EEPROM word using hostif + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to write + * @data: word write to the EEPROM + * + * Write a 16 bit word to the EEPROM using the hostif. + **/ +s32 ixgbe_write_ee_hostif_X550(struct ixgbe_hw *hw, u16 offset, + u16 data) +{ + s32 status = IXGBE_SUCCESS; + + DEBUGFUNC("ixgbe_write_ee_hostif_X550"); + + if (hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_EEP_SM) == + IXGBE_SUCCESS) { + status = ixgbe_write_ee_hostif_data_X550(hw, offset, data); + hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_EEP_SM); + } else { + DEBUGOUT("write ee hostif failed to get semaphore"); + status = IXGBE_ERR_SWFW_SYNC; + } + + return status; +} + +/** + * ixgbe_write_ee_hostif_buffer_X550 - Write EEPROM word(s) using hostif + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to write + * @words: number of words + * @data: word(s) write to the EEPROM + * + * Write a 16 bit word(s) to the EEPROM using the hostif. + **/ +s32 ixgbe_write_ee_hostif_buffer_X550(struct ixgbe_hw *hw, + u16 offset, u16 words, u16 *data) +{ + s32 status = IXGBE_SUCCESS; + u32 i = 0; + + DEBUGFUNC("ixgbe_write_ee_hostif_buffer_X550"); + + /* Take semaphore for the entire operation. */ + status = hw->mac.ops.acquire_swfw_sync(hw, IXGBE_GSSR_EEP_SM); + if (status != IXGBE_SUCCESS) { + DEBUGOUT("EEPROM write buffer - semaphore failed\n"); + goto out; + } + + for (i = 0; i < words; i++) { + status = ixgbe_write_ee_hostif_data_X550(hw, offset + i, + data[i]); + + if (status != IXGBE_SUCCESS) { + DEBUGOUT("Eeprom buffered write failed\n"); + break; + } + } + + hw->mac.ops.release_swfw_sync(hw, IXGBE_GSSR_EEP_SM); +out: + + return status; +} + +/** + * ixgbe_checksum_ptr_x550 - Checksum one pointer region + * @hw: pointer to hardware structure + * @ptr: pointer offset in eeprom + * @size: size of section pointed by ptr, if 0 first word will be used as size + * @csum: address of checksum to update + * + * Returns error status for any failure + */ +static s32 ixgbe_checksum_ptr_x550(struct ixgbe_hw *hw, u16 ptr, + u16 size, u16 *csum, u16 *buffer, + u32 buffer_size) +{ + u16 buf[256]; + s32 status; + u16 length, bufsz, i, start; + u16 *local_buffer; + + bufsz = sizeof(buf) / sizeof(buf[0]); + + /* Read a chunk at the pointer location */ + if (!buffer) { + status = ixgbe_read_ee_hostif_buffer_X550(hw, ptr, bufsz, buf); + if (status) { + DEBUGOUT("Failed to read EEPROM image\n"); + return status; + } + local_buffer = buf; + } else { + if (buffer_size < ptr) + return IXGBE_ERR_PARAM; + local_buffer = &buffer[ptr]; + } + + if (size) { + start = 0; + length = size; + } else { + start = 1; + length = local_buffer[0]; + + /* Skip pointer section if length is invalid. */ + if (length == 0xFFFF || length == 0 || + (ptr + length) >= hw->eeprom.word_size) + return IXGBE_SUCCESS; + } + + if (buffer && ((u32)start + (u32)length > buffer_size)) + return IXGBE_ERR_PARAM; + + for (i = start; length; i++, length--) { + if (i == bufsz && !buffer) { + ptr += bufsz; + i = 0; + if (length < bufsz) + bufsz = length; + + /* Read a chunk at the pointer location */ + status = ixgbe_read_ee_hostif_buffer_X550(hw, ptr, + bufsz, buf); + if (status) { + DEBUGOUT("Failed to read EEPROM image\n"); + return status; + } + } + *csum += local_buffer[i]; + } + return IXGBE_SUCCESS; +} + +/** + * ixgbe_calc_checksum_X550 - Calculates and returns the checksum + * @hw: pointer to hardware structure + * @buffer: pointer to buffer containing calculated checksum + * @buffer_size: size of buffer + * + * Returns a negative error code on error, or the 16-bit checksum + **/ +s32 ixgbe_calc_checksum_X550(struct ixgbe_hw *hw, u16 *buffer, u32 buffer_size) +{ + u16 eeprom_ptrs[IXGBE_EEPROM_LAST_WORD + 1]; + u16 *local_buffer; + s32 status; + u16 checksum = 0; + u16 pointer, i, size; + + DEBUGFUNC("ixgbe_calc_eeprom_checksum_X550"); + + hw->eeprom.ops.init_params(hw); + + if (!buffer) { + /* Read pointer area */ + status = ixgbe_read_ee_hostif_buffer_X550(hw, 0, + IXGBE_EEPROM_LAST_WORD + 1, + eeprom_ptrs); + if (status) { + DEBUGOUT("Failed to read EEPROM image\n"); + return status; + } + local_buffer = eeprom_ptrs; + } else { + if (buffer_size < IXGBE_EEPROM_LAST_WORD) + return IXGBE_ERR_PARAM; + local_buffer = buffer; + } + + /* + * For X550 hardware include 0x0-0x41 in the checksum, skip the + * checksum word itself + */ + for (i = 0; i <= IXGBE_EEPROM_LAST_WORD; i++) + if (i != IXGBE_EEPROM_CHECKSUM) + checksum += local_buffer[i]; + + /* + * Include all data from pointers 0x3, 0x6-0xE. This excludes the + * FW, PHY module, and PCIe Expansion/Option ROM pointers. + */ + for (i = IXGBE_PCIE_ANALOG_PTR_X550; i < IXGBE_FW_PTR; i++) { + if (i == IXGBE_PHY_PTR || i == IXGBE_OPTION_ROM_PTR) + continue; + + pointer = local_buffer[i]; + + /* Skip pointer section if the pointer is invalid. */ + if (pointer == 0xFFFF || pointer == 0 || + pointer >= hw->eeprom.word_size) + continue; + + switch (i) { + case IXGBE_PCIE_GENERAL_PTR: + size = IXGBE_IXGBE_PCIE_GENERAL_SIZE; + break; + case IXGBE_PCIE_CONFIG0_PTR: + case IXGBE_PCIE_CONFIG1_PTR: + size = IXGBE_PCIE_CONFIG_SIZE; + break; + default: + size = 0; + break; + } + + status = ixgbe_checksum_ptr_x550(hw, pointer, size, &checksum, + buffer, buffer_size); + if (status) + return status; + } + + checksum = (u16)IXGBE_EEPROM_SUM - checksum; + + return (s32)checksum; +} + +/** + * ixgbe_calc_eeprom_checksum_X550 - Calculates and returns the checksum + * @hw: pointer to hardware structure + * + * Returns a negative error code on error, or the 16-bit checksum + **/ +s32 ixgbe_calc_eeprom_checksum_X550(struct ixgbe_hw *hw) +{ + return ixgbe_calc_checksum_X550(hw, NULL, 0); +} + +/** + * ixgbe_validate_eeprom_checksum_X550 - Validate EEPROM checksum + * @hw: pointer to hardware structure + * @checksum_val: calculated checksum + * + * Performs checksum calculation and validates the EEPROM checksum. If the + * caller does not need checksum_val, the value can be NULL. + **/ +s32 ixgbe_validate_eeprom_checksum_X550(struct ixgbe_hw *hw, u16 *checksum_val) +{ + s32 status; + u16 checksum; + u16 read_checksum = 0; + + DEBUGFUNC("ixgbe_validate_eeprom_checksum_X550"); + + /* Read the first word from the EEPROM. If this times out or fails, do + * not continue or we could be in for a very long wait while every + * EEPROM read fails + */ + status = hw->eeprom.ops.read(hw, 0, &checksum); + if (status) { + DEBUGOUT("EEPROM read failed\n"); + return status; + } + + status = hw->eeprom.ops.calc_checksum(hw); + if (status < 0) + return status; + + checksum = (u16)(status & 0xffff); + + status = ixgbe_read_ee_hostif_X550(hw, IXGBE_EEPROM_CHECKSUM, + &read_checksum); + if (status) + return status; + + /* Verify read checksum from EEPROM is the same as + * calculated checksum + */ + if (read_checksum != checksum) { + status = IXGBE_ERR_EEPROM_CHECKSUM; + ERROR_REPORT1(IXGBE_ERROR_INVALID_STATE, + "Invalid EEPROM checksum"); + } + + /* If the user cares, return the calculated checksum */ + if (checksum_val) + *checksum_val = checksum; + + return status; +} + +/** + * ixgbe_update_eeprom_checksum_X550 - Updates the EEPROM checksum and flash + * @hw: pointer to hardware structure + * + * After writing EEPROM to shadow RAM using EEWR register, software calculates + * checksum and updates the EEPROM and instructs the hardware to update + * the flash. + **/ +s32 ixgbe_update_eeprom_checksum_X550(struct ixgbe_hw *hw) +{ + s32 status; + u16 checksum = 0; + + DEBUGFUNC("ixgbe_update_eeprom_checksum_X550"); + + /* Read the first word from the EEPROM. If this times out or fails, do + * not continue or we could be in for a very long wait while every + * EEPROM read fails + */ + status = ixgbe_read_ee_hostif_X550(hw, 0, &checksum); + if (status) { + DEBUGOUT("EEPROM read failed\n"); + return status; + } + + status = ixgbe_calc_eeprom_checksum_X550(hw); + if (status < 0) + return status; + + checksum = (u16)(status & 0xffff); + + status = ixgbe_write_ee_hostif_X550(hw, IXGBE_EEPROM_CHECKSUM, + checksum); + if (status) + return status; + + status = ixgbe_update_flash_X550(hw); + + return status; +} + +/** + * ixgbe_update_flash_X550 - Instruct HW to copy EEPROM to Flash device + * @hw: pointer to hardware structure + * + * Issue a shadow RAM dump to FW to copy EEPROM from shadow RAM to the flash. + **/ +s32 ixgbe_update_flash_X550(struct ixgbe_hw *hw) +{ + s32 status = IXGBE_SUCCESS; + union ixgbe_hic_hdr2 buffer; + + DEBUGFUNC("ixgbe_update_flash_X550"); + + buffer.req.cmd = FW_SHADOW_RAM_DUMP_CMD; + buffer.req.buf_lenh = 0; + buffer.req.buf_lenl = FW_SHADOW_RAM_DUMP_LEN; + buffer.req.checksum = FW_DEFAULT_CHECKSUM; + + status = ixgbe_host_interface_command(hw, (u32 *)&buffer, + sizeof(buffer), + IXGBE_HI_COMMAND_TIMEOUT, FALSE); + + return status; +} + +/** + * ixgbe_get_supported_physical_layer_X550em - Returns physical layer type + * @hw: pointer to hardware structure + * + * Determines physical layer capabilities of the current configuration. + **/ +u32 ixgbe_get_supported_physical_layer_X550em(struct ixgbe_hw *hw) +{ + u32 physical_layer = IXGBE_PHYSICAL_LAYER_UNKNOWN; + u16 ext_ability = 0; + + DEBUGFUNC("ixgbe_get_supported_physical_layer_X550em"); + + hw->phy.ops.identify(hw); + + switch (hw->phy.type) { + case ixgbe_phy_x550em_kr: + physical_layer = IXGBE_PHYSICAL_LAYER_10GBASE_KR | + IXGBE_PHYSICAL_LAYER_1000BASE_KX; + break; + case ixgbe_phy_x550em_kx4: + physical_layer = IXGBE_PHYSICAL_LAYER_10GBASE_KX4 | + IXGBE_PHYSICAL_LAYER_1000BASE_KX; + break; + case ixgbe_phy_x550em_ext_t: + hw->phy.ops.read_reg(hw, IXGBE_MDIO_PHY_EXT_ABILITY, + IXGBE_MDIO_PMA_PMD_DEV_TYPE, + &ext_ability); + if (ext_ability & IXGBE_MDIO_PHY_10GBASET_ABILITY) + physical_layer |= IXGBE_PHYSICAL_LAYER_10GBASE_T; + if (ext_ability & IXGBE_MDIO_PHY_1000BASET_ABILITY) + physical_layer |= IXGBE_PHYSICAL_LAYER_1000BASE_T; + break; + default: + break; + } + + if (hw->mac.ops.get_media_type(hw) == ixgbe_media_type_fiber) + physical_layer = ixgbe_get_supported_phy_sfp_layer_generic(hw); + + return physical_layer; +} + +/** + * ixgbe_get_bus_info_x550em - Set PCI bus info + * @hw: pointer to hardware structure + * + * Sets bus link width and speed to unknown because X550em is + * not a PCI device. + **/ +s32 ixgbe_get_bus_info_X550em(struct ixgbe_hw *hw) +{ + + DEBUGFUNC("ixgbe_get_bus_info_x550em"); + + hw->bus.width = ixgbe_bus_width_unknown; + hw->bus.speed = ixgbe_bus_speed_unknown; + + hw->mac.ops.set_lan_id(hw); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_disable_rx_x550 - Disable RX unit + * + * Enables the Rx DMA unit for x550 + **/ +void ixgbe_disable_rx_x550(struct ixgbe_hw *hw) +{ + u32 rxctrl, pfdtxgswc; + s32 status; + struct ixgbe_hic_disable_rxen fw_cmd; + + DEBUGFUNC("ixgbe_enable_rx_dma_x550"); + + rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); + if (rxctrl & IXGBE_RXCTRL_RXEN) { + pfdtxgswc = IXGBE_READ_REG(hw, IXGBE_PFDTXGSWC); + if (pfdtxgswc & IXGBE_PFDTXGSWC_VT_LBEN) { + pfdtxgswc &= ~IXGBE_PFDTXGSWC_VT_LBEN; + IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, pfdtxgswc); + hw->mac.set_lben = TRUE; + } else { + hw->mac.set_lben = FALSE; + } + + fw_cmd.hdr.cmd = FW_DISABLE_RXEN_CMD; + fw_cmd.hdr.buf_len = FW_DISABLE_RXEN_LEN; + fw_cmd.hdr.checksum = FW_DEFAULT_CHECKSUM; + fw_cmd.port_number = (u8)hw->bus.lan_id; + + status = ixgbe_host_interface_command(hw, (u32 *)&fw_cmd, + sizeof(struct ixgbe_hic_disable_rxen), + IXGBE_HI_COMMAND_TIMEOUT, TRUE); + + /* If we fail - disable RX using register write */ + if (status) { + rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); + if (rxctrl & IXGBE_RXCTRL_RXEN) { + rxctrl &= ~IXGBE_RXCTRL_RXEN; + IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl); + } + } + } +} + +/** + * ixgbe_enter_lplu_x550em - Transition to low power states + * @hw: pointer to hardware structure + * + * Configures Low Power Link Up on transition to low power states + * (from D0 to non-D0). Link is required to enter LPLU so avoid resetting the + * X557 PHY immediately prior to entering LPLU. + **/ +s32 ixgbe_enter_lplu_t_x550em(struct ixgbe_hw *hw) +{ + u16 an_10g_cntl_reg, autoneg_reg, speed; + s32 status; + ixgbe_link_speed lcd_speed; + u32 save_autoneg; + bool link_up; + + /* If blocked by MNG FW, then don't restart AN */ + if (ixgbe_check_reset_blocked(hw)) + return IXGBE_SUCCESS; + + status = ixgbe_ext_phy_t_x550em_get_link(hw, &link_up); + if (status != IXGBE_SUCCESS) + return status; + + status = ixgbe_read_eeprom(hw, NVM_INIT_CTRL_3, &hw->eeprom.ctrl_word_3); + + if (status != IXGBE_SUCCESS) + return status; + + /* If link is down, LPLU disabled in NVM, WoL disabled, or manageability + * disabled, then force link down by entering low power mode. + */ + if (!link_up || !(hw->eeprom.ctrl_word_3 & NVM_INIT_CTRL_3_LPLU) || + !(hw->wol_enabled || ixgbe_mng_present(hw))) + return ixgbe_set_copper_phy_power(hw, FALSE); + + /* Determine LCD */ + status = ixgbe_get_lcd_t_x550em(hw, &lcd_speed); + + if (status != IXGBE_SUCCESS) + return status; + + /* If no valid LCD link speed, then force link down and exit. */ + if (lcd_speed == IXGBE_LINK_SPEED_UNKNOWN) + return ixgbe_set_copper_phy_power(hw, FALSE); + + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_VENDOR_STAT, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &speed); + + if (status != IXGBE_SUCCESS) + return status; + + /* If no link now, speed is invalid so take link down */ + status = ixgbe_ext_phy_t_x550em_get_link(hw, &link_up); + if (status != IXGBE_SUCCESS) + return ixgbe_set_copper_phy_power(hw, FALSE); + + /* clear everything but the speed bits */ + speed &= IXGBE_MDIO_AUTO_NEG_VEN_STAT_SPEED_MASK; + + /* If current speed is already LCD, then exit. */ + if (((speed == IXGBE_MDIO_AUTO_NEG_VENDOR_STATUS_1GB) && + (lcd_speed == IXGBE_LINK_SPEED_1GB_FULL)) || + ((speed == IXGBE_MDIO_AUTO_NEG_VENDOR_STATUS_10GB) && + (lcd_speed == IXGBE_LINK_SPEED_10GB_FULL))) + return status; + + /* Clear AN completed indication */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_VENDOR_TX_ALARM, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_reg); + + if (status != IXGBE_SUCCESS) + return status; + + status = hw->phy.ops.read_reg(hw, IXGBE_MII_10GBASE_T_AUTONEG_CTRL_REG, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &an_10g_cntl_reg); + + if (status != IXGBE_SUCCESS) + return status; + + status = hw->phy.ops.read_reg(hw, + IXGBE_MII_AUTONEG_VENDOR_PROVISION_1_REG, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_reg); + + if (status != IXGBE_SUCCESS) + return status; + + save_autoneg = hw->phy.autoneg_advertised; + + /* Setup link at least common link speed */ + status = hw->mac.ops.setup_link(hw, lcd_speed, FALSE); + + /* restore autoneg from before setting lplu speed */ + hw->phy.autoneg_advertised = save_autoneg; + + return status; +} + +/** + * ixgbe_get_lcd_x550em - Determine lowest common denominator + * @hw: pointer to hardware structure + * @lcd_speed: pointer to lowest common link speed + * + * Determine lowest common link speed with link partner. + **/ +s32 ixgbe_get_lcd_t_x550em(struct ixgbe_hw *hw, ixgbe_link_speed *lcd_speed) +{ + u16 an_lp_status; + s32 status; + u16 word = hw->eeprom.ctrl_word_3; + + *lcd_speed = IXGBE_LINK_SPEED_UNKNOWN; + + status = hw->phy.ops.read_reg(hw, IXGBE_AUTO_NEG_LP_STATUS, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &an_lp_status); + + if (status != IXGBE_SUCCESS) + return status; + + /* If link partner advertised 1G, return 1G */ + if (an_lp_status & IXGBE_AUTO_NEG_LP_1000BASE_CAP) { + *lcd_speed = IXGBE_LINK_SPEED_1GB_FULL; + return status; + } + + /* If 10G disabled for LPLU via NVM D10GMP, then return no valid LCD */ + if ((hw->bus.lan_id && (word & NVM_INIT_CTRL_3_D10GMP_PORT1)) || + (word & NVM_INIT_CTRL_3_D10GMP_PORT0)) + return status; + + /* Link partner not capable of lower speeds, return 10G */ + *lcd_speed = IXGBE_LINK_SPEED_10GB_FULL; + return status; +} + +/** + * ixgbe_setup_fc_X550em - Set up flow control + * @hw: pointer to hardware structure + * + * Called at init time to set up flow control. + **/ +s32 ixgbe_setup_fc_X550em(struct ixgbe_hw *hw) +{ + s32 ret_val = IXGBE_SUCCESS; + u32 pause, asm_dir, reg_val; + + DEBUGFUNC("ixgbe_setup_fc_X550em"); + + /* Validate the requested mode */ + if (hw->fc.strict_ieee && hw->fc.requested_mode == ixgbe_fc_rx_pause) { + ERROR_REPORT1(IXGBE_ERROR_UNSUPPORTED, + "ixgbe_fc_rx_pause not valid in strict IEEE mode\n"); + ret_val = IXGBE_ERR_INVALID_LINK_SETTINGS; + goto out; + } + + /* 10gig parts do not have a word in the EEPROM to determine the + * default flow control setting, so we explicitly set it to full. + */ + if (hw->fc.requested_mode == ixgbe_fc_default) + hw->fc.requested_mode = ixgbe_fc_full; + + /* Determine PAUSE and ASM_DIR bits. */ + switch (hw->fc.requested_mode) { + case ixgbe_fc_none: + pause = 0; + asm_dir = 0; + break; + case ixgbe_fc_tx_pause: + pause = 0; + asm_dir = 1; + break; + case ixgbe_fc_rx_pause: + /* Rx Flow control is enabled and Tx Flow control is + * disabled by software override. Since there really + * isn't a way to advertise that we are capable of RX + * Pause ONLY, we will advertise that we support both + * symmetric and asymmetric Rx PAUSE, as such we fall + * through to the fc_full statement. Later, we will + * disable the adapter's ability to send PAUSE frames. + */ + case ixgbe_fc_full: + pause = 1; + asm_dir = 1; + break; + default: + ERROR_REPORT1(IXGBE_ERROR_ARGUMENT, + "Flow control param set incorrectly\n"); + ret_val = IXGBE_ERR_CONFIG; + goto out; + } + + if (hw->phy.media_type == ixgbe_media_type_backplane) { + ret_val = ixgbe_read_iosf_sb_reg_x550(hw, + IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, ®_val); + if (ret_val != IXGBE_SUCCESS) + goto out; + reg_val &= ~(IXGBE_KRM_AN_CNTL_1_SYM_PAUSE | + IXGBE_KRM_AN_CNTL_1_ASM_PAUSE); + if (pause) + reg_val |= IXGBE_KRM_AN_CNTL_1_SYM_PAUSE; + if (asm_dir) + reg_val |= IXGBE_KRM_AN_CNTL_1_ASM_PAUSE; + ret_val = ixgbe_write_iosf_sb_reg_x550(hw, + IXGBE_KRM_AN_CNTL_1(hw->bus.lan_id), + IXGBE_SB_IOSF_TARGET_KR_PHY, reg_val); + + /* Not all devices fully support AN. */ + if (hw->device_id == IXGBE_DEV_ID_X550EM_X_KR) + hw->fc.disable_fc_autoneg = TRUE; + } + +out: + return ret_val; +} + +/** + * ixgbe_set_mux - Set mux for port 1 access with CS4227 + * @hw: pointer to hardware structure + * @state: set mux if 1, clear if 0 + */ +static void ixgbe_set_mux(struct ixgbe_hw *hw, u8 state) +{ + u32 esdp; + + if (!hw->bus.lan_id) + return; + esdp = IXGBE_READ_REG(hw, IXGBE_ESDP); + if (state) + esdp |= IXGBE_ESDP_SDP1; + else + esdp &= ~IXGBE_ESDP_SDP1; + IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp); + IXGBE_WRITE_FLUSH(hw); +} + +/** + * ixgbe_acquire_swfw_sync_X550em - Acquire SWFW semaphore + * @hw: pointer to hardware structure + * @mask: Mask to specify which semaphore to acquire + * + * Acquires the SWFW semaphore and sets the I2C MUX + **/ +s32 ixgbe_acquire_swfw_sync_X550em(struct ixgbe_hw *hw, u32 mask) +{ + s32 status; + + DEBUGFUNC("ixgbe_acquire_swfw_sync_X550em"); + + status = ixgbe_acquire_swfw_sync_X540(hw, mask); + if (status) + return status; + + if (mask & IXGBE_GSSR_I2C_MASK) + ixgbe_set_mux(hw, 1); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_release_swfw_sync_X550em - Release SWFW semaphore + * @hw: pointer to hardware structure + * @mask: Mask to specify which semaphore to release + * + * Releases the SWFW semaphore and sets the I2C MUX + **/ +void ixgbe_release_swfw_sync_X550em(struct ixgbe_hw *hw, u32 mask) +{ + DEBUGFUNC("ixgbe_release_swfw_sync_X550em"); + + if (mask & IXGBE_GSSR_I2C_MASK) + ixgbe_set_mux(hw, 0); + + ixgbe_release_swfw_sync_X540(hw, mask); +} + +/** + * ixgbe_handle_lasi_ext_t_x550em - Handle external Base T PHY interrupt + * @hw: pointer to hardware structure + * + * Handle external Base T PHY interrupt. If high temperature + * failure alarm then return error, else if link status change + * then setup internal/external PHY link + * + * Return IXGBE_ERR_OVERTEMP if interrupt is high temperature + * failure alarm, else return PHY access status. + */ +s32 ixgbe_handle_lasi_ext_t_x550em(struct ixgbe_hw *hw) +{ + bool lsc; + u32 status; + + status = ixgbe_get_lasi_ext_t_x550em(hw, &lsc); + + if (status != IXGBE_SUCCESS) + return status; + + if (lsc) + return ixgbe_setup_internal_phy(hw); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_setup_mac_link_t_X550em - Sets the auto advertised link speed + * @hw: pointer to hardware structure + * @speed: new link speed + * @autoneg_wait_to_complete: TRUE when waiting for completion is needed + * + * Setup internal/external PHY link speed based on link speed, then set + * external PHY auto advertised link speed. + * + * Returns error status for any failure + **/ +s32 ixgbe_setup_mac_link_t_X550em(struct ixgbe_hw *hw, + ixgbe_link_speed speed, + bool autoneg_wait_to_complete) +{ + s32 status; + ixgbe_link_speed force_speed; + + DEBUGFUNC("ixgbe_setup_mac_link_t_X550em"); + + /* Setup internal/external PHY link speed to iXFI (10G), unless + * only 1G is auto advertised then setup KX link. + */ + if (speed & IXGBE_LINK_SPEED_10GB_FULL) + force_speed = IXGBE_LINK_SPEED_10GB_FULL; + else + force_speed = IXGBE_LINK_SPEED_1GB_FULL; + + /* If internal link mode is XFI, then setup XFI internal link. */ + if (!(hw->phy.nw_mng_if_sel & IXGBE_NW_MNG_IF_SEL_INT_PHY_MODE)) { + status = ixgbe_setup_ixfi_x550em(hw, &force_speed); + + if (status != IXGBE_SUCCESS) + return status; + } + + return hw->phy.ops.setup_link_speed(hw, speed, autoneg_wait_to_complete); +} + +/** + * ixgbe_check_link_t_X550em - Determine link and speed status + * @hw: pointer to hardware structure + * @speed: pointer to link speed + * @link_up: TRUE when link is up + * @link_up_wait_to_complete: bool used to wait for link up or not + * + * Check that both the MAC and X557 external PHY have link. + **/ +s32 ixgbe_check_link_t_X550em(struct ixgbe_hw *hw, ixgbe_link_speed *speed, + bool *link_up, bool link_up_wait_to_complete) +{ + u32 status; + u16 autoneg_status; + + if (hw->mac.ops.get_media_type(hw) != ixgbe_media_type_copper) + return IXGBE_ERR_CONFIG; + + status = ixgbe_check_mac_link_generic(hw, speed, link_up, + link_up_wait_to_complete); + + /* If check link fails or MAC link is not up, then return */ + if (status != IXGBE_SUCCESS || !(*link_up)) + return status; + + /* MAC link is up, so check external PHY link. + * Read this twice back to back to indicate current status. + */ + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_STATUS, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_status); + + if (status != IXGBE_SUCCESS) + return status; + + status = hw->phy.ops.read_reg(hw, IXGBE_MDIO_AUTO_NEG_STATUS, + IXGBE_MDIO_AUTO_NEG_DEV_TYPE, + &autoneg_status); + + if (status != IXGBE_SUCCESS) + return status; + + /* If external PHY link is not up, then indicate link not up */ + if (!(autoneg_status & IXGBE_MDIO_AUTO_NEG_LINK_STATUS)) + *link_up = FALSE; + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_reset_phy_t_X550em - Performs X557 PHY reset and enables LASI + * @hw: pointer to hardware structure + **/ +s32 ixgbe_reset_phy_t_X550em(struct ixgbe_hw *hw) +{ + s32 status; + + status = ixgbe_reset_phy_generic(hw); + + if (status != IXGBE_SUCCESS) + return status; + + /* Configure Link Status Alarm and Temperature Threshold interrupts */ + return ixgbe_enable_lasi_ext_t_x550em(hw); +} + +/** + * ixgbe_led_on_t_X550em - Turns on the software controllable LEDs. + * @hw: pointer to hardware structure + * @led_idx: led number to turn on + **/ +s32 ixgbe_led_on_t_X550em(struct ixgbe_hw *hw, u32 led_idx) +{ + u16 phy_data; + + DEBUGFUNC("ixgbe_led_on_t_X550em"); + + if (led_idx >= IXGBE_X557_MAX_LED_INDEX) + return IXGBE_ERR_PARAM; + + /* To turn on the LED, set mode to ON. */ + ixgbe_read_phy_reg(hw, IXGBE_X557_LED_PROVISIONING + led_idx, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, &phy_data); + phy_data |= IXGBE_X557_LED_MANUAL_SET_MASK; + ixgbe_write_phy_reg(hw, IXGBE_X557_LED_PROVISIONING + led_idx, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, phy_data); + + return IXGBE_SUCCESS; +} + +/** + * ixgbe_led_off_t_X550em - Turns off the software controllable LEDs. + * @hw: pointer to hardware structure + * @led_idx: led number to turn off + **/ +s32 ixgbe_led_off_t_X550em(struct ixgbe_hw *hw, u32 led_idx) +{ + u16 phy_data; + + DEBUGFUNC("ixgbe_led_off_t_X550em"); + + if (led_idx >= IXGBE_X557_MAX_LED_INDEX) + return IXGBE_ERR_PARAM; + + /* To turn on the LED, set mode to ON. */ + ixgbe_read_phy_reg(hw, IXGBE_X557_LED_PROVISIONING + led_idx, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, &phy_data); + phy_data &= ~IXGBE_X557_LED_MANUAL_SET_MASK; + ixgbe_write_phy_reg(hw, IXGBE_X557_LED_PROVISIONING + led_idx, + IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, phy_data); + + return IXGBE_SUCCESS; +} + diff --git a/sys/dev/ixgbe/ixgbe_x550.h b/sys/dev/ixgbe/ixgbe_x550.h new file mode 100644 index 0000000..8a544ec --- /dev/null +++ b/sys/dev/ixgbe/ixgbe_x550.h @@ -0,0 +1,109 @@ +/****************************************************************************** + + Copyright (c) 2001-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD$*/ + +#ifndef _IXGBE_X550_H_ +#define _IXGBE_X550_H_ + +#include "ixgbe_type.h" + +s32 ixgbe_dmac_config_X550(struct ixgbe_hw *hw); +s32 ixgbe_dmac_config_tcs_X550(struct ixgbe_hw *hw); +s32 ixgbe_dmac_update_tcs_X550(struct ixgbe_hw *hw); + +s32 ixgbe_get_bus_info_X550em(struct ixgbe_hw *hw); +s32 ixgbe_init_eeprom_params_X550(struct ixgbe_hw *hw); +s32 ixgbe_update_eeprom_checksum_X550(struct ixgbe_hw *hw); +s32 ixgbe_calc_eeprom_checksum_X550(struct ixgbe_hw *hw); +s32 ixgbe_calc_checksum_X550(struct ixgbe_hw *hw, u16 *buffer, u32 buffer_size); +s32 ixgbe_validate_eeprom_checksum_X550(struct ixgbe_hw *hw, u16 *checksum_val); +s32 ixgbe_update_flash_X550(struct ixgbe_hw *hw); +s32 ixgbe_write_ee_hostif_buffer_X550(struct ixgbe_hw *hw, + u16 offset, u16 words, u16 *data); +s32 ixgbe_write_ee_hostif_X550(struct ixgbe_hw *hw, u16 offset, + u16 data); +s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw, + u16 offset, u16 words, u16 *data); +s32 ixgbe_read_ee_hostif_X550(struct ixgbe_hw *hw, u16 offset, +u16 *data); +s32 ixgbe_read_ee_hostif_data_X550(struct ixgbe_hw *hw, u16 offset, + u16 *data); +s32 ixgbe_write_ee_hostif_data_X550(struct ixgbe_hw *hw, u16 offset, + u16 data); +s32 ixgbe_set_eee_X550(struct ixgbe_hw *hw, bool enable_eee); +s32 ixgbe_setup_eee_X550(struct ixgbe_hw *hw, bool enable_eee); +void ixgbe_set_source_address_pruning_X550(struct ixgbe_hw *hw, bool enable, + unsigned int pool); +void ixgbe_set_ethertype_anti_spoofing_X550(struct ixgbe_hw *hw, + bool enable, int vf); +s32 ixgbe_write_iosf_sb_reg_x550(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u32 data); +s32 ixgbe_read_iosf_sb_reg_x550(struct ixgbe_hw *hw, u32 reg_addr, + u32 device_type, u32 *data); +void ixgbe_disable_mdd_X550(struct ixgbe_hw *hw); +void ixgbe_enable_mdd_X550(struct ixgbe_hw *hw); +void ixgbe_mdd_event_X550(struct ixgbe_hw *hw, u32 *vf_bitmap); +void ixgbe_restore_mdd_vf_X550(struct ixgbe_hw *hw, u32 vf); +enum ixgbe_media_type ixgbe_get_media_type_X550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_sfp_modules_X550em(struct ixgbe_hw *hw); +s32 ixgbe_get_link_capabilities_X550em(struct ixgbe_hw *hw, + ixgbe_link_speed *speed, bool *autoneg); +void ixgbe_init_mac_link_ops_X550em(struct ixgbe_hw *hw); +s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw); +s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_kr_x550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_kx4_x550em(struct ixgbe_hw *hw); +s32 ixgbe_init_ext_t_x550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_internal_phy_t_x550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_phy_loopback_x550em(struct ixgbe_hw *hw); +u32 ixgbe_get_supported_physical_layer_X550em(struct ixgbe_hw *hw); +void ixgbe_disable_rx_x550(struct ixgbe_hw *hw); +s32 ixgbe_get_lcd_t_x550em(struct ixgbe_hw *hw, ixgbe_link_speed *lcd_speed); +s32 ixgbe_enter_lplu_t_x550em(struct ixgbe_hw *hw); +s32 ixgbe_acquire_swfw_sync_X550em(struct ixgbe_hw *hw, u32 mask); +void ixgbe_release_swfw_sync_X550em(struct ixgbe_hw *hw, u32 mask); +s32 ixgbe_setup_fc_X550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_mac_link_sfp_x550em(struct ixgbe_hw *hw, + ixgbe_link_speed speed, + bool autoneg_wait_to_complete); +s32 ixgbe_handle_lasi_ext_t_x550em(struct ixgbe_hw *hw); +s32 ixgbe_setup_mac_link_t_X550em(struct ixgbe_hw *hw, + ixgbe_link_speed speed, + bool autoneg_wait_to_complete); +s32 ixgbe_check_link_t_X550em(struct ixgbe_hw *hw, ixgbe_link_speed *speed, + bool *link_up, bool link_up_wait_to_complete); +s32 ixgbe_reset_phy_t_X550em(struct ixgbe_hw *hw); +s32 ixgbe_identify_sfp_module_X550em(struct ixgbe_hw *hw); +s32 ixgbe_led_on_t_X550em(struct ixgbe_hw *hw, u32 led_idx); +s32 ixgbe_led_off_t_X550em(struct ixgbe_hw *hw, u32 led_idx); +#endif /* _IXGBE_X550_H_ */ diff --git a/sys/dev/ixl/ixl.h b/sys/dev/ixl/ixl.h index df8f04f..1ddfbca3 100644 --- a/sys/dev/ixl/ixl.h +++ b/sys/dev/ixl/ixl.h @@ -324,7 +324,7 @@ #define IXL_SET_IMCASTS(vsi, count) (vsi)->imcasts = (count) #define IXL_SET_OMCASTS(vsi, count) (vsi)->omcasts = (count) #define IXL_SET_IQDROPS(vsi, count) (vsi)->iqdrops = (count) -#define IXL_SET_OQDROPS(vsi, count) (vsi)->iqdrops = (count) +#define IXL_SET_OQDROPS(vsi, count) (vsi)->oqdrops = (count) #define IXL_SET_NOPROTO(vsi, count) (vsi)->noproto = (count) #else #define IXL_SET_IPACKETS(vsi, count) (vsi)->ifp->if_ipackets = (count) diff --git a/sys/dev/mii/brgphy.c b/sys/dev/mii/brgphy.c index 0d22430..32914a3 100644 --- a/sys/dev/mii/brgphy.c +++ b/sys/dev/mii/brgphy.c @@ -160,25 +160,33 @@ static const struct mii_phy_funcs brgphy_funcs = { brgphy_reset }; -#define HS21_PRODUCT_ID "IBM eServer BladeCenter HS21" -#define HS21_BCM_CHIPID 0x57081021 +static const struct hs21_type { + const uint32_t id; + const char *prod; +} hs21_type_lists[] = { + { 0x57081021, "IBM eServer BladeCenter HS21" }, + { 0x57081011, "IBM eServer BladeCenter HS21 -[8853PAU]-" }, +}; static int detect_hs21(struct bce_softc *bce_sc) { char *sysenv; - int found; + int found, i; found = 0; - if (bce_sc->bce_chipid == HS21_BCM_CHIPID) { - sysenv = kern_getenv("smbios.system.product"); - if (sysenv != NULL) { - if (strncmp(sysenv, HS21_PRODUCT_ID, - strlen(HS21_PRODUCT_ID)) == 0) - found = 1; - freeenv(sysenv); + sysenv = kern_getenv("smbios.system.product"); + if (sysenv == NULL) + return (found); + for (i = 0; i < nitems(hs21_type_lists); i++) { + if (bce_sc->bce_chipid == hs21_type_lists[i].id && + strncmp(sysenv, hs21_type_lists[i].prod, + strlen(hs21_type_lists[i].prod)) == 0) { + found++; + break; } } + freeenv(sysenv); return (found); } diff --git a/sys/dev/mxge/if_mxge.c b/sys/dev/mxge/if_mxge.c index fe8c08a..983caa3 100644 --- a/sys/dev/mxge/if_mxge.c +++ b/sys/dev/mxge/if_mxge.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/sx.h> #include <sys/taskqueue.h> +#include <sys/zlib.h> #include <net/if.h> #include <net/if_var.h> @@ -58,7 +59,6 @@ __FBSDID("$FreeBSD$"); #include <net/if_types.h> #include <net/if_vlan_var.h> -#include <net/zlib.h> #include <netinet/in_systm.h> #include <netinet/in.h> diff --git a/sys/dev/pccbb/pccbb_pci.c b/sys/dev/pccbb/pccbb_pci.c index d4f1baa..7dca418 100644 --- a/sys/dev/pccbb/pccbb_pci.c +++ b/sys/dev/pccbb/pccbb_pci.c @@ -259,32 +259,6 @@ cbb_pci_probe(device_t brdev) } /* - * Still need this because the pci code only does power for type 0 - * header devices. - */ -static void -cbb_powerstate_d0(device_t dev) -{ - u_int32_t membase, irq; - - if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0) { - /* Save important PCI config data. */ - membase = pci_read_config(dev, CBBR_SOCKBASE, 4); - irq = pci_read_config(dev, PCIR_INTLINE, 4); - - /* Reset the power state. */ - device_printf(dev, "chip is in D%d power mode " - "-- setting to D0\n", pci_get_powerstate(dev)); - - pci_set_powerstate(dev, PCI_POWERSTATE_D0); - - /* Restore PCI config data. */ - pci_write_config(dev, CBBR_SOCKBASE, membase, 4); - pci_write_config(dev, PCIR_INTLINE, irq, 4); - } -} - -/* * Print out the config space */ static void @@ -321,15 +295,15 @@ cbb_pci_attach(device_t brdev) sc->cbdev = NULL; sc->exca[0].pccarddev = NULL; sc->domain = pci_get_domain(brdev); - sc->bus.sec = pci_read_config(brdev, PCIR_SECBUS_2, 1); - sc->bus.sub = pci_read_config(brdev, PCIR_SUBBUS_2, 1); sc->pribus = pcib_get_bus(parent); #if defined(NEW_PCIB) && defined(PCI_RES_BUS) pci_write_config(brdev, PCIR_PRIBUS_2, sc->pribus, 1); pcib_setup_secbus(brdev, &sc->bus, 1); +#else + sc->bus.sec = pci_read_config(brdev, PCIR_SECBUS_2, 1); + sc->bus.sub = pci_read_config(brdev, PCIR_SUBBUS_2, 1); #endif SLIST_INIT(&sc->rl); - cbb_powerstate_d0(brdev); rid = CBBR_SOCKBASE; sc->base_res = bus_alloc_resource_any(brdev, SYS_RES_MEMORY, &rid, @@ -467,18 +441,13 @@ cbb_chipinit(struct cbb_softc *sc) uint32_t mux, sysctrl, reg; /* Set CardBus latency timer */ - if (pci_read_config(sc->dev, PCIR_SECLAT_1, 1) < 0x20) - pci_write_config(sc->dev, PCIR_SECLAT_1, 0x20, 1); + if (pci_read_config(sc->dev, PCIR_SECLAT_2, 1) < 0x20) + pci_write_config(sc->dev, PCIR_SECLAT_2, 0x20, 1); /* Set PCI latency timer */ if (pci_read_config(sc->dev, PCIR_LATTIMER, 1) < 0x20) pci_write_config(sc->dev, PCIR_LATTIMER, 0x20, 1); - /* Restore bus configuration */ - pci_write_config(sc->dev, PCIR_PRIBUS_2, sc->pribus, 1); - pci_write_config(sc->dev, PCIR_SECBUS_2, sc->bus.sec, 1); - pci_write_config(sc->dev, PCIR_SUBBUS_2, sc->bus.sub, 1); - /* Enable DMA, memory access for this card and I/O acces for children */ pci_enable_busmaster(sc->dev); pci_enable_io(sc->dev, SYS_RES_IOPORT); @@ -906,15 +875,10 @@ cbb_pci_resume(device_t brdev) * from D0 and back to D0 cause the bridge to lose its config space, so * all the bus mappings and such are preserved. * - * For most drivers, the PCI layer handles this saving. However, since - * there's much black magic and arcane art hidden in these few lines of - * code that would be difficult to transition into the PCI - * layer. chipinit was several years of trial and error to write. + * The PCI layer handles standard PCI registers like the + * command register and BARs, but cbb-specific registers are + * handled here. */ - pci_write_config(brdev, CBBR_SOCKBASE, rman_get_start(sc->base_res), 4); - DEVPRINTF((brdev, "PCI Memory allocated: %08lx\n", - rman_get_start(sc->base_res))); - sc->chipinit(sc); /* reset interrupt -- Do we really need to do this? */ diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 3fab486..b4c6151 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -583,12 +583,24 @@ pci_hdrtypedata(device_t pcib, int b, int s, int f, pcicfgregs *cfg) case PCIM_HDRTYPE_NORMAL: cfg->subvendor = REG(PCIR_SUBVEND_0, 2); cfg->subdevice = REG(PCIR_SUBDEV_0, 2); + cfg->mingnt = REG(PCIR_MINGNT, 1); + cfg->maxlat = REG(PCIR_MAXLAT, 1); cfg->nummaps = PCI_MAXMAPS_0; break; case PCIM_HDRTYPE_BRIDGE: + cfg->bridge.br_seclat = REG(PCIR_SECLAT_1, 1); + cfg->bridge.br_subbus = REG(PCIR_SUBBUS_1, 1); + cfg->bridge.br_secbus = REG(PCIR_SECBUS_1, 1); + cfg->bridge.br_pribus = REG(PCIR_PRIBUS_1, 1); + cfg->bridge.br_control = REG(PCIR_BRIDGECTL_1, 2); cfg->nummaps = PCI_MAXMAPS_1; break; case PCIM_HDRTYPE_CARDBUS: + cfg->bridge.br_seclat = REG(PCIR_SECLAT_2, 1); + cfg->bridge.br_subbus = REG(PCIR_SUBBUS_2, 1); + cfg->bridge.br_secbus = REG(PCIR_SECBUS_2, 1); + cfg->bridge.br_pribus = REG(PCIR_PRIBUS_2, 1); + cfg->bridge.br_control = REG(PCIR_BRIDGECTL_2, 2); cfg->subvendor = REG(PCIR_SUBVEND_2, 2); cfg->subdevice = REG(PCIR_SUBDEV_2, 2); cfg->nummaps = PCI_MAXMAPS_2; @@ -641,9 +653,6 @@ pci_fill_devinfo(device_t pcib, int d, int b, int s, int f, uint16_t vid, cfg->intpin = REG(PCIR_INTPIN, 1); cfg->intline = REG(PCIR_INTLINE, 1); - cfg->mingnt = REG(PCIR_MINGNT, 1); - cfg->maxlat = REG(PCIR_MAXLAT, 1); - cfg->mfdev = (cfg->hdrtype & PCIM_MFDEV) != 0; cfg->hdrtype &= ~PCIM_MFDEV; STAILQ_INIT(&cfg->maps); @@ -4425,9 +4434,17 @@ pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) *result = cfg->cachelnsz; break; case PCI_IVAR_MINGNT: + if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) { + *result = -1; + return (EINVAL); + } *result = cfg->mingnt; break; case PCI_IVAR_MAXLAT: + if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) { + *result = -1; + return (EINVAL); + } *result = cfg->maxlat; break; case PCI_IVAR_LATTIMER: @@ -5125,16 +5142,6 @@ pci_cfg_restore(device_t dev, struct pci_devinfo *dinfo) { /* - * Only do header type 0 devices. Type 1 devices are bridges, - * which we know need special treatment. Type 2 devices are - * cardbus bridges which also require special treatment. - * Other types are unknown, and we err on the side of safety - * by ignoring them. - */ - if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) - return; - - /* * Restore the device to full power mode. We must do this * before we restore the registers because moving from D3 to * D0 will cause the chip's BARs and some other registers to @@ -5144,16 +5151,44 @@ pci_cfg_restore(device_t dev, struct pci_devinfo *dinfo) */ if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0) pci_set_powerstate(dev, PCI_POWERSTATE_D0); - pci_restore_bars(dev); pci_write_config(dev, PCIR_COMMAND, dinfo->cfg.cmdreg, 2); pci_write_config(dev, PCIR_INTLINE, dinfo->cfg.intline, 1); pci_write_config(dev, PCIR_INTPIN, dinfo->cfg.intpin, 1); - pci_write_config(dev, PCIR_MINGNT, dinfo->cfg.mingnt, 1); - pci_write_config(dev, PCIR_MAXLAT, dinfo->cfg.maxlat, 1); pci_write_config(dev, PCIR_CACHELNSZ, dinfo->cfg.cachelnsz, 1); pci_write_config(dev, PCIR_LATTIMER, dinfo->cfg.lattimer, 1); pci_write_config(dev, PCIR_PROGIF, dinfo->cfg.progif, 1); pci_write_config(dev, PCIR_REVID, dinfo->cfg.revid, 1); + switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) { + case PCIM_HDRTYPE_NORMAL: + pci_write_config(dev, PCIR_MINGNT, dinfo->cfg.mingnt, 1); + pci_write_config(dev, PCIR_MAXLAT, dinfo->cfg.maxlat, 1); + break; + case PCIM_HDRTYPE_BRIDGE: + pci_write_config(dev, PCIR_SECLAT_1, + dinfo->cfg.bridge.br_seclat, 1); + pci_write_config(dev, PCIR_SUBBUS_1, + dinfo->cfg.bridge.br_subbus, 1); + pci_write_config(dev, PCIR_SECBUS_1, + dinfo->cfg.bridge.br_secbus, 1); + pci_write_config(dev, PCIR_PRIBUS_1, + dinfo->cfg.bridge.br_pribus, 1); + pci_write_config(dev, PCIR_BRIDGECTL_1, + dinfo->cfg.bridge.br_control, 2); + break; + case PCIM_HDRTYPE_CARDBUS: + pci_write_config(dev, PCIR_SECLAT_2, + dinfo->cfg.bridge.br_seclat, 1); + pci_write_config(dev, PCIR_SUBBUS_2, + dinfo->cfg.bridge.br_subbus, 1); + pci_write_config(dev, PCIR_SECBUS_2, + dinfo->cfg.bridge.br_secbus, 1); + pci_write_config(dev, PCIR_PRIBUS_2, + dinfo->cfg.bridge.br_pribus, 1); + pci_write_config(dev, PCIR_BRIDGECTL_2, + dinfo->cfg.bridge.br_control, 2); + break; + } + pci_restore_bars(dev); /* * Restore extended capabilities for PCI-Express and PCI-X @@ -5222,40 +5257,57 @@ pci_cfg_save(device_t dev, struct pci_devinfo *dinfo, int setstate) int ps; /* - * Only do header type 0 devices. Type 1 devices are bridges, which - * we know need special treatment. Type 2 devices are cardbus bridges - * which also require special treatment. Other types are unknown, and - * we err on the side of safety by ignoring them. Powering down - * bridges should not be undertaken lightly. - */ - if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) - return; - - /* * Some drivers apparently write to these registers w/o updating our * cached copy. No harm happens if we update the copy, so do so here * so we can restore them. The COMMAND register is modified by the * bus w/o updating the cache. This should represent the normally - * writable portion of the 'defined' part of type 0 headers. In - * theory we also need to save/restore the PCI capability structures - * we know about, but apart from power we don't know any that are - * writable. + * writable portion of the 'defined' part of type 0/1/2 headers. */ - dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_0, 2); - dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_0, 2); dinfo->cfg.vendor = pci_read_config(dev, PCIR_VENDOR, 2); dinfo->cfg.device = pci_read_config(dev, PCIR_DEVICE, 2); dinfo->cfg.cmdreg = pci_read_config(dev, PCIR_COMMAND, 2); dinfo->cfg.intline = pci_read_config(dev, PCIR_INTLINE, 1); dinfo->cfg.intpin = pci_read_config(dev, PCIR_INTPIN, 1); - dinfo->cfg.mingnt = pci_read_config(dev, PCIR_MINGNT, 1); - dinfo->cfg.maxlat = pci_read_config(dev, PCIR_MAXLAT, 1); dinfo->cfg.cachelnsz = pci_read_config(dev, PCIR_CACHELNSZ, 1); dinfo->cfg.lattimer = pci_read_config(dev, PCIR_LATTIMER, 1); dinfo->cfg.baseclass = pci_read_config(dev, PCIR_CLASS, 1); dinfo->cfg.subclass = pci_read_config(dev, PCIR_SUBCLASS, 1); dinfo->cfg.progif = pci_read_config(dev, PCIR_PROGIF, 1); dinfo->cfg.revid = pci_read_config(dev, PCIR_REVID, 1); + switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) { + case PCIM_HDRTYPE_NORMAL: + dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_0, 2); + dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_0, 2); + dinfo->cfg.mingnt = pci_read_config(dev, PCIR_MINGNT, 1); + dinfo->cfg.maxlat = pci_read_config(dev, PCIR_MAXLAT, 1); + break; + case PCIM_HDRTYPE_BRIDGE: + dinfo->cfg.bridge.br_seclat = pci_read_config(dev, + PCIR_SECLAT_1, 1); + dinfo->cfg.bridge.br_subbus = pci_read_config(dev, + PCIR_SUBBUS_1, 1); + dinfo->cfg.bridge.br_secbus = pci_read_config(dev, + PCIR_SECBUS_1, 1); + dinfo->cfg.bridge.br_pribus = pci_read_config(dev, + PCIR_PRIBUS_1, 1); + dinfo->cfg.bridge.br_control = pci_read_config(dev, + PCIR_BRIDGECTL_1, 2); + break; + case PCIM_HDRTYPE_CARDBUS: + dinfo->cfg.bridge.br_seclat = pci_read_config(dev, + PCIR_SECLAT_2, 1); + dinfo->cfg.bridge.br_subbus = pci_read_config(dev, + PCIR_SUBBUS_2, 1); + dinfo->cfg.bridge.br_secbus = pci_read_config(dev, + PCIR_SECBUS_2, 1); + dinfo->cfg.bridge.br_pribus = pci_read_config(dev, + PCIR_PRIBUS_2, 1); + dinfo->cfg.bridge.br_control = pci_read_config(dev, + PCIR_BRIDGECTL_2, 2); + dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_2, 2); + dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_2, 2); + break; + } if (dinfo->cfg.pcie.pcie_location != 0) pci_cfg_save_pcie(dev, dinfo); diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c index d0d8fb7..789a918 100644 --- a/sys/dev/pci/pci_pci.c +++ b/sys/dev/pci/pci_pci.c @@ -553,18 +553,22 @@ void pcib_setup_secbus(device_t dev, struct pcib_secbus *bus, int min_count) { char buf[64]; - int error, rid; + int error, rid, sec_reg; switch (pci_read_config(dev, PCIR_HDRTYPE, 1) & PCIM_HDRTYPE) { case PCIM_HDRTYPE_BRIDGE: + sec_reg = PCIR_SECBUS_1; bus->sub_reg = PCIR_SUBBUS_1; break; case PCIM_HDRTYPE_CARDBUS: + sec_reg = PCIR_SECBUS_2; bus->sub_reg = PCIR_SUBBUS_2; break; default: panic("not a PCI bridge"); } + bus->sec = pci_read_config(dev, sec_reg, 1); + bus->sub = pci_read_config(dev, bus->sub_reg, 1); bus->dev = dev; bus->rman.rm_start = 0; bus->rman.rm_end = PCI_BUSMAX; @@ -849,20 +853,16 @@ pcib_set_mem_decode(struct pcib_softc *sc) static void pcib_cfg_save(struct pcib_softc *sc) { +#ifndef NEW_PCIB device_t dev; + uint16_t command; dev = sc->dev; - sc->command = pci_read_config(dev, PCIR_COMMAND, 2); - sc->pribus = pci_read_config(dev, PCIR_PRIBUS_1, 1); - sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1); - sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1); - sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2); - sc->seclat = pci_read_config(dev, PCIR_SECLAT_1, 1); -#ifndef NEW_PCIB - if (sc->command & PCIM_CMD_PORTEN) + command = pci_read_config(dev, PCIR_COMMAND, 2); + if (command & PCIM_CMD_PORTEN) pcib_get_io_decode(sc); - if (sc->command & PCIM_CMD_MEMEN) + if (command & PCIM_CMD_MEMEN) pcib_get_mem_decode(sc); #endif } @@ -874,21 +874,18 @@ static void pcib_cfg_restore(struct pcib_softc *sc) { device_t dev; - +#ifndef NEW_PCIB + uint16_t command; +#endif dev = sc->dev; - pci_write_config(dev, PCIR_COMMAND, sc->command, 2); - pci_write_config(dev, PCIR_PRIBUS_1, sc->pribus, 1); - pci_write_config(dev, PCIR_SECBUS_1, sc->bus.sec, 1); - pci_write_config(dev, PCIR_SUBBUS_1, sc->bus.sub, 1); - pci_write_config(dev, PCIR_BRIDGECTL_1, sc->bridgectl, 2); - pci_write_config(dev, PCIR_SECLAT_1, sc->seclat, 1); #ifdef NEW_PCIB pcib_write_windows(sc, WIN_IO | WIN_MEM | WIN_PMEM); #else - if (sc->command & PCIM_CMD_PORTEN) + command = pci_read_config(dev, PCIR_COMMAND, 2); + if (command & PCIM_CMD_PORTEN) pcib_set_io_decode(sc); - if (sc->command & PCIM_CMD_MEMEN) + if (command & PCIM_CMD_MEMEN) pcib_set_mem_decode(sc); #endif } @@ -922,7 +919,11 @@ pcib_attach_common(device_t dev) * Get current bridge configuration. */ sc->domain = pci_get_domain(dev); - sc->secstat = pci_read_config(dev, PCIR_SECSTAT_1, 2); +#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) + sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1); + sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1); +#endif + sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2); pcib_cfg_save(sc); /* @@ -950,7 +951,7 @@ pcib_attach_common(device_t dev) * Quirk handling. */ switch (pci_get_devid(dev)) { -#if !defined(NEW_PCIB) && !defined(PCI_RES_BUS) +#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) case 0x12258086: /* Intel 82454KX/GX (Orion) */ { uint8_t supbus; @@ -976,7 +977,7 @@ pcib_attach_common(device_t dev) sc->flags |= PCIB_SUBTRACTIVE; break; -#if !defined(NEW_PCIB) && !defined(PCI_RES_BUS) +#if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) /* Compaq R3000 BIOS sets wrong subordinate bus number. */ case 0x00dd10de: { @@ -1101,32 +1102,15 @@ pcib_attach(device_t dev) int pcib_suspend(device_t dev) { - device_t pcib; - int dstate, error; pcib_cfg_save(device_get_softc(dev)); - error = bus_generic_suspend(dev); - if (error == 0 && pci_do_power_suspend) { - dstate = PCI_POWERSTATE_D3; - pcib = device_get_parent(device_get_parent(dev)); - if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0) - pci_set_powerstate(dev, dstate); - } - return (error); + return (bus_generic_suspend(dev)); } int pcib_resume(device_t dev) { - device_t pcib; - int dstate; - - if (pci_do_power_resume) { - pcib = device_get_parent(device_get_parent(dev)); - dstate = PCI_POWERSTATE_D0; - if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0) - pci_set_powerstate(dev, dstate); - } + pcib_cfg_restore(device_get_softc(dev)); return (bus_generic_resume(dev)); } diff --git a/sys/dev/pci/pci_subr.c b/sys/dev/pci/pci_subr.c index 5d0db36..03bcdc0 100644 --- a/sys/dev/pci/pci_subr.c +++ b/sys/dev/pci/pci_subr.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 Advanced Computing Technologies LLC + * Copyright (c) 2011 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/dev/pci/pcib_private.h b/sys/dev/pci/pcib_private.h index d8d82b6..20cb014 100644 --- a/sys/dev/pci/pcib_private.h +++ b/sys/dev/pci/pcib_private.h @@ -106,7 +106,6 @@ struct pcib_softc #define PCIB_DISABLE_MSI 0x2 #define PCIB_DISABLE_MSIX 0x4 #define PCIB_ENABLE_ARI 0x8 - uint16_t command; /* command register */ u_int domain; /* domain number */ u_int pribus; /* primary bus number */ struct pcib_secbus bus; /* secondary bus numbers */ @@ -122,9 +121,7 @@ struct pcib_softc uint32_t iobase; /* base address of port window */ uint32_t iolimit; /* topmost address of port window */ #endif - uint16_t secstat; /* secondary bus status register */ uint16_t bridgectl; /* bridge control register */ - uint8_t seclat; /* secondary bus latency timer */ }; #define PCIB_SUPPORTED_ARI_VER 1 diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h index 478c98e..2fd76b6 100644 --- a/sys/dev/pci/pcivar.h +++ b/sys/dev/pci/pcivar.h @@ -41,6 +41,15 @@ typedef uint64_t pci_addr_t; struct nvlist; +/* Config registers for PCI-PCI and PCI-Cardbus bridges. */ +struct pcicfg_bridge { + uint8_t br_seclat; + uint8_t br_subbus; + uint8_t br_secbus; + uint8_t br_pribus; + uint16_t br_control; +}; + /* Interesting values for PCI power management */ struct pcicfg_pp { uint16_t pp_cap; /* PCI power management capabilities */ @@ -190,6 +199,7 @@ typedef struct pcicfg { uint32_t flags; /* flags defined above */ size_t devinfo_size; /* Size of devinfo for this bus type. */ + struct pcicfg_bridge bridge; /* Bridges */ struct pcicfg_pp pp; /* Power management */ struct pcicfg_vpd vpd; /* Vital product data */ struct pcicfg_msi msi; /* PCI MSI */ diff --git a/sys/dev/smbus/smb.c b/sys/dev/smbus/smb.c index 579204d..e842441 100644 --- a/sys/dev/smbus/smb.c +++ b/sys/dev/smbus/smb.c @@ -26,10 +26,6 @@ * $FreeBSD$ */ -#ifdef HAVE_KERNEL_OPTION_HEADERS -#include "opt_compat.h" -#endif - #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> @@ -104,19 +100,24 @@ smb_identify(driver_t *driver, device_t parent) static int smb_probe(device_t dev) { - device_set_desc(dev, "SMBus generic I/O"); + if (smbus_get_addr(dev) != -1) + return (ENXIO); - return (0); + device_set_desc(dev, "SMBus generic I/O"); + return (BUS_PROBE_NOWILDCARD); } static int smb_attach(device_t dev) { struct smb_softc *sc = device_get_softc(dev); - + int unit; + + unit = device_get_unit(dev); sc->sc_dev = dev; - sc->sc_devnode = make_dev(&smb_cdevsw, device_get_unit(dev), - UID_ROOT, GID_WHEEL, 0600, "smb%d", device_get_unit(dev)); + + sc->sc_devnode = make_dev(&smb_cdevsw, unit, UID_ROOT, GID_WHEEL, + 0600, "smb%d", unit); sc->sc_devnode->si_drv1 = sc; mtx_init(&sc->sc_lock, device_get_nameunit(dev), NULL, MTX_DEF); @@ -174,9 +175,16 @@ smbioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags, struct thread *t struct smb_softc *sc = dev->si_drv1; device_t smbdev = sc->sc_dev; int error; - short w; - u_char count; - char c; + int unit; + u_char bcount; + + /* + * If a specific slave device is being used, override any passed-in + * slave. + */ + unit = dev2unit(dev); + if (unit & 0x0400) + s->slave = unit & 0x03ff; parent = device_get_parent(smbdev); @@ -208,77 +216,101 @@ smbioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags, struct thread *t case SMB_WRITEB: error = smbus_error(smbus_writeb(parent, s->slave, s->cmd, - s->data.byte)); + s->wdata.byte)); break; case SMB_WRITEW: error = smbus_error(smbus_writew(parent, s->slave, - s->cmd, s->data.word)); + s->cmd, s->wdata.word)); break; case SMB_READB: - if (s->data.byte_ptr) { - error = smbus_error(smbus_readb(parent, s->slave, - s->cmd, &c)); - if (error) - break; - error = copyout(&c, s->data.byte_ptr, - sizeof(*(s->data.byte_ptr))); + error = smbus_error(smbus_readb(parent, s->slave, s->cmd, + &s->rdata.byte)); + if (error) + break; + if (s->rbuf && s->rcount >= 1) { + error = copyout(&s->rdata.byte, s->rbuf, 1); + s->rcount = 1; } break; case SMB_READW: - if (s->data.word_ptr) { - error = smbus_error(smbus_readw(parent, s->slave, - s->cmd, &w)); - if (error == 0) { - error = copyout(&w, s->data.word_ptr, - sizeof(*(s->data.word_ptr))); - } + error = smbus_error(smbus_readw(parent, s->slave, s->cmd, + &s->rdata.word)); + if (error) + break; + if (s->rbuf && s->rcount >= 2) { + buf[0] = (u_char)s->rdata.word; + buf[1] = (u_char)(s->rdata.word >> 8); + error = copyout(buf, s->rbuf, 2); + s->rcount = 2; } break; case SMB_PCALL: - if (s->data.process.rdata) { - - error = smbus_error(smbus_pcall(parent, s->slave, s->cmd, - s->data.process.sdata, &w)); - if (error) - break; - error = copyout(&w, s->data.process.rdata, - sizeof(*(s->data.process.rdata))); + error = smbus_error(smbus_pcall(parent, s->slave, s->cmd, + s->wdata.word, &s->rdata.word)); + if (error) + break; + if (s->rbuf && s->rcount >= 2) { + buf[0] = (u_char)s->rdata.word; + buf[1] = (u_char)(s->rdata.word >> 8); + error = copyout(buf, s->rbuf, 2); + s->rcount = 2; } - + break; case SMB_BWRITE: - if (s->count && s->data.byte_ptr) { - if (s->count > SMB_MAXBLOCKSIZE) - s->count = SMB_MAXBLOCKSIZE; - error = copyin(s->data.byte_ptr, buf, s->count); - if (error) - break; - error = smbus_error(smbus_bwrite(parent, s->slave, - s->cmd, s->count, buf)); + if (s->wcount < 0) { + error = EINVAL; + break; } + if (s->wcount > SMB_MAXBLOCKSIZE) + s->wcount = SMB_MAXBLOCKSIZE; + if (s->wcount) + error = copyin(s->wbuf, buf, s->wcount); + if (error) + break; + error = smbus_error(smbus_bwrite(parent, s->slave, s->cmd, + s->wcount, buf)); break; -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD6) - case SMB_OLD_BREAD: -#endif case SMB_BREAD: - if (s->count && s->data.byte_ptr) { - count = min(s->count, SMB_MAXBLOCKSIZE); - error = smbus_error(smbus_bread(parent, s->slave, - s->cmd, &count, buf)); - if (error) - break; - error = copyout(buf, s->data.byte_ptr, - min(count, s->count)); - s->count = count; + if (s->rcount < 0) { + error = EINVAL; + break; + } + if (s->rcount > SMB_MAXBLOCKSIZE) + s->rcount = SMB_MAXBLOCKSIZE; + error = smbus_error(smbus_bread(parent, s->slave, s->cmd, + &bcount, buf)); + if (error) + break; + if (s->rcount > bcount) + s->rcount = bcount; + error = copyout(buf, s->rbuf, s->rcount); + break; + + case SMB_TRANS: + if (s->rcount < 0 || s->wcount < 0) { + error = EINVAL; + break; } + if (s->rcount > SMB_MAXBLOCKSIZE) + s->rcount = SMB_MAXBLOCKSIZE; + if (s->wcount > SMB_MAXBLOCKSIZE) + s->wcount = SMB_MAXBLOCKSIZE; + if (s->wcount) + error = copyin(s->wbuf, buf, s->wcount); + if (error) + break; + error = smbus_error(smbus_trans(parent, s->slave, s->cmd, + s->op, buf, s->wcount, buf, s->rcount, &s->rcount)); + if (error == 0) + error = copyout(buf, s->rbuf, s->rcount); break; - default: error = ENOTTY; } diff --git a/sys/dev/smbus/smb.h b/sys/dev/smbus/smb.h index 32347f8..8007653 100644 --- a/sys/dev/smbus/smb.h +++ b/sys/dev/smbus/smb.h @@ -32,27 +32,33 @@ #include <sys/ioccom.h> struct smbcmd { - char cmd; - int count; - u_char slave; + u_char cmd; + u_char reserved; + u_short op; union { - char byte; - short word; - - char *byte_ptr; - short *word_ptr; - - struct { - short sdata; - short *rdata; - } process; - } data; + char byte; + char buf[2]; + short word; + } wdata; + union { + char byte; + char buf[2]; + short word; + } rdata; + int slave; + char *wbuf; /* use wdata if NULL */ + int wcount; + char *rbuf; /* use rdata if NULL */ + int rcount; }; /* * SMBus spec 2.0 says block transfers may be at most 32 bytes. + * We use SMBus for i2c as well, make the size limit something more + * reasonable. Keep in mind that a char buf array is declared on the + * kernel stack. */ -#define SMB_MAXBLOCKSIZE 32 +#define SMB_MAXBLOCKSIZE 1024 #define SMB_QUICK_WRITE _IOW('i', 1, struct smbcmd) #define SMB_QUICK_READ _IOW('i', 2, struct smbcmd) @@ -66,5 +72,6 @@ struct smbcmd { #define SMB_BWRITE _IOW('i', 10, struct smbcmd) #define SMB_OLD_BREAD _IOW('i', 11, struct smbcmd) #define SMB_BREAD _IOWR('i', 11, struct smbcmd) +#define SMB_TRANS _IOWR('i', 12, struct smbcmd) #endif diff --git a/sys/dev/smbus/smbconf.h b/sys/dev/smbus/smbconf.h index a3d403d..f39c442 100644 --- a/sys/dev/smbus/smbconf.h +++ b/sys/dev/smbus/smbconf.h @@ -68,9 +68,30 @@ #define SMB_QREAD 0x1 /* + * smbus transction op with pass-thru capabilities + * + * This smbus function is capable of doing a smbus command transaction + * (read or write), and can be flagged to not issue the 'cmd' and/or + * issue or expect a count field as well as flagged for chaining (no STOP), + * which gives it an i2c pass-through capability. + * + * NOSTOP- Caller chaining transactions, do not issue STOP + * NOCMD- Do not transmit the command field + * NOCNT- Do not transmit (wr) or expect (rd) the count field + */ +#define SMB_TRANS_NOSTOP 0x0001 /* do not send STOP at end */ +#define SMB_TRANS_NOCMD 0x0002 /* ignore cmd field (do not tx) */ +#define SMB_TRANS_NOCNT 0x0004 /* do not tx or rx count field */ +#define SMB_TRANS_7BIT 0x0008 /* change address mode to 7-bit */ +#define SMB_TRANS_10BIT 0x0010 /* change address mode to 10-bit */ +#define SMB_TRANS_NOREPORT 0x0020 /* do not report errors */ + +/* * ivars codes */ -#define SMBUS_IVAR_ADDR 0x1 /* slave address of the device */ +enum smbus_ivars { + SMBUS_IVAR_ADDR, /* slave address of the device */ +}; int smbus_request_bus(device_t, device_t, int); int smbus_release_bus(device_t, device_t); @@ -79,7 +100,12 @@ int smbus_error(int error); void smbus_intr(device_t, u_char, char low, char high, int error); -u_char smbus_get_addr(device_t); +#define SMBUS_ACCESSOR(var, ivar, type) \ + __BUS_ACCESSOR(smbus, var, SMBUS, ivar, type) + +SMBUS_ACCESSOR(addr, ADDR, int) + +#undef SMBUS_ACCESSOR extern driver_t smbus_driver; extern devclass_t smbus_devclass; @@ -104,6 +130,9 @@ extern devclass_t smbus_devclass; (SMBUS_BWRITE(device_get_parent(bus), slave, cmd, count, buf)) #define smbus_bread(bus,slave,cmd,count,buf) \ (SMBUS_BREAD(device_get_parent(bus), slave, cmd, count, buf)) +#define smbus_trans(bus,slave,cmd,op,wbuf,wcount,rbuf,rcount,actualp) \ + (SMBUS_TRANS(device_get_parent(bus), slave, cmd, op, \ + wbuf, wcount, rbuf, rcount, actualp)) #define SMBUS_MODVER 1 #define SMBUS_MINVER 1 diff --git a/sys/dev/smbus/smbus.c b/sys/dev/smbus/smbus.c index a111332..389efac 100644 --- a/sys/dev/smbus/smbus.c +++ b/sys/dev/smbus/smbus.c @@ -33,11 +33,15 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/module.h> #include <sys/mutex.h> -#include <sys/bus.h> +#include <sys/bus.h> #include <dev/smbus/smbconf.h> #include <dev/smbus/smbus.h> +#include "smbus_if.h" +#include "bus_if.h" + + /* * Autoconfiguration and support routines for System Management bus */ @@ -49,6 +53,13 @@ static int smbus_probe(device_t); static int smbus_attach(device_t); static int smbus_detach(device_t); +static int smbus_child_location_str(device_t parent, device_t child, + char *buf, size_t buflen); +static int smbus_print_child(device_t parent, device_t child); +static void smbus_probe_device(device_t dev, u_char* addr); +static int smbus_read_ivar(device_t parent, device_t child, int which, + uintptr_t *result); + static device_method_t smbus_methods[] = { /* device interface */ DEVMETHOD(device_probe, smbus_probe), @@ -57,6 +68,10 @@ static device_method_t smbus_methods[] = { /* bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), + DEVMETHOD(bus_child_location_str, smbus_child_location_str), + DEVMETHOD(bus_driver_added, bus_generic_driver_added), + DEVMETHOD(bus_print_child, smbus_print_child), + DEVMETHOD(bus_read_ivar, smbus_read_ivar), DEVMETHOD_END }; @@ -87,9 +102,14 @@ static int smbus_attach(device_t dev) { struct smbus_softc *sc = device_get_softc(dev); + unsigned char addr; mtx_init(&sc->lock, device_get_nameunit(dev), "smbus", MTX_DEF); bus_generic_probe(dev); + for (addr = SMBUS_ADDR_MIN; addr < SMBUS_ADDR_MAX; ++addr) { + sc->addrs[addr] = addr; + smbus_probe_device(dev, &sc->addrs[addr]); + } bus_generic_attach(dev); return (0); @@ -114,4 +134,70 @@ smbus_generic_intr(device_t dev, u_char devaddr, char low, char high, int err) { } +static void +smbus_probe_device(device_t dev, u_char* addr) +{ + device_t child; + int error; + u_char cmd; + u_char buf[2]; + + cmd = 0x01; + error = smbus_trans(dev, *addr, cmd, + SMB_TRANS_NOCNT | SMB_TRANS_NOREPORT, + NULL, 0, buf, 1, NULL); + if (error == 0) { + if (bootverbose) + device_printf(dev, "Probed address 0x%02x\n", *addr); + child = device_add_child(dev, NULL, -1); + device_set_ivars(child, addr); + } +} + +static int +smbus_child_location_str(device_t parent, device_t child, char *buf, + size_t buflen) +{ + unsigned char *addr; + + addr = device_get_ivars(child); + if (addr) + snprintf(buf, buflen, "addr=0x%x", *addr); + else if (buflen) + buf[0] = 0; + return (0); +} + +static int +smbus_print_child(device_t parent, device_t child) +{ + unsigned char *addr; + int retval; + + addr = device_get_ivars(child); + retval = bus_print_child_header(parent, child); + if (addr) + retval += printf(" at addr 0x%x", *addr); + retval += bus_print_child_footer(parent, child); + + return (retval); +} + +static int +smbus_read_ivar(device_t parent, device_t child, int which, + uintptr_t *result) +{ + unsigned char *addr; + + addr = device_get_ivars(child); + switch (which) { + case SMBUS_IVAR_ADDR: + *result = (addr == NULL) ? -1 : *addr; + break; + default: + return (ENOENT); + } + return (0); +} + MODULE_VERSION(smbus, SMBUS_MODVER); diff --git a/sys/dev/smbus/smbus.h b/sys/dev/smbus/smbus.h index 5626835..2093aa0 100644 --- a/sys/dev/smbus/smbus.h +++ b/sys/dev/smbus/smbus.h @@ -29,9 +29,13 @@ #ifndef __SMBUS_H #define __SMBUS_H +#define SMBUS_ADDR_MIN 0x10 +#define SMBUS_ADDR_MAX 0x70 + struct smbus_softc { device_t owner; /* smbus owner device structure */ struct mtx lock; + unsigned char addrs[SMBUS_ADDR_MAX]; }; void smbus_generic_intr(device_t dev, u_char devaddr, char low, char high, int err); diff --git a/sys/dev/smbus/smbus_if.m b/sys/dev/smbus/smbus_if.m index d969e25..6a5acf5 100644 --- a/sys/dev/smbus/smbus_if.m +++ b/sys/dev/smbus/smbus_if.m @@ -149,3 +149,20 @@ METHOD int bread { u_char *count; char *buf; }; + +# +# SMB roll-up transaction with flags that also allow it to be +# used for (mostly) i2c pass-through and with 10-bit addresses. +# This function can be used to roll-up all of the above functions. +# +METHOD int trans { + device_t dev; + int slave; + char cmd; + int op; + char *wbuf; + int wcount; + char *rbuf; + int rcount; + int *actualp; +}; diff --git a/sys/dev/sound/pcm/dsp.c b/sys/dev/sound/pcm/dsp.c index 68204ff..5b92963 100644 --- a/sys/dev/sound/pcm/dsp.c +++ b/sys/dev/sound/pcm/dsp.c @@ -48,6 +48,11 @@ SYSCTL_INT(_hw_snd, OID_AUTO, compat_linux_mmap, CTLFLAG_RWTUN, &dsp_mmap_allow_prot_exec, 0, "linux mmap compatibility (-1=force disable 0=auto 1=force enable)"); +static int dsp_basename_clone = 1; +SYSCTL_INT(_hw_snd, OID_AUTO, basename_clone, CTLFLAG_RWTUN, + &dsp_basename_clone, 0, + "DSP basename cloning (0: Disable; 1: Enabled)"); + struct dsp_cdevinfo { struct pcm_channel *rdch, *wrch; struct pcm_channel *volch; @@ -2357,9 +2362,10 @@ dsp_clone(void *arg, devname = devcmp; devhw = dsp_cdevs[i].hw; devcmax = dsp_cdevs[i].max - 1; - if (strcmp(name, devcmp) == 0) - unit = snd_unit; - else if (dsp_stdclone(name, devcmp, devsep, + if (strcmp(name, devcmp) == 0) { + if (dsp_basename_clone != 0) + unit = snd_unit; + } else if (dsp_stdclone(name, devcmp, devsep, dsp_cdevs[i].use_sep, &unit, &cunit) != 0) { unit = -1; cunit = -1; diff --git a/sys/dev/usb/controller/dwc_otg.c b/sys/dev/usb/controller/dwc_otg.c index 05d2f71..37b5333 100644 --- a/sys/dev/usb/controller/dwc_otg.c +++ b/sys/dev/usb/controller/dwc_otg.c @@ -3876,20 +3876,18 @@ dwc_otg_init(struct dwc_otg_softc *sc) if (temp & GHWCFG2_MPI) { uint8_t x; - DPRINTF("Multi Process Interrupts\n"); + DPRINTF("Disable Multi Process Interrupts\n"); for (x = 0; x != sc->sc_dev_in_ep_max; x++) { - DWC_OTG_WRITE_4(sc, DOTG_DIEPEACHINTMSK(x), - DIEPMSK_XFERCOMPLMSK); + DWC_OTG_WRITE_4(sc, DOTG_DIEPEACHINTMSK(x), 0); DWC_OTG_WRITE_4(sc, DOTG_DOEPEACHINTMSK(x), 0); } - DWC_OTG_WRITE_4(sc, DOTG_DEACHINTMSK, 0xFFFF); - } else { - DWC_OTG_WRITE_4(sc, DOTG_DIEPMSK, - DIEPMSK_XFERCOMPLMSK); - DWC_OTG_WRITE_4(sc, DOTG_DOEPMSK, 0); - DWC_OTG_WRITE_4(sc, DOTG_DAINTMSK, 0xFFFF); + DWC_OTG_WRITE_4(sc, DOTG_DEACHINTMSK, 0); } + DWC_OTG_WRITE_4(sc, DOTG_DIEPMSK, + DIEPMSK_XFERCOMPLMSK); + DWC_OTG_WRITE_4(sc, DOTG_DOEPMSK, 0); + DWC_OTG_WRITE_4(sc, DOTG_DAINTMSK, 0xFFFF); } if (sc->sc_mode == DWC_MODE_OTG || sc->sc_mode == DWC_MODE_HOST) { diff --git a/sys/dev/usb/usbdevs b/sys/dev/usb/usbdevs index 4a10610..d972bdc 100644 --- a/sys/dev/usb/usbdevs +++ b/sys/dev/usb/usbdevs @@ -3755,6 +3755,7 @@ product REALTEK RTL8188CU_1 0x817a RTL8188CU product REALTEK RTL8188CU_2 0x817b RTL8188CU product REALTEK RTL8187 0x8187 RTL8187 Wireless Adapter product REALTEK RTL8187B_0 0x8189 RTL8187B Wireless Adapter +product REALTEK RTL8188CU_3 0x8191 RTL8188CU product REALTEK RTL8196EU 0x8196 RTL8196EU product REALTEK RTL8187B_1 0x8197 RTL8187B Wireless Adapter product REALTEK RTL8187B_2 0x8198 RTL8187B Wireless Adapter diff --git a/sys/dev/usb/wlan/if_urtwn.c b/sys/dev/usb/wlan/if_urtwn.c index a25114f..72004d9 100644 --- a/sys/dev/usb/wlan/if_urtwn.c +++ b/sys/dev/usb/wlan/if_urtwn.c @@ -137,6 +137,7 @@ static const STRUCT_USB_HOST_ID urtwn_devs[] = { URTWN_DEV(REALTEK, RTL8188CU_0), URTWN_DEV(REALTEK, RTL8188CU_1), URTWN_DEV(REALTEK, RTL8188CU_2), + URTWN_DEV(REALTEK, RTL8188CU_3), URTWN_DEV(REALTEK, RTL8188CU_COMBO), URTWN_DEV(REALTEK, RTL8188CUS), URTWN_DEV(REALTEK, RTL8188RU_1), @@ -145,7 +146,6 @@ static const STRUCT_USB_HOST_ID urtwn_devs[] = { URTWN_DEV(REALTEK, RTL8191CU), URTWN_DEV(REALTEK, RTL8192CE), URTWN_DEV(REALTEK, RTL8192CU), - URTWN_DEV(REALTEK, RTL8188CU_0), URTWN_DEV(SITECOMEU, RTL8188CU_1), URTWN_DEV(SITECOMEU, RTL8188CU_2), URTWN_DEV(SITECOMEU, RTL8192CU), @@ -1236,9 +1236,11 @@ urtwn_efuse_read(struct urtwn_softc *sc) static void urtwn_efuse_switch_power(struct urtwn_softc *sc) { - uint8_t vol; uint32_t reg; + if (sc->chip & URTWN_CHIP_88E) + urtwn_write_1(sc, R92C_EFUSE_ACCESS, R92C_EFUSE_ACCESS_ON); + reg = urtwn_read_2(sc, R92C_SYS_ISO_CTRL); if (!(reg & R92C_SYS_ISO_CTRL_PWC_EV12V)) { urtwn_write_2(sc, R92C_SYS_ISO_CTRL, @@ -1256,11 +1258,15 @@ urtwn_efuse_switch_power(struct urtwn_softc *sc) reg | R92C_SYS_CLKR_LOADER_EN | R92C_SYS_CLKR_ANA8M); } - /* Enable LDO 2.5V. */ - vol = urtwn_read_1(sc, R92C_EFUSE_TEST + 3); - vol &= 0x0f; - vol |= 0x30; - urtwn_write_1(sc, R92C_EFUSE_TEST + 3, (vol | 0x80)); + if (!(sc->chip & URTWN_CHIP_88E)) { + uint8_t vol; + + /* Enable LDO 2.5V. */ + vol = urtwn_read_1(sc, R92C_EFUSE_TEST + 3); + vol &= 0x0f; + vol |= 0x30; + urtwn_write_1(sc, R92C_EFUSE_TEST + 3, (vol | 0x80)); + } } static int @@ -1328,7 +1334,7 @@ urtwn_r88e_read_rom(struct urtwn_softc *sc) /* Read full ROM image. */ memset(&sc->r88e_rom, 0xff, sizeof(sc->r88e_rom)); - while (addr < 1024) { + while (addr < 512) { reg = urtwn_efuse_read_1(sc, addr); if (reg == 0xff) break; @@ -1354,6 +1360,8 @@ urtwn_r88e_read_rom(struct urtwn_softc *sc) } } + urtwn_write_1(sc, R92C_EFUSE_ACCESS, R92C_EFUSE_ACCESS_OFF); + addr = 0x10; for (i = 0; i < 6; i++) sc->cck_tx_pwr[i] = sc->r88e_rom[addr++]; @@ -2193,14 +2201,12 @@ urtwn_r92c_power_on(struct urtwn_softc *sc) static int urtwn_r88e_power_on(struct urtwn_softc *sc) { - uint8_t val; uint32_t reg; int ntries; /* Wait for power ready bit. */ for (ntries = 0; ntries < 5000; ntries++) { - val = urtwn_read_1(sc, 0x6) & 0x2; - if (val == 0x2) + if (urtwn_read_4(sc, R92C_APS_FSMCO) & R92C_APS_FSMCO_SUS_HOST) break; urtwn_ms_delay(sc); } @@ -2215,17 +2221,23 @@ urtwn_r88e_power_on(struct urtwn_softc *sc) urtwn_read_1(sc, R92C_SYS_FUNC_EN) & ~(R92C_SYS_FUNC_EN_BBRSTB | R92C_SYS_FUNC_EN_BB_GLB_RST)); - urtwn_write_1(sc, 0x26, urtwn_read_1(sc, 0x26) | 0x80); + urtwn_write_1(sc, R92C_AFE_XTAL_CTRL + 2, + urtwn_read_1(sc, R92C_AFE_XTAL_CTRL + 2) | 0x80); /* Disable HWPDN. */ - urtwn_write_1(sc, 0x5, urtwn_read_1(sc, 0x5) & ~0x80); + urtwn_write_2(sc, R92C_APS_FSMCO, + urtwn_read_2(sc, R92C_APS_FSMCO) & ~R92C_APS_FSMCO_APDM_HPDN); /* Disable WL suspend. */ - urtwn_write_1(sc, 0x5, urtwn_read_1(sc, 0x5) & ~0x18); + urtwn_write_2(sc, R92C_APS_FSMCO, + urtwn_read_2(sc, R92C_APS_FSMCO) & + ~(R92C_APS_FSMCO_AFSM_HSUS | R92C_APS_FSMCO_AFSM_PCIE)); - urtwn_write_1(sc, 0x5, urtwn_read_1(sc, 0x5) | 0x1); + urtwn_write_2(sc, R92C_APS_FSMCO, + urtwn_read_2(sc, R92C_APS_FSMCO) | R92C_APS_FSMCO_APFM_ONMAC); for (ntries = 0; ntries < 5000; ntries++) { - if (!(urtwn_read_1(sc, 0x5) & 0x1)) + if (!(urtwn_read_2(sc, R92C_APS_FSMCO) & + R92C_APS_FSMCO_APFM_ONMAC)) break; urtwn_ms_delay(sc); } @@ -2233,7 +2245,8 @@ urtwn_r88e_power_on(struct urtwn_softc *sc) return (ETIMEDOUT); /* Enable LDO normal mode. */ - urtwn_write_1(sc, 0x23, urtwn_read_1(sc, 0x23) & ~0x10); + urtwn_write_1(sc, R92C_LPLDO_CTRL, + urtwn_read_1(sc, R92C_LPLDO_CTRL) & ~0x10); /* Enable MAC DMA/WMAC/SCHEDULE/SEC blocks. */ urtwn_write_2(sc, R92C_CR, 0); @@ -2565,7 +2578,6 @@ urtwn_r88e_dma_init(struct urtwn_softc *sc) return (EIO); /* Set number of pages for normal priority queue. */ - urtwn_write_2(sc, R92C_RQPN_NPQ, 0); urtwn_write_2(sc, R92C_RQPN_NPQ, 0x000d); urtwn_write_4(sc, R92C_RQPN, 0x808e000d); @@ -3384,16 +3396,17 @@ urtwn_init_locked(void *arg) urtwn_write_1(sc, R92C_TRXDMA_CTRL, urtwn_read_1(sc, R92C_TRXDMA_CTRL) | R92C_TRXDMA_CTRL_RXDMA_AGG_EN); - urtwn_write_1(sc, R92C_USB_SPECIAL_OPTION, - urtwn_read_1(sc, R92C_USB_SPECIAL_OPTION) | - R92C_USB_SPECIAL_OPTION_AGG_EN); urtwn_write_1(sc, R92C_RXDMA_AGG_PG_TH, 48); if (sc->chip & URTWN_CHIP_88E) urtwn_write_1(sc, R92C_RXDMA_AGG_PG_TH + 1, 4); - else + else { urtwn_write_1(sc, R92C_USB_DMA_AGG_TO, 4); - urtwn_write_1(sc, R92C_USB_AGG_TH, 8); - urtwn_write_1(sc, R92C_USB_AGG_TO, 6); + urtwn_write_1(sc, R92C_USB_SPECIAL_OPTION, + urtwn_read_1(sc, R92C_USB_SPECIAL_OPTION) | + R92C_USB_SPECIAL_OPTION_AGG_EN); + urtwn_write_1(sc, R92C_USB_AGG_TH, 8); + urtwn_write_1(sc, R92C_USB_AGG_TO, 6); + } /* Initialize beacon parameters. */ urtwn_write_2(sc, R92C_BCN_CTRL, 0x1010); diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c index 39fc28d..704a3d9 100644 --- a/sys/dev/virtio/network/if_vtnet.c +++ b/sys/dev/virtio/network/if_vtnet.c @@ -2745,6 +2745,11 @@ vtnet_drain_rxtx_queues(struct vtnet_softc *sc) struct vtnet_txq *txq; int i; +#ifdef DEV_NETMAP + if (nm_native_on(NA(sc->vtnet_ifp))) + return; +#endif /* DEV_NETMAP */ + for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; vtnet_rxq_free_mbufs(rxq); diff --git a/sys/dev/vt/hw/fb/vt_fb.c b/sys/dev/vt/hw/fb/vt_fb.c index 40f56fc..b2babd1 100644 --- a/sys/dev/vt/hw/fb/vt_fb.c +++ b/sys/dev/vt/hw/fb/vt_fb.c @@ -264,46 +264,40 @@ vt_fb_bitblt_bitmap(struct vt_device *vd, const struct vt_window *vw, { struct fb_info *info; uint32_t fgc, bgc, cc, o; - int c, l, bpp, bpl; - u_long line; - uint8_t b, m; - const uint8_t *ch; + int bpp, bpl, xi, yi; + int bit, byte; info = vd->vd_softc; bpp = FBTYPE_GET_BYTESPP(info); fgc = info->fb_cmap[fg]; bgc = info->fb_cmap[bg]; - b = m = 0; - bpl = (width + 7) >> 3; /* Bytes per source line. */ + bpl = (width + 7) / 8; /* Bytes per source line. */ if (info->fb_flags & FB_FLAG_NOWRITE) return; KASSERT((info->fb_vbase != 0), ("Unmapped framebuffer")); - line = (info->fb_stride * y) + (x * bpp); - for (l = 0; - l < height && y + l < vw->vw_draw_area.tr_end.tp_row; - l++) { - ch = pattern; - for (c = 0; - c < width && x + c < vw->vw_draw_area.tr_end.tp_col; - c++) { - if (c % 8 == 0) - b = *ch++; - else - b <<= 1; - if (mask != NULL) { - if (c % 8 == 0) - m = *mask++; - else - m <<= 1; - /* Skip pixel write, if mask has no bit set. */ - if ((m & 0x80) == 0) - continue; - } - o = line + (c * bpp); - cc = b & 0x80 ? fgc : bgc; + /* Bound by right and bottom edges. */ + if (y + height > vw->vw_draw_area.tr_end.tp_row) { + if (y >= vw->vw_draw_area.tr_end.tp_row) + return; + height = vw->vw_draw_area.tr_end.tp_row - y; + } + if (x + width > vw->vw_draw_area.tr_end.tp_col) { + if (x >= vw->vw_draw_area.tr_end.tp_col) + return; + width = vw->vw_draw_area.tr_end.tp_col - x; + } + for (yi = 0; yi < height; yi++) { + for (xi = 0; xi < width; xi++) { + byte = yi * bpl + xi / 8; + bit = 0x80 >> (xi % 8); + /* Skip pixel write, if mask bit not set. */ + if (mask != NULL && (mask[byte] & bit) == 0) + continue; + o = (y + yi) * info->fb_stride + (x + xi) * bpp; + cc = pattern[byte] & bit ? fgc : bgc; switch(bpp) { case 1: @@ -326,8 +320,6 @@ vt_fb_bitblt_bitmap(struct vt_device *vd, const struct vt_window *vw, break; } } - line += info->fb_stride; - pattern += bpl; } } diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c index e939fdd..689692e 100644 --- a/sys/dev/vt/hw/vga/vt_vga.c +++ b/sys/dev/vt/hw/vga/vt_vga.c @@ -1035,11 +1035,12 @@ vga_initialize_graphics(struct vt_device *vd) REG_WRITE1(sc, VGA_GC_DATA, 0xff); } -static void +static int vga_initialize(struct vt_device *vd, int textmode) { struct vga_softc *sc = vd->vd_softc; uint8_t x; + int timeout; /* Make sure the VGA adapter is not in monochrome emulation mode. */ x = REG_READ1(sc, VGA_GEN_MISC_OUTPUT_R); @@ -1060,10 +1061,16 @@ vga_initialize(struct vt_device *vd, int textmode) * code therefore also removes that guarantee and appropriate measures * need to be taken. */ + timeout = 10000; do { + DELAY(10); x = REG_READ1(sc, VGA_GEN_INPUT_STAT_1); x &= VGA_GEN_IS1_VR | VGA_GEN_IS1_DE; - } while (x != (VGA_GEN_IS1_VR | VGA_GEN_IS1_DE)); + } while (x != (VGA_GEN_IS1_VR | VGA_GEN_IS1_DE) && --timeout != 0); + if (timeout == 0) { + printf("Timeout initializing vt_vga\n"); + return (ENXIO); + } /* Now, disable the sync. signals. */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_MODE_CONTROL); @@ -1194,6 +1201,8 @@ vga_initialize(struct vt_device *vd, int textmode) */ sc->vga_curfg = sc->vga_curbg = 0xff; } + + return (0); } static int @@ -1235,7 +1244,8 @@ vga_init(struct vt_device *vd) vd->vd_width = VT_VGA_WIDTH; vd->vd_height = VT_VGA_HEIGHT; } - vga_initialize(vd, textmode); + if (vga_initialize(vd, textmode) != 0) + return (CN_DEAD); sc->vga_enabled = true; return (CN_INTERNAL); diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index 1f8731e..a637055 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -1029,7 +1029,7 @@ vt_determine_colors(term_char_t c, int cursor, int vt_is_cursor_in_area(const struct vt_device *vd, const term_rect_t *area) { - unsigned int mx, my, x1, y1, x2, y2; + unsigned int mx, my; /* * We use the cursor position saved during the current refresh, @@ -1038,18 +1038,12 @@ vt_is_cursor_in_area(const struct vt_device *vd, const term_rect_t *area) mx = vd->vd_mx_drawn + vd->vd_curwindow->vw_draw_area.tr_begin.tp_col; my = vd->vd_my_drawn + vd->vd_curwindow->vw_draw_area.tr_begin.tp_row; - x1 = area->tr_begin.tp_col; - y1 = area->tr_begin.tp_row; - x2 = area->tr_end.tp_col; - y2 = area->tr_end.tp_row; - - if (((mx >= x1 && x2 - 1 >= mx) || - (mx < x1 && mx + vd->vd_mcursor->width >= x1)) && - ((my >= y1 && y2 - 1 >= my) || - (my < y1 && my + vd->vd_mcursor->height >= y1))) - return (1); - - return (0); + if (mx >= area->tr_end.tp_col || + mx + vd->vd_mcursor->width <= area->tr_begin.tp_col || + my >= area->tr_end.tp_row || + my + vd->vd_mcursor->height <= area->tr_begin.tp_row) + return (0); + return (1); } static void diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c index e113e2c..a6036d8 100644 --- a/sys/dev/xen/balloon/balloon.c +++ b/sys/dev/xen/balloon/balloon.c @@ -118,11 +118,6 @@ current_target(void) static unsigned long minimum_target(void) { -#ifdef XENHVM -#define max_pfn realmem -#else -#define max_pfn HYPERVISOR_shared_info->arch.max_pfn -#endif unsigned long min_pages, curr_pages = current_target(); #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) @@ -139,16 +134,15 @@ minimum_target(void) * 32768 1320 * 131072 4392 */ - if (max_pfn < MB2PAGES(128)) - min_pages = MB2PAGES(8) + (max_pfn >> 1); - else if (max_pfn < MB2PAGES(512)) - min_pages = MB2PAGES(40) + (max_pfn >> 2); - else if (max_pfn < MB2PAGES(2048)) - min_pages = MB2PAGES(104) + (max_pfn >> 3); + if (realmem < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (realmem >> 1); + else if (realmem < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (realmem >> 2); + else if (realmem < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (realmem >> 3); else - min_pages = MB2PAGES(296) + (max_pfn >> 5); + min_pages = MB2PAGES(296) + (realmem >> 5); #undef MB2PAGES -#undef max_pfn /* Don't enforce growth */ return (min(min_pages, curr_pages)); @@ -204,12 +198,9 @@ increase_reservation(unsigned long nr_pages) bs.balloon_low--; pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); - KASSERT((xen_feature(XENFEAT_auto_translated_physmap) || - !phys_to_machine_mapping_valid(pfn)), + KASSERT(xen_feature(XENFEAT_auto_translated_physmap), ("auto translated physmap but mapping is valid")); - set_phys_to_machine(pfn, frame_list[i]); - vm_page_free(page); } @@ -258,9 +249,8 @@ decrease_reservation(unsigned long nr_pages) } pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); - frame_list[i] = PFNTOMFN(pfn); + frame_list[i] = pfn; - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q); bs.balloon_low++; } @@ -393,21 +383,11 @@ static int xenballoon_attach(device_t dev) { int err; -#ifndef XENHVM - vm_page_t page; - unsigned long pfn; - -#define max_pfn HYPERVISOR_shared_info->arch.max_pfn -#endif mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF); -#ifndef XENHVM - bs.current_pages = min(xen_start_info->nr_pages, max_pfn); -#else bs.current_pages = xen_pv_domain() ? HYPERVISOR_start_info->nr_pages : realmem; -#endif bs.target_pages = bs.current_pages; bs.balloon_low = 0; bs.balloon_high = 0; @@ -416,16 +396,6 @@ xenballoon_attach(device_t dev) kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon"); -#ifndef XENHVM - /* Initialise the balloon with excess memory space. */ - for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { - page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT); - TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q); - bs.balloon_low++; - } -#undef max_pfn -#endif - target_watch.callback = watch_target; err = xs_register_watch(&target_watch); diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c index 1273961..b647fec 100644 --- a/sys/dev/xen/blkback/blkback.c +++ b/sys/dev/xen/blkback/blkback.c @@ -742,7 +742,6 @@ struct xbb_softc { /** Mutex protecting per-instance data. */ struct mtx lock; -#ifdef XENHVM /** * Resource representing allocated physical address space * associated with our per-instance kva region. @@ -751,7 +750,6 @@ struct xbb_softc { /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; -#endif /** * I/O statistics from BlockBack dispatch down. These are @@ -2818,16 +2816,12 @@ static void xbb_free_communication_mem(struct xbb_softc *xbb) { if (xbb->kva != 0) { -#ifndef XENHVM - kva_free(xbb->kva, xbb->kva_size); -#else if (xbb->pseudo_phys_res != NULL) { bus_release_resource(xbb->dev, SYS_RES_MEMORY, xbb->pseudo_phys_res_id, xbb->pseudo_phys_res); xbb->pseudo_phys_res = NULL; } -#endif } xbb->kva = 0; xbb->gnt_base_addr = 0; @@ -3055,12 +3049,6 @@ xbb_alloc_communication_mem(struct xbb_softc *xbb) DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", device_get_nameunit(xbb->dev), xbb->kva_size, xbb->reqlist_kva_size); -#ifndef XENHVM - xbb->kva = kva_alloc(xbb->kva_size); - if (xbb->kva == 0) - return (ENOMEM); - xbb->gnt_base_addr = xbb->kva; -#else /* XENHVM */ /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine @@ -3078,7 +3066,6 @@ xbb_alloc_communication_mem(struct xbb_softc *xbb) } xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); -#endif /* XENHVM */ DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c index 665a5ac..2a0d459 100644 --- a/sys/dev/xen/control/control.c +++ b/sys/dev/xen/control/control.c @@ -138,9 +138,7 @@ __FBSDID("$FreeBSD$"); #include <xen/gnttab.h> #include <xen/xen_intr.h> -#ifdef XENHVM #include <xen/hvm.h> -#endif #include <xen/interface/event_channel.h> #include <xen/interface/grant_table.h> @@ -192,133 +190,6 @@ xctrl_reboot() shutdown_nice(0); } -#ifndef XENHVM -extern void xencons_suspend(void); -extern void xencons_resume(void); - -/* Full PV mode suspension. */ -static void -xctrl_suspend() -{ - int i, j, k, fpp, suspend_cancelled; - unsigned long max_pfn, start_info_mfn; - - EVENTHANDLER_INVOKE(power_suspend); - -#ifdef SMP - struct thread *td; - cpuset_t map; - u_int cpuid; - - /* - * Bind us to CPU 0 and stop any other VCPUs. - */ - td = curthread; - thread_lock(td); - sched_bind(td, 0); - thread_unlock(td); - cpuid = PCPU_GET(cpuid); - KASSERT(cpuid == 0, ("xen_suspend: not running on cpu 0")); - - map = all_cpus; - CPU_CLR(cpuid, &map); - CPU_NAND(&map, &stopped_cpus); - if (!CPU_EMPTY(&map)) - stop_cpus(map); -#endif - - /* - * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE - * drivers need this. - */ - mtx_lock(&Giant); - if (DEVICE_SUSPEND(root_bus) != 0) { - mtx_unlock(&Giant); - printf("%s: device_suspend failed\n", __func__); -#ifdef SMP - if (!CPU_EMPTY(&map)) - restart_cpus(map); -#endif - return; - } - mtx_unlock(&Giant); - - local_irq_disable(); - - xencons_suspend(); - gnttab_suspend(); - intr_suspend(); - - max_pfn = HYPERVISOR_shared_info->arch.max_pfn; - - void *shared_info = HYPERVISOR_shared_info; - HYPERVISOR_shared_info = NULL; - pmap_kremove((vm_offset_t) shared_info); - PT_UPDATES_FLUSH(); - - xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn); - - /* - * We'll stop somewhere inside this hypercall. When it returns, - * we'll start resuming after the restore. - */ - start_info_mfn = VTOMFN(xen_start_info); - pmap_suspend(); - suspend_cancelled = HYPERVISOR_suspend(start_info_mfn); - pmap_resume(); - - pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info); - HYPERVISOR_shared_info = shared_info; - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - VTOMFN(xen_pfn_to_mfn_frame_list_list); - - fpp = PAGE_SIZE/sizeof(unsigned long); - for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { - if ((j % fpp) == 0) { - k++; - xen_pfn_to_mfn_frame_list_list[k] = - VTOMFN(xen_pfn_to_mfn_frame_list[k]); - j = 0; - } - xen_pfn_to_mfn_frame_list[k][j] = - VTOMFN(&xen_phys_machine[i]); - } - HYPERVISOR_shared_info->arch.max_pfn = max_pfn; - - gnttab_resume(NULL); - intr_resume(suspend_cancelled != 0); - local_irq_enable(); - xencons_resume(); - -#ifdef CONFIG_SMP - for_each_cpu(i) - vcpu_prepare(i); - -#endif - - /* - * Only resume xenbus /after/ we've prepared our VCPUs; otherwise - * the VCPU hotplug callback can race with our vcpu_prepare - */ - mtx_lock(&Giant); - DEVICE_RESUME(root_bus); - mtx_unlock(&Giant); - -#ifdef SMP - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - if (!CPU_EMPTY(&map)) - restart_cpus(map); -#endif - EVENTHANDLER_INVOKE(power_resume); -} - -#else - -/* HVM mode suspension. */ static void xctrl_suspend() { @@ -417,7 +288,6 @@ xctrl_suspend() printf("System resumed after suspension\n"); } -#endif static void xctrl_crash() diff --git a/sys/dev/xen/grant_table/grant_table.c b/sys/dev/xen/grant_table/grant_table.c index 2511657..ad65fe0 100644 --- a/sys/dev/xen/grant_table/grant_table.c +++ b/sys/dev/xen/grant_table/grant_table.c @@ -53,7 +53,6 @@ static int gnttab_free_count; static grant_ref_t gnttab_free_head; static struct mtx gnttab_list_lock; -#ifdef XENHVM /* * Resource representing allocated physical address space * for the grant table metainfo @@ -62,7 +61,6 @@ static struct resource *gnttab_pseudo_phys_res; /* Resource id for allocated physical address space. */ static int gnttab_pseudo_phys_res_id; -#endif static grant_entry_t *shared; @@ -510,72 +508,6 @@ unmap_pte_fn(pte_t *pte, struct page *pmd_page, } #endif -#ifndef XENHVM - -static int -gnttab_map(unsigned int start_idx, unsigned int end_idx) -{ - struct gnttab_setup_table setup; - u_long *frames; - - unsigned int nr_gframes = end_idx + 1; - int i, rc; - - frames = malloc(nr_gframes * sizeof(unsigned long), M_DEVBUF, M_NOWAIT); - if (!frames) - return (ENOMEM); - - setup.dom = DOMID_SELF; - setup.nr_frames = nr_gframes; - set_xen_guest_handle(setup.frame_list, frames); - - rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); - if (rc == -ENOSYS) { - free(frames, M_DEVBUF); - return (ENOSYS); - } - KASSERT(!(rc || setup.status), - ("unexpected result from grant_table_op")); - - if (shared == NULL) { - vm_offset_t area; - - area = kva_alloc(PAGE_SIZE * max_nr_grant_frames()); - KASSERT(area, ("can't allocate VM space for grant table")); - shared = (grant_entry_t *)area; - } - - for (i = 0; i < nr_gframes; i++) - PT_SET_MA(((caddr_t)shared) + i*PAGE_SIZE, - ((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V); - - free(frames, M_DEVBUF); - - return (0); -} - -int -gnttab_resume(device_t dev) -{ - - if (max_nr_grant_frames() < nr_grant_frames) - return (ENOSYS); - return (gnttab_map(0, nr_grant_frames - 1)); -} - -int -gnttab_suspend(void) -{ - int i; - - for (i = 0; i < nr_grant_frames; i++) - pmap_kremove((vm_offset_t) shared + i * PAGE_SIZE); - - return (0); -} - -#else /* XENHVM */ - static vm_paddr_t resume_frames; static int @@ -638,8 +570,6 @@ gnttab_resume(device_t dev) return (gnttab_map(0, nr_gframes - 1)); } -#endif - static int gnttab_expand(unsigned int req_entries) { diff --git a/sys/dev/xen/netback/netback.c b/sys/dev/xen/netback/netback.c index 63337ad..b5c1c13 100644 --- a/sys/dev/xen/netback/netback.c +++ b/sys/dev/xen/netback/netback.c @@ -473,7 +473,6 @@ struct xnb_softc { */ gnttab_copy_table tx_gnttab; -#ifdef XENHVM /** * Resource representing allocated physical address space * associated with our per-instance kva region. @@ -482,7 +481,6 @@ struct xnb_softc { /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; -#endif /** Ring mapping and interrupt configuration data. */ struct xnb_ring_config ring_configs[XNB_NUM_RING_TYPES]; @@ -626,16 +624,12 @@ static void xnb_free_communication_mem(struct xnb_softc *xnb) { if (xnb->kva != 0) { -#ifndef XENHVM - kva_free(xnb->kva, xnb->kva_size); -#else if (xnb->pseudo_phys_res != NULL) { bus_release_resource(xnb->dev, SYS_RES_MEMORY, xnb->pseudo_phys_res_id, xnb->pseudo_phys_res); xnb->pseudo_phys_res = NULL; } -#endif /* XENHVM */ } xnb->kva = 0; xnb->gnt_base_addr = 0; @@ -816,12 +810,7 @@ xnb_alloc_communication_mem(struct xnb_softc *xnb) for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->kva_size += xnb->ring_configs[i].ring_pages * PAGE_SIZE; } -#ifndef XENHVM - xnb->kva = kva_alloc(xnb->kva_size); - if (xnb->kva == 0) - return (ENOMEM); - xnb->gnt_base_addr = xnb->kva; -#else /* defined XENHVM */ + /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine @@ -840,7 +829,6 @@ xnb_alloc_communication_mem(struct xnb_softc *xnb) } xnb->kva = (vm_offset_t)rman_get_virtual(xnb->pseudo_phys_res); xnb->gnt_base_addr = rman_get_start(xnb->pseudo_phys_res); -#endif /* !defined XENHVM */ return (0); } diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index 27900ba..3c1f952 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -879,12 +879,11 @@ refill: if (sc->copying_receiver == 0) { gnttab_grant_foreign_transfer_ref(ref, otherend_id, pfn); - sc->rx_pfn_array[nr_flips] = PFNTOMFN(pfn); + sc->rx_pfn_array[nr_flips] = pfn; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Remove this page before passing * back to Xen. */ - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); MULTI_update_va_mapping(&sc->rx_mcl[i], vaddr, 0, 0); } @@ -892,7 +891,7 @@ refill: } else { gnttab_grant_foreign_access_ref(ref, otherend_id, - PFNTOMFN(pfn), 0); + pfn, 0); } req->id = id; req->gref = ref; @@ -907,7 +906,6 @@ refill: * We may have allocated buffers which have entries outstanding * in the page * update queue -- make sure we flush those first! */ - PT_UPDATES_FLUSH(); if (nr_flips != 0) { #ifdef notyet /* Tell the ballon driver what is going on. */ @@ -1361,8 +1359,6 @@ xennet_get_responses(struct netfront_info *np, mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; mmu->val = pfn; - - set_phys_to_machine(pfn, mfn); } pages_flipped++; } else { @@ -1927,7 +1923,7 @@ network_connect(struct netfront_info *np) } else { gnttab_grant_foreign_access_ref(ref, xenbus_get_otherend_id(np->xbdev), - PFNTOMFN(pfn), 0); + pfn, 0); } req->gref = ref; req->id = requeue_idx; diff --git a/sys/fs/nfs/nfsport.h b/sys/fs/nfs/nfsport.h index 9f08854..42a7f94 100644 --- a/sys/fs/nfs/nfsport.h +++ b/sys/fs/nfs/nfsport.h @@ -950,7 +950,7 @@ struct nfsreq { }; #ifndef NFS_MAXBSIZE -#define NFS_MAXBSIZE MAXBSIZE +#define NFS_MAXBSIZE MAXBCACHEBUF #endif /* diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index 964ea1f..73c6eb6 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c +++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -201,16 +201,16 @@ newnfs_iosize(struct nfsmount *nmp) } if (nmp->nm_rsize > maxio || nmp->nm_rsize == 0) nmp->nm_rsize = maxio; - if (nmp->nm_rsize > MAXBSIZE) - nmp->nm_rsize = MAXBSIZE; + if (nmp->nm_rsize > NFS_MAXBSIZE) + nmp->nm_rsize = NFS_MAXBSIZE; if (nmp->nm_readdirsize > maxio || nmp->nm_readdirsize == 0) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if (nmp->nm_wsize > maxio || nmp->nm_wsize == 0) nmp->nm_wsize = maxio; - if (nmp->nm_wsize > MAXBSIZE) - nmp->nm_wsize = MAXBSIZE; + if (nmp->nm_wsize > NFS_MAXBSIZE) + nmp->nm_wsize = NFS_MAXBSIZE; /* * Calculate the size used for io buffers. Use the larger diff --git a/sys/fs/nfsserver/nfs_nfsdkrpc.c b/sys/fs/nfsserver/nfs_nfsdkrpc.c index 4fb9c93..f9b8eb8 100644 --- a/sys/fs/nfsserver/nfs_nfsdkrpc.c +++ b/sys/fs/nfsserver/nfs_nfsdkrpc.c @@ -117,7 +117,8 @@ nfssvc_program(struct svc_req *rqst, SVCXPRT *xprt) memset(&nd, 0, sizeof(nd)); if (rqst->rq_vers == NFS_VER2) { - if (rqst->rq_proc > NFSV2PROC_STATFS) { + if (rqst->rq_proc > NFSV2PROC_STATFS || + newnfs_nfsv3_procid[rqst->rq_proc] == NFSPROC_NOOP) { svcerr_noproc(rqst); svc_freereq(rqst); goto out; diff --git a/sys/geom/uncompress/g_uncompress.c b/sys/geom/uncompress/g_uncompress.c index d819371..8c2d5cb 100644 --- a/sys/geom/uncompress/g_uncompress.c +++ b/sys/geom/uncompress/g_uncompress.c @@ -45,10 +45,10 @@ __FBSDID("$FreeBSD$"); #include <sys/mutex.h> #include <sys/malloc.h> #include <sys/systm.h> +#include <sys/zlib.h> #include <geom/geom.h> -#include <net/zlib.h> #include <contrib/xz-embedded/linux/include/linux/xz.h> #ifdef GEOM_UNCOMPRESS_DEBUG diff --git a/sys/geom/uzip/g_uzip.c b/sys/geom/uzip/g_uzip.c index c2ed64b..732de9d 100644 --- a/sys/geom/uzip/g_uzip.c +++ b/sys/geom/uzip/g_uzip.c @@ -38,9 +38,9 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/sysctl.h> #include <sys/systm.h> +#include <sys/zlib.h> #include <geom/geom.h> -#include <net/zlib.h> FEATURE(geom_uzip, "GEOM uzip read-only compressed disks support"); diff --git a/sys/i386/conf/DEFAULTS b/sys/i386/conf/DEFAULTS index 78d807c..d5bdb1d 100644 --- a/sys/i386/conf/DEFAULTS +++ b/sys/i386/conf/DEFAULTS @@ -26,7 +26,6 @@ options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR # enable support for native hardware -options NATIVE device atpic options NEW_PCIB diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index 086844a..b1740f3 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -358,7 +358,9 @@ device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enchancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/i386/conf/XEN b/sys/i386/conf/XEN deleted file mode 100644 index dd83670..0000000 --- a/sys/i386/conf/XEN +++ /dev/null @@ -1,96 +0,0 @@ -# -# XEN -- Kernel configuration for i386 XEN DomU -# -# $FreeBSD$ - -cpu I686_CPU -ident XEN - -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols - -# The following drivers don't build with PAE or XEN enabled. -makeoptions WITHOUT_MODULES="ctl dpt drm drm2 hptmv ida" - -# The following drivers don't work with PAE enabled. -makeoptions WITHOUT_MODULES+="ncr pst" - -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption - -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options SCTP # Stream Control Transmission Protocol -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options UFS_GJOURNAL # Enable gjournal-based UFS journaling -options NFSCL # Network Filesystem Client -options NFSD # Network Filesystem Server -options NFSLOCKD # Network Lock Manager -options NFS_ROOT # NFS usable as /, requires NFSCL -options MSDOSFS # MSDOS Filesystem -options CD9660 # ISO 9660 Filesystem -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_GPT # GUID Partition Tables. -options GEOM_LABEL # Provides labelization -options COMPAT_FREEBSD4 # Compatible with FreeBSD4 -options COMPAT_FREEBSD5 # Compatible with FreeBSD5 -options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -options COMPAT_FREEBSD7 # Compatible with FreeBSD7 -options COMPAT_FREEBSD9 # Compatible with FreeBSD9 -options COMPAT_FREEBSD10 # Compatible with FreeBSD10 -options KTRACE # ktrace(1) support -options STACK # stack(9) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options AUDIT # Security event auditing - -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options DEADLKRES # Enable the deadlock resolver -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - -options PAE -nooption NATIVE -option XEN -nodevice atpic -nodevice isa -options MCLSHIFT=12 - -# To make an SMP kernel, the next two lines are needed -options SMP # Symmetric MultiProcessor Kernel -device apic # I/O APIC - -#device atkbdc # AT keyboard controller -#device atkbd # AT keyboard -device psm # PS/2 mouse -device pci - -#device kbdmux # keyboard multiplexer - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device tun # Packet tunnel. -device md # Memory "disks" -device gif # IPv6 and IPv4 tunneling - -# Wireless cards -options IEEE80211_SUPPORT_MESH -options AH_SUPPORT_AR5416 - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index 316bc7a..18b3c5d 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -181,6 +181,25 @@ IDTVEC(xen_intr_upcall) jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + SET_KERNEL_SREGS + cld + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call hv_vector_handler + add $4, %esp + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. @@ -247,7 +266,6 @@ IDTVEC(invlcache) /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ -#ifndef XEN .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) @@ -262,7 +280,7 @@ IDTVEC(ipi_intr_bitmap_handler) call ipi_bitmap_handler MEXITCOUNT jmp doreti -#endif + /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ @@ -282,7 +300,6 @@ IDTVEC(cpustop) /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ -#ifndef XEN .text SUPERALIGN_TEXT IDTVEC(cpususpend) @@ -295,7 +312,6 @@ IDTVEC(cpususpend) POP_FRAME jmp doreti_iret -#endif /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index 97e2e97..7a00740 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -238,11 +238,6 @@ ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); #endif -#ifdef XEN -ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); -ASSYM(XEN_HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_START); -#endif - #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 72f7685..a5867c4 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -160,24 +160,6 @@ int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif -#ifdef XEN -/* XEN includes */ -#include <xen/xen-os.h> -#include <xen/hypervisor.h> -#include <machine/xen/xenvar.h> -#include <machine/xen/xenfunc.h> -#include <xen/xen_intr.h> - -void Xhypervisor_callback(void); -void failsafe_callback(void); - -extern trap_info_t trap_table[]; -struct proc_ldt default_proc_ldt; -extern int init_first; -int running_xen = 1; -extern unsigned long physfree; -#endif /* XEN */ - /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); @@ -356,9 +338,7 @@ cpu_startup(dummy) */ bufinit(); vm_pager_bufferinit(); -#ifndef XEN cpu_setregs(); -#endif } /* @@ -1291,13 +1271,8 @@ SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, int _default_ldt; -#ifdef XEN -union descriptor *gdt; -union descriptor *ldt; -#else union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ -#endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ @@ -1397,7 +1372,6 @@ struct soft_segment_descriptor gdt_segs[] = { .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, -#ifndef XEN /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, @@ -1489,7 +1463,6 @@ struct soft_segment_descriptor gdt_segs[] = { .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, -#endif /* !XEN */ }; static struct soft_segment_descriptor ldt_segs[] = { @@ -1641,7 +1614,7 @@ sdtossd(sd, ssd) ssd->ssd_gran = sd->sd_gran; } -#if !defined(PC98) && !defined(XEN) +#if !defined(PC98) static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) @@ -1748,9 +1721,8 @@ add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } -#endif /* !PC98 && !XEN */ +#endif /* !PC98 */ -#ifndef XEN static void basemem_setup(void) { @@ -1798,7 +1770,6 @@ basemem_setup(void) for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } -#endif /* !XEN */ /* * Populate the (physmap) array with base/bound pairs describing the @@ -2074,8 +2045,6 @@ do_next: for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); - - PT_UPDATES_FLUSH(); } #else /* PC98 */ static void @@ -2086,7 +2055,6 @@ getmemsize(int first) vm_paddr_t physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size, physmem_tunable; -#ifndef XEN int hasbrokenint12, i, res; u_int extmem; struct vm86frame vmf; @@ -2094,17 +2062,8 @@ getmemsize(int first) vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; -#endif has_smap = 0; -#if defined(XEN) - Maxmem = xen_start_info->nr_pages - init_first; - physmem = Maxmem; - basemem = 0; - physmap[0] = init_first << PAGE_SHIFT; - physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); - physmap_idx = 0; -#else #ifdef XBOX if (arch_i386_is_xbox) { /* @@ -2247,7 +2206,6 @@ have_smap: physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: -#endif /* * Now, physmap contains a map of physical memory. */ @@ -2321,7 +2279,6 @@ physmap_done: getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; -#ifndef XEN /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. @@ -2442,13 +2399,6 @@ do_next: } *pte = 0; invltlb(); -#else - phys_avail[0] = physfree; - phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; - dump_avail[0] = 0; - dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; - -#endif /* * XXX @@ -2472,272 +2422,9 @@ do_next: for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); - - PT_UPDATES_FLUSH(); } #endif /* PC98 */ -#ifdef XEN -#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) - -register_t -init386(first) - int first; -{ - unsigned long gdtmachpfn; - int error, gsel_tss, metadata_missing, x, pa; - struct pcpu *pc; -#ifdef CPU_ENABLE_SSE - struct xstate_hdr *xhdr; -#endif - struct callback_register event = { - .type = CALLBACKTYPE_event, - .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, - }; - struct callback_register failsafe = { - .type = CALLBACKTYPE_failsafe, - .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, - }; - - thread0.td_kstack = proc0kstack; - thread0.td_kstack_pages = KSTACK_PAGES; - - /* - * This may be done better later if it gets more high level - * components in it. If so just link td->td_proc here. - */ - proc_linkup0(&proc0, &thread0); - - metadata_missing = 0; - if (xen_start_info->mod_start) { - preload_metadata = (caddr_t)xen_start_info->mod_start; - preload_bootstrap_relocate(KERNBASE); - } else { - metadata_missing = 1; - } - if (envmode == 1) - kern_envp = static_env; - else if ((caddr_t)xen_start_info->cmd_line) - kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); - - boothowto |= xen_boothowto(kern_envp); - - /* Init basic tunables, hz etc */ - init_param1(); - - /* - * XEN occupies a portion of the upper virtual address space - * At its base it manages an array mapping machine page frames - * to physical page frames - hence we need to be able to - * access 4GB - (64MB - 4MB + 64k) - */ - gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - - pc = &__pcpu[0]; - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; - - PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); - bzero(gdt, PAGE_SIZE); - for (x = 0; x < NGDT; x++) - ssdtosd(&gdt_segs[x], &gdt[x].sd); - - mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); - - gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; - PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); - PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); - lgdt(&r_gdt); - gdtset = 1; - - if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { - panic("set_trap_table failed - error %d\n", error); - } - - error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); - if (error == 0) - error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); -#if CONFIG_XEN_COMPAT <= 0x030002 - if (error == -ENOXENSYS) - HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), - (unsigned long)Xhypervisor_callback, - GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); -#endif - pcpu_init(pc, 0, sizeof(struct pcpu)); - for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) - pmap_kenter(pa + KERNBASE, pa); - dpcpu_init((void *)(first + KERNBASE), 0); - first += DPCPU_SIZE; - physfree += DPCPU_SIZE; - init_first += DPCPU_SIZE / PAGE_SIZE; - - PCPU_SET(prvspace, pc); - PCPU_SET(curthread, &thread0); - - /* - * Initialize mutexes. - * - * icu_lock: in order to allow an interrupt to occur in a critical - * section, to set pcpu->ipending (etc...) properly, we - * must be able to get the icu lock, so it can't be - * under witness. - */ - mutex_init(); - mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); - - /* make ldt memory segments */ - PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); - bzero(ldt, PAGE_SIZE); - ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); - ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); - for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) - ssdtosd(&ldt_segs[x], &ldt[x].sd); - - default_proc_ldt.ldt_base = (caddr_t)ldt; - default_proc_ldt.ldt_len = 6; - _default_ldt = (int)&default_proc_ldt; - PCPU_SET(currentldt, _default_ldt); - PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); - xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); - -#if defined(XEN_PRIVILEGED) - /* - * Initialize the i8254 before the console so that console - * initialization can use DELAY(). - */ - i8254_init(); -#endif - - /* - * Initialize the console before we print anything out. - */ - cninit(); - - if (metadata_missing) - printf("WARNING: loader(8) metadata is missing!\n"); - -#ifdef DEV_ISA -#ifdef DEV_ATPIC - elcr_probe(); - atpic_startup(); -#else - /* Reset and mask the atpics and leave them shut down. */ - atpic_reset(); - - /* - * Point the ICU spurious interrupt vectors at the APIC spurious - * interrupt handler. - */ - setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif -#endif - -#ifdef DDB - db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); -#endif - - kdb_init(); - -#ifdef KDB - if (boothowto & RB_KDB) - kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); -#endif - - finishidentcpu(); /* Final stage of CPU initialization */ - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - initializecpu(); /* Initialize CPU registers */ - initializecpucache(); - - /* pointer to selector slot for %fs/%gs */ - PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - - dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = - dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; - dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = - dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); -#if defined(PAE) || defined(PAE_TABLES) - dblfault_tss.tss_cr3 = (int)IdlePDPT; -#else - dblfault_tss.tss_cr3 = (int)IdlePTD; -#endif - dblfault_tss.tss_eip = (int)dblfault_handler; - dblfault_tss.tss_eflags = PSL_KERNEL; - dblfault_tss.tss_ds = dblfault_tss.tss_es = - dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); - dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); - dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); - dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); - - vm86_initialize(); - getmemsize(first); - init_param2(physmem); - - /* now running on new page tables, configured,and u/iom is accessible */ - - msgbufinit(msgbufp, msgbufsize); -#ifdef DEV_NPX - npxinit(true); -#endif - /* - * Set up thread0 pcb after npxinit calculated pcb + fpu save - * area size. Zero out the extended state header in fpu save - * area. - */ - thread0.td_pcb = get_pcb_td(&thread0); - bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); -#ifdef CPU_ENABLE_SSE - if (use_xsave) { - xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + - 1); - xhdr->xstate_bv = xsave_mask; - } -#endif - PCPU_SET(curpcb, thread0.td_pcb); - /* make an initial tss so cpu can get interrupt stack on syscall! */ - /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); - HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), - PCPU_GET(common_tss.tss_esp0)); - - /* transfer to user mode */ - - _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); - _udatasel = GSEL(GUDATA_SEL, SEL_UPL); - - /* setup proc 0's pcb */ - thread0.td_pcb->pcb_flags = 0; -#if defined(PAE) || defined(PAE_TABLES) - thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; -#else - thread0.td_pcb->pcb_cr3 = (int)IdlePTD; -#endif - thread0.td_pcb->pcb_ext = 0; - thread0.td_frame = &proc0_tf; - thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; - thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; - - cpu_probe_amdc1e(); - - /* Location of kernel stack for locore */ - return ((register_t)thread0.td_pcb); -} - -#else register_t init386(first) int first; @@ -3061,7 +2748,6 @@ init386(first) /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } -#endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) diff --git a/sys/i386/i386/minidump_machdep.c b/sys/i386/i386/minidump_machdep.c index a7d2a69..8f78abc 100644 --- a/sys/i386/i386/minidump_machdep.c +++ b/sys/i386/i386/minidump_machdep.c @@ -68,10 +68,6 @@ static void *dump_va; static uint64_t counter, progress; CTASSERT(sizeof(*vm_page_dump) == 4); -#ifndef XEN -#define xpmap_mtop(x) (x) -#define xpmap_ptom(x) (x) -#endif static int @@ -205,7 +201,7 @@ minidumpsys(struct dumperinfo *di) j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ - pa = xpmap_mtop(pd[j] & PG_PS_FRAME); + pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (is_dumpable(pa)) dump_add_page(pa); @@ -215,10 +211,10 @@ minidumpsys(struct dumperinfo *di) } if ((pd[j] & PG_V) == PG_V) { /* set bit for each valid page in this 2MB block */ - pt = pmap_kenter_temporary(xpmap_mtop(pd[j] & PG_FRAME), 0); + pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0); for (k = 0; k < NPTEPG; k++) { if ((pt[k] & PG_V) == PG_V) { - pa = xpmap_mtop(pt[k] & PG_FRAME); + pa = pt[k] & PG_FRAME; if (is_dumpable(pa)) dump_add_page(pa); } @@ -318,24 +314,8 @@ minidumpsys(struct dumperinfo *di) continue; } if ((pd[j] & PG_V) == PG_V) { - pa = xpmap_mtop(pd[j] & PG_FRAME); -#ifndef XEN + pa = pd[j] & PG_FRAME; error = blk_write(di, 0, pa, PAGE_SIZE); -#else - pt = pmap_kenter_temporary(pa, 0); - memcpy(fakept, pt, PAGE_SIZE); - for (i = 0; i < NPTEPG; i++) - fakept[i] = xpmap_mtop(fakept[i]); - error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); - if (error) - goto fail; - /* flush, in case we reuse fakept in the same block */ - error = blk_flush(di); - if (error) - goto fail; - bzero(fakept, sizeof(fakept)); -#endif - if (error) goto fail; } else { diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index b396752..bad69ee 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -130,331 +130,23 @@ __FBSDID("$FreeBSD$"); #endif /* CHECK_POINTS */ -/* lock region used by kernel profiling */ -int mcount_lock; - -int mp_naps; /* # of Applications processors */ -int boot_cpu_id = -1; /* designated BSP */ - extern struct pcpu __pcpu[]; -/* AP uses this during bootstrap. Do not staticize. */ -char *bootSTK; -static int bootAP; - -/* Free these after use */ -void *bootstacks[MAXCPU]; -static void *dpcpu; - -struct pcb stoppcbs[MAXCPU]; -struct susppcb **susppcbs; - /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; -#ifdef COUNT_IPIS -/* Interrupt counts. */ -static u_long *ipi_preempt_counts[MAXCPU]; -static u_long *ipi_ast_counts[MAXCPU]; -u_long *ipi_invltlb_counts[MAXCPU]; -u_long *ipi_invlrng_counts[MAXCPU]; -u_long *ipi_invlpg_counts[MAXCPU]; -u_long *ipi_invlcache_counts[MAXCPU]; -u_long *ipi_rendezvous_counts[MAXCPU]; -static u_long *ipi_hardclock_counts[MAXCPU]; -#endif - -/* Default cpu_ops implementation. */ -struct cpu_ops cpu_ops; - /* * Local data and functions. */ -static volatile cpuset_t ipi_nmi_pending; - -/* used to hold the AP's until we are ready to release them */ -static struct mtx ap_boot_mtx; - -/* Set to 1 once we're ready to let the APs out of the pen. */ -static volatile int aps_ready = 0; - -/* - * Store data from cpu_add() until later in the boot when we actually setup - * the APs. - */ -struct cpu_info { - int cpu_present:1; - int cpu_bsp:1; - int cpu_disabled:1; - int cpu_hyperthread:1; -} static cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; -int apic_cpuids[MAX_APIC_ID + 1]; - -/* Holds pending bitmap based IPIs per CPU */ -volatile u_int cpu_ipi_pending[MAXCPU]; - -static u_int boot_address; -static int cpu_logical; /* logical cpus per core */ -static int cpu_cores; /* cores per package */ - -static void assign_cpu_ids(void); static void install_ap_tramp(void); -static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); -static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ -static int hyperthreading_allowed = 1; - -static void -mem_range_AP_init(void) -{ - if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) - mem_range_softc.mr_op->initAP(&mem_range_softc); -} - -static void -topo_probe_amd(void) -{ - int core_id_bits; - int id; - - /* AMD processors do not support HTT. */ - cpu_logical = 1; - - if ((amd_feature2 & AMDID2_CMP) == 0) { - cpu_cores = 1; - return; - } - - core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> - AMDID_COREID_SIZE_SHIFT; - if (core_id_bits == 0) { - cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; - return; - } - - /* Fam 10h and newer should get here. */ - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) - continue; - cpu_cores++; - } -} - -/* - * Round up to the next power of two, if necessary, and then - * take log2. - * Returns -1 if argument is zero. - */ -static __inline int -mask_width(u_int x) -{ - - return (fls(x << (1 - powerof2(x))) - 1); -} - -static void -topo_probe_0x4(void) -{ - u_int p[4]; - int pkg_id_bits; - int core_id_bits; - int max_cores; - int max_logical; - int id; - - /* Both zero and one here mean one logical processor per package. */ - max_logical = (cpu_feature & CPUID_HTT) != 0 ? - (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; - if (max_logical <= 1) - return; - - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. Further, we count number of - * logical processors that belong to the same core - * as BSP thus deducing number of threads per core. - */ - if (cpu_high >= 0x4) { - cpuid_count(0x04, 0, p); - max_cores = ((p[0] >> 26) & 0x3f) + 1; - } else - max_cores = 1; - core_id_bits = mask_width(max_logical/max_cores); - if (core_id_bits < 0) - return; - pkg_id_bits = core_id_bits + mask_width(max_cores); - - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) - continue; - cpu_cores++; - /* Check if logical CPU has the same package and core IDs. */ - if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) - cpu_logical++; - } - - KASSERT(cpu_cores >= 1 && cpu_logical >= 1, - ("topo_probe_0x4 couldn't find BSP")); - - cpu_cores /= cpu_logical; - hyperthreading_cpus = cpu_logical; -} - -static void -topo_probe_0xb(void) -{ - u_int p[4]; - int bits; - int cnt; - int i; - int logical; - int type; - int x; - - /* We only support three levels for now. */ - for (i = 0; i < 3; i++) { - cpuid_count(0x0b, i, p); - - /* Fall back if CPU leaf 11 doesn't really exist. */ - if (i == 0 && p[1] == 0) { - topo_probe_0x4(); - return; - } - - bits = p[0] & 0x1f; - logical = p[1] &= 0xffff; - type = (p[2] >> 8) & 0xff; - if (type == 0 || logical == 0) - break; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. - */ - for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || - cpu_info[x].cpu_disabled) - continue; - if (x >> bits == boot_cpu_id >> bits) - cnt++; - } - if (type == CPUID_TYPE_SMT) - cpu_logical = cnt; - else if (type == CPUID_TYPE_CORE) - cpu_cores = cnt; - } - if (cpu_logical == 0) - cpu_logical = 1; - cpu_cores /= cpu_logical; -} - -/* - * Both topology discovery code and code that consumes topology - * information assume top-down uniformity of the topology. - * That is, all physical packages must be identical and each - * core in a package must have the same number of threads. - * Topology information is queried only on BSP, on which this - * code runs and for which it can query CPUID information. - * Then topology is extrapolated on all packages using the - * uniformity assumption. - */ -static void -topo_probe(void) -{ - static int cpu_topo_probed = 0; - - if (cpu_topo_probed) - return; - - CPU_ZERO(&logical_cpus_mask); - if (mp_ncpus <= 1) - cpu_cores = cpu_logical = 1; - else if (cpu_vendor_id == CPU_VENDOR_AMD) - topo_probe_amd(); - else if (cpu_vendor_id == CPU_VENDOR_INTEL) { - /* - * See Intel(R) 64 Architecture Processor - * Topology Enumeration article for details. - * - * Note that 0x1 <= cpu_high < 4 case should be - * compatible with topo_probe_0x4() logic when - * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) - * or it should trigger the fallback otherwise. - */ - if (cpu_high >= 0xb) - topo_probe_0xb(); - else if (cpu_high >= 0x1) - topo_probe_0x4(); - } - - /* - * Fallback: assume each logical CPU is in separate - * physical package. That is, no multi-core, no SMT. - */ - if (cpu_cores == 0 || cpu_logical == 0) - cpu_cores = cpu_logical = 1; - cpu_topo_probed = 1; -} - -struct cpu_group * -cpu_topo(void) -{ - int cg_flags; - - /* - * Determine whether any threading flags are - * necessry. - */ - topo_probe(); - if (cpu_logical > 1 && hyperthreading_cpus) - cg_flags = CG_FLAG_HTT; - else if (cpu_logical > 1) - cg_flags = CG_FLAG_SMT; - else - cg_flags = 0; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); - } - /* - * No multi-core or hyper-threaded. - */ - if (cpu_logical * cpu_cores == 1) - return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_L2, cpu_cores, - CG_SHARE_L1, cpu_logical, cg_flags)); -} - +static u_int boot_address; /* * Calculate usable address in base memory for AP trampoline code. @@ -470,85 +162,6 @@ mp_bootaddress(u_int basemem) return boot_address; } -void -cpu_add(u_int apic_id, char boot_cpu) -{ - - if (apic_id > MAX_APIC_ID) { - panic("SMP: APIC ID %d too high", apic_id); - return; - } - KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", - apic_id)); - cpu_info[apic_id].cpu_present = 1; - if (boot_cpu) { - KASSERT(boot_cpu_id == -1, - ("CPU %d claims to be BSP, but CPU %d already is", apic_id, - boot_cpu_id)); - boot_cpu_id = apic_id; - cpu_info[apic_id].cpu_bsp = 1; - } - if (mp_ncpus < MAXCPU) { - mp_ncpus++; - mp_maxid = mp_ncpus - 1; - } - if (bootverbose) - printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : - "AP"); -} - -void -cpu_mp_setmaxid(void) -{ - - /* - * mp_maxid should be already set by calls to cpu_add(). - * Just sanity check its value here. - */ - if (mp_ncpus == 0) - KASSERT(mp_maxid == 0, - ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); - else if (mp_ncpus == 1) - mp_maxid = 0; - else - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, - mp_maxid, mp_ncpus)); -} - -int -cpu_mp_probe(void) -{ - - /* - * Always record BSP in CPU map so that the mbuf init code works - * correctly. - */ - CPU_SETOF(0, &all_cpus); - if (mp_ncpus == 0) { - /* - * No CPUs were found, so this must be a UP system. Setup - * the variables to represent a system with a single CPU - * with an id of 0. - */ - mp_ncpus = 1; - return (0); - } - - /* At least one CPU was found. */ - if (mp_ncpus == 1) { - /* - * One CPU was found, so this must be a UP system with - * an I/O APIC. - */ - mp_maxid = 0; - return (0); - } - - /* At least two CPUs were found. */ - return (1); -} - /* * Initialize the IPI handlers and start up the AP's. */ @@ -610,48 +223,6 @@ cpu_mp_start(void) set_interrupt_apic_ids(); } - -/* - * Print various information about the SMP system hardware and setup. - */ -void -cpu_mp_announce(void) -{ - const char *hyperthread; - int i; - - printf("FreeBSD/SMP: %d package(s) x %d core(s)", - mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); - if (hyperthreading_cpus > 1) - printf(" x %d HTT threads", cpu_logical); - else if (cpu_logical > 1) - printf(" x %d SMT threads", cpu_logical); - printf("\n"); - - /* List active CPUs first. */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1; i < mp_ncpus; i++) { - if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, - cpu_apic_ids[i]); - } - - /* List disabled CPUs last. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) - continue; - if (cpu_info[i].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, - i); - } -} - /* * AP CPU's call this to initialize themselves. */ @@ -662,7 +233,7 @@ init_secondary(void) vm_offset_t addr; int gsel_tss; int x, myid; - u_int cpuid, cr0; + u_int cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; @@ -731,84 +302,7 @@ init_secondary(void) lidt(&r_idt); #endif - /* - * On real hardware, switch to x2apic mode if possible. Do it - * after aps_ready was signalled, to avoid manipulating the - * mode while BSP might still want to send some IPI to us - * (second startup IPI is ignored on modern hardware etc). - */ - lapic_xapic_mode(); - - /* Initialize the PAT MSR if present. */ - pmap_init_pat(); - - /* set up CPU registers and state */ - cpu_setregs(); - - /* set up SSE/NX */ - initializecpu(); - - /* set up FPU state on the AP */ - npxinit(false); - - if (cpu_ops.cpu_init) - cpu_ops.cpu_init(); - - /* A quick check from sanity claus */ - cpuid = PCPU_GET(cpuid); - if (PCPU_GET(apic_id) != lapic_id()) { - printf("SMP: cpuid = %d\n", cpuid); - printf("SMP: actual apic_id = %d\n", lapic_id()); - printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - panic("cpuid mismatch! boom!!"); - } - - /* Initialize curthread. */ - KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); - PCPU_SET(curthread, PCPU_GET(idlethread)); - - mca_init(); - - mtx_lock_spin(&ap_boot_mtx); - - /* Init local apic for irq's */ - lapic_setup(1); - - /* Set memory range attributes for this CPU to match the BSP */ - mem_range_AP_init(); - - smp_cpus++; - - CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); - printf("SMP: AP CPU #%d Launched!\n", cpuid); - - /* Determine if we are a logical CPU. */ - /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ - if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) - CPU_SET(cpuid, &logical_cpus_mask); - - if (bootverbose) - lapic_dump("AP"); - - if (smp_cpus == mp_ncpus) { - /* enable IPI's, tlb shootdown, freezes etc */ - atomic_store_rel_int(&smp_started, 1); - } - - mtx_unlock_spin(&ap_boot_mtx); - - /* Wait until all the AP's are up. */ - while (smp_started == 0) - ia32_pause(); - - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); - - /* Enter the scheduler. */ - sched_throw(NULL); - - panic("scheduler returned us to %s", __func__); - /* NOTREACHED */ + init_secondary_tail(); } /******************************************************************* @@ -816,108 +310,6 @@ init_secondary(void) */ /* - * We tell the I/O APIC code about all the CPUs we want to receive - * interrupts. If we don't want certain CPUs to receive IRQs we - * can simply not tell the I/O APIC code about them in this function. - * We also do not tell it about the BSP since it tells itself about - * the BSP internally to work with UP kernels and on UP machines. - */ -static void -set_interrupt_apic_ids(void) -{ - u_int i, apic_id; - - for (i = 0; i < MAXCPU; i++) { - apic_id = cpu_apic_ids[i]; - if (apic_id == -1) - continue; - if (cpu_info[apic_id].cpu_bsp) - continue; - if (cpu_info[apic_id].cpu_disabled) - continue; - - /* Don't let hyperthreads service interrupts. */ - if (cpu_logical > 1 && - apic_id % cpu_logical != 0) - continue; - - intr_add_cpu(i); - } -} - -/* - * Assign logical CPU IDs to local APICs. - */ -static void -assign_cpu_ids(void) -{ - u_int i; - - TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", - &hyperthreading_allowed); - - /* Check for explicitly disabled CPUs. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { - cpu_info[i].cpu_hyperthread = 1; - - /* - * Don't use HT CPU if it has been disabled by a - * tunable. - */ - if (hyperthreading_allowed == 0) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { - hyperthreading_cpus = 0; - cpu_logical = 1; - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 is always assigned to the BSP. - * - * To minimize confusion for userland, we attempt to number - * CPUs such that all threads and cores in a package are - * grouped together. For now we assume that the BSP is always - * the first thread in a package and just start adding APs - * starting with the BSP's APIC ID. - */ - mp_ncpus = 1; - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - for (i = boot_cpu_id + 1; i != boot_cpu_id; - i == MAX_APIC_ID ? i = 0 : i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} - -/* * start each AP in our list */ /* Lowest 1MB is already mapped: don't touch*/ @@ -1094,129 +486,6 @@ start_ap(int apic_id) return 0; /* return FAILURE */ } -#ifdef COUNT_XINVLTLB_HITS -u_int xhits_gbl[MAXCPU]; -u_int xhits_pg[MAXCPU]; -u_int xhits_rng[MAXCPU]; -static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, - sizeof(xhits_gbl), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, - sizeof(xhits_pg), "IU", ""); -SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, - sizeof(xhits_rng), "IU", ""); - -u_int ipi_global; -u_int ipi_page; -u_int ipi_range; -u_int ipi_range_size; -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, - 0, ""); - -u_int ipi_masked_global; -u_int ipi_masked_page; -u_int ipi_masked_range; -u_int ipi_masked_range_size; -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, - &ipi_masked_global, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, - &ipi_masked_page, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, - &ipi_masked_range, 0, ""); -SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, - &ipi_masked_range_size, 0, ""); -#endif /* COUNT_XINVLTLB_HITS */ - -/* - * Init and startup IPI. - */ -void -ipi_startup(int apic_id, int vector) -{ - - /* - * This attempts to follow the algorithm described in the - * Intel Multiprocessor Specification v1.4 in section B.4. - * For each IPI, we allow the local APIC ~20us to deliver the - * IPI. If that times out, we panic. - */ - - /* - * first we do an INIT IPI: this INIT IPI might be run, resetting - * and running the target CPU. OR this INIT IPI might be latched (P5 - * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be - * ignored. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); - lapic_ipi_wait(100); - - /* Explicitly deassert the INIT IPI. */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | - APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, - apic_id); - - DELAY(10000); /* wait ~10mS */ - - /* - * next we do a STARTUP IPI: the previous INIT IPI might still be - * latched, (P5 bug) this 1st STARTUP would then terminate - * immediately, and the previously started INIT IPI would continue. OR - * the previous INIT IPI has already run. and this STARTUP IPI will - * run. OR the previous INIT IPI was ignored. and this STARTUP IPI - * will run. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver first STARTUP IPI to APIC %d", - apic_id); - DELAY(200); /* wait ~200uS */ - - /* - * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF - * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR - * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is - * recognized after hardware RESET or INIT IPI. - */ - lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | - APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | - vector, apic_id); - if (!lapic_ipi_wait(100)) - panic("Failed to deliver second STARTUP IPI to APIC %d", - apic_id); - - DELAY(200); /* wait ~200uS */ -} - -/* - * Send an IPI to specified CPU handling the bitmap logic. - */ -static void -ipi_send_cpu(int cpu, u_int ipi) -{ - u_int bitmap, old_pending, new_pending; - - KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); - - if (IPI_IS_BITMAPED(ipi)) { - bitmap = 1 << ipi; - ipi = IPI_BITMAP_VECTOR; - do { - old_pending = cpu_ipi_pending[cpu]; - new_pending = old_pending | bitmap; - } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], - old_pending, new_pending)); - if (old_pending) - return; - } - lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); -} - /* * Flush the TLB on all other CPU's */ @@ -1280,14 +549,6 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_of } void -smp_cache_flush(void) -{ - - if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); -} - -void smp_invltlb(void) { @@ -1362,203 +623,11 @@ smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) } void -ipi_bitmap_handler(struct trapframe frame) -{ - struct trapframe *oldframe; - struct thread *td; - int cpu = PCPU_GET(cpuid); - u_int ipi_bitmap; - - critical_enter(); - td = curthread; - td->td_intr_nesting_level++; - oldframe = td->td_intr_frame; - td->td_intr_frame = &frame; - ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); - if (ipi_bitmap & (1 << IPI_PREEMPT)) { -#ifdef COUNT_IPIS - (*ipi_preempt_counts[cpu])++; -#endif - sched_preempt(td); - } - if (ipi_bitmap & (1 << IPI_AST)) { -#ifdef COUNT_IPIS - (*ipi_ast_counts[cpu])++; -#endif - /* Nothing to do for AST */ - } - if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { -#ifdef COUNT_IPIS - (*ipi_hardclock_counts[cpu])++; -#endif - hardclockintr(); - } - td->td_intr_frame = oldframe; - td->td_intr_nesting_level--; - critical_exit(); -} - -/* - * send an IPI to a set of cpus. - */ -void -ipi_selected(cpuset_t cpus, u_int ipi) -{ - int cpu; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); - - while ((cpu = CPU_FFS(&cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &cpus); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); - } -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_cpu(int cpu, u_int ipi) -{ - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); - - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); -} - -/* - * send an IPI to all CPUs EXCEPT myself - */ -void -ipi_all_but_self(u_int ipi) -{ - cpuset_t other_cpus; - - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - if (IPI_IS_BITMAPED(ipi)) { - ipi_selected(other_cpus, ipi); - return; - } - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); - - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); -} - -int -ipi_nmi_handler() -{ - u_int cpuid; - - /* - * As long as there is not a simple way to know about a NMI's - * source, if the bitmask for the current CPU is present in - * the global pending bitword an IPI_STOP_HARD has been issued - * and should be handled. - */ - cpuid = PCPU_GET(cpuid); - if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) - return (1); - - CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); - cpustop_handler(); - return (0); -} - -/* - * Handle an IPI_STOP by saving our current context and spinning until we - * are resumed. - */ -void -cpustop_handler(void) -{ - u_int cpu; - - cpu = PCPU_GET(cpuid); - - savectx(&stoppcbs[cpu]); - - /* Indicate that we are stopped */ - CPU_SET_ATOMIC(cpu, &stopped_cpus); - - /* Wait for restart */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - CPU_CLR_ATOMIC(cpu, &stopped_cpus); - - if (cpu == 0 && cpustop_restartfunc != NULL) { - cpustop_restartfunc(); - cpustop_restartfunc = NULL; - } -} - -/* - * Handle an IPI_SUSPEND by saving our current context and spinning until we - * are resumed. - */ -void -cpususpend_handler(void) +smp_cache_flush(void) { - u_int cpu; - - mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); - - cpu = PCPU_GET(cpuid); - if (savectx(&susppcbs[cpu]->sp_pcb)) { - npxsuspend(susppcbs[cpu]->sp_fpususpend); - wbinvd(); - CPU_SET_ATOMIC(cpu, &suspended_cpus); - } else { - npxresume(susppcbs[cpu]->sp_fpususpend); - pmap_init_pat(); - initializecpu(); - PCPU_SET(switchtime, 0); - PCPU_SET(switchticks, ticks); - - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); - } - - /* Wait for resume */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - if (cpu_ops.cpu_resume) - cpu_ops.cpu_resume(); - /* Resume MCA and local APIC */ - lapic_xapic_mode(); - mca_resume(); - lapic_setup(0); - - /* Indicate that we are resumed */ - CPU_CLR_ATOMIC(cpu, &suspended_cpus); - CPU_CLR_ATOMIC(cpu, &started_cpus); + if (smp_started) + smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } /* @@ -1614,62 +683,3 @@ invlrng_handler(void) atomic_add_int(&smp_tlb_wait, 1); } - -void -invlcache_handler(void) -{ -#ifdef COUNT_IPIS - (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); -} - -/* - * This is called once the rest of the system is up and running and we're - * ready to let the AP's out of the pen. - */ -static void -release_aps(void *dummy __unused) -{ - - if (mp_ncpus == 1) - return; - atomic_store_rel_int(&aps_ready, 1); - while (smp_started == 0) - ia32_pause(); -} -SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); - -#ifdef COUNT_IPIS -/* - * Setup interrupt counters for IPI handlers. - */ -static void -mp_ipi_intrcnt(void *dummy) -{ - char buf[64]; - int i; - - CPU_FOREACH(i) { - snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); - intrcnt_add(buf, &ipi_invltlb_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); - intrcnt_add(buf, &ipi_invlrng_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); - intrcnt_add(buf, &ipi_invlpg_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); - intrcnt_add(buf, &ipi_invlcache_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:preempt", i); - intrcnt_add(buf, &ipi_preempt_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:ast", i); - intrcnt_add(buf, &ipi_ast_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); - intrcnt_add(buf, &ipi_rendezvous_counts[i]); - snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); - intrcnt_add(buf, &ipi_hardclock_counts[i]); - } -} -SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); -#endif diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index 0a08012..78d76ef 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -695,11 +695,9 @@ END(bcmp) */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) -#ifndef XEN /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) -#endif /* flush the prefetch q */ jmp 1f diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index 26bfd3b..6bedb48 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -88,7 +88,7 @@ ENTRY(cpu_throw) movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx movl PCB_CR3(%edx),%eax - LOAD_CR3(%eax) + movl %eax,%cr3 /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx @@ -174,10 +174,10 @@ ENTRY(cpu_switch) /* switch address space */ movl PCB_CR3(%edx),%eax - READ_CR3(%ebx) /* The same address space? */ + movl %cr3,%ebx /* The same address space? */ cmpl %ebx,%eax je sw0 - LOAD_CR3(%eax) /* new address space */ + movl %eax,%cr3 /* new address space */ movl %esi,%eax movl PCPU(CPUID),%esi SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ @@ -204,18 +204,6 @@ sw0: SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: BLOCK_SPIN(%ecx) -#ifdef XEN - pushl %eax - pushl %ecx - pushl %edx - call xen_handle_thread_switch - popl %edx - popl %ecx - popl %eax - /* - * XXX set IOPL - */ -#else /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. @@ -264,7 +252,7 @@ sw1: movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) -#endif + /* Restore context. */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp @@ -290,7 +278,7 @@ sw1: movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f - LLDT(_default_ldt) + lldt _default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: diff --git a/sys/i386/i386/sys_machdep.c b/sys/i386/i386/sys_machdep.c index 9518862..9044d19 100644 --- a/sys/i386/i386/sys_machdep.c +++ b/sys/i386/i386/sys_machdep.c @@ -59,20 +59,6 @@ __FBSDID("$FreeBSD$"); #include <security/audit/audit.h> -#ifdef XEN -#include <machine/xen/xenfunc.h> - -void i386_reset_ldt(struct proc_ldt *pldt); - -void -i386_reset_ldt(struct proc_ldt *pldt) -{ - xen_set_ldt((vm_offset_t)pldt->ldt_base, pldt->ldt_len); -} -#else -#define i386_reset_ldt(x) -#endif - #include <vm/vm_kern.h> /* for kernel_map */ #define MAX_LD 8192 @@ -211,12 +197,7 @@ sysarch(td, uap) */ sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; -#ifdef XEN - /* need to do nosegneg like Linux */ - sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff; -#else sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ -#endif sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -226,12 +207,7 @@ sysarch(td, uap) sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_fsd = sd; -#ifdef XEN - HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[0]), - *(uint64_t *)&sd); -#else PCPU_GET(fsgs_gdt)[0] = sd; -#endif critical_exit(); td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL); } @@ -252,12 +228,7 @@ sysarch(td, uap) sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; -#ifdef XEN - /* need to do nosegneg like Linux */ - sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff; -#else sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ -#endif sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -267,12 +238,7 @@ sysarch(td, uap) sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_gsd = sd; -#ifdef XEN - HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[1]), - *(uint64_t *)&sd); -#else PCPU_GET(fsgs_gdt)[1] = sd; -#endif critical_exit(); load_gs(GSEL(GUGS_SEL, SEL_UPL)); } @@ -434,10 +400,6 @@ set_user_ldt(struct mdproc *mdp) } pldt = mdp->md_ldt; -#ifdef XEN - i386_reset_ldt(pldt); - PCPU_SET(currentldt, (int)pldt); -#else #ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd; #else @@ -445,7 +407,6 @@ set_user_ldt(struct mdproc *mdp) #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); -#endif /* XEN */ if (dtlocked) mtx_unlock_spin(&dt_lock); } @@ -464,43 +425,6 @@ set_user_ldt_rv(struct vmspace *vmsp) } #endif -#ifdef XEN - -/* - * dt_lock must be held. Returns with dt_lock held. - */ -struct proc_ldt * -user_ldt_alloc(struct mdproc *mdp, int len) -{ - struct proc_ldt *pldt, *new_ldt; - - mtx_assert(&dt_lock, MA_OWNED); - mtx_unlock_spin(&dt_lock); - new_ldt = malloc(sizeof(struct proc_ldt), - M_SUBPROC, M_WAITOK); - - new_ldt->ldt_len = len = NEW_MAX_LD(len); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - round_page(len * sizeof(union descriptor)), M_WAITOK); - new_ldt->ldt_refcnt = 1; - new_ldt->ldt_active = 0; - - mtx_lock_spin(&dt_lock); - if ((pldt = mdp->md_ldt)) { - if (len > pldt->ldt_len) - len = pldt->ldt_len; - bcopy(pldt->ldt_base, new_ldt->ldt_base, - len * sizeof(union descriptor)); - } else { - bcopy(ldt, new_ldt->ldt_base, PAGE_SIZE); - } - mtx_unlock_spin(&dt_lock); /* XXX kill once pmap locking fixed. */ - pmap_map_readonly(kernel_pmap, (vm_offset_t)new_ldt->ldt_base, - new_ldt->ldt_len*sizeof(union descriptor)); - mtx_lock_spin(&dt_lock); /* XXX kill once pmap locking fixed. */ - return (new_ldt); -} -#else /* * dt_lock must be held. Returns with dt_lock held. */ @@ -535,7 +459,6 @@ user_ldt_alloc(struct mdproc *mdp, int len) return (new_ldt); } -#endif /* !XEN */ /* * Must be called with dt_lock held. Returns with dt_lock unheld. @@ -553,13 +476,8 @@ user_ldt_free(struct thread *td) } if (td == curthread) { -#ifdef XEN - i386_reset_ldt(&default_proc_ldt); - PCPU_SET(currentldt, (int)&default_proc_ldt); -#else lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); -#endif } mdp->md_ldt = NULL; @@ -785,27 +703,7 @@ again: td->td_retval[0] = uap->start; return (error); } -#ifdef XEN -static int -i386_set_ldt_data(struct thread *td, int start, int num, - union descriptor *descs) -{ - struct mdproc *mdp = &td->td_proc->p_md; - struct proc_ldt *pldt = mdp->md_ldt; - mtx_assert(&dt_lock, MA_OWNED); - - while (num) { - xen_update_descriptor( - &((union descriptor *)(pldt->ldt_base))[start], - descs); - num--; - start++; - descs++; - } - return (0); -} -#else static int i386_set_ldt_data(struct thread *td, int start, int num, union descriptor *descs) @@ -821,7 +719,6 @@ i386_set_ldt_data(struct thread *td, int start, int num, num * sizeof(union descriptor)); return (0); } -#endif /* !XEN */ static int i386_ldt_grow(struct thread *td, int len) diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index 5fd4a16..5671dc9 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -89,9 +89,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_map.h> #include <vm/vm_param.h> -#ifdef XEN -#include <xen/hypervisor.h> -#endif #ifdef PC98 #include <pc98/cbus/cbus.h> #else @@ -304,10 +301,8 @@ cpu_fork(td1, p2, td2, flags) /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; - /* - * XXX XEN need to check on PSL_USER is handled - */ td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; + /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame @@ -698,12 +693,6 @@ cpu_reset_real() #endif disable_intr(); -#ifdef XEN - if (smp_processor_id() == 0) - HYPERVISOR_shutdown(SHUTDOWN_reboot); - else - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -#endif #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; @@ -797,13 +786,8 @@ sf_buf_map(struct sf_buf *sf, int flags) */ ptep = vtopte(sf->kva); opte = *ptep; -#ifdef XEN - PT_SET_MA(sf->kva, xpmap_ptom(VM_PAGE_TO_PHYS(sf->m)) | pgeflag - | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0)); -#else *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0); -#endif /* * Avoid unnecessary TLB invalidations: If the sf_buf's old @@ -854,15 +838,8 @@ sf_buf_shootdown(struct sf_buf *sf, int flags) int sf_buf_unmap(struct sf_buf *sf) { -#ifdef XEN - /* - * Xen doesn't like having dangling R/W mappings - */ - pmap_qremove(sf->kva, 1); - return (1); -#else + return (0); -#endif } static void diff --git a/sys/i386/include/asmacros.h b/sys/i386/include/asmacros.h index c1c3f64..716915c 100644 --- a/sys/i386/include/asmacros.h +++ b/sys/i386/include/asmacros.h @@ -176,37 +176,6 @@ movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ movl %eax, %fs -#ifdef XEN -#define LOAD_CR3(reg) \ - movl reg,PCPU(CR3); \ - pushl %ecx ; \ - pushl %edx ; \ - pushl %esi ; \ - pushl reg ; \ - call xen_load_cr3 ; \ - addl $4,%esp ; \ - popl %esi ; \ - popl %edx ; \ - popl %ecx ; \ - -#define READ_CR3(reg) movl PCPU(CR3),reg; -#define LLDT(arg) \ - pushl %edx ; \ - pushl %eax ; \ - xorl %eax,%eax ; \ - movl %eax,%gs ; \ - call i386_reset_ldt ; \ - popl %eax ; \ - popl %edx -#define CLI call ni_cli -#else -#define LOAD_CR3(reg) movl reg,%cr3; -#define READ_CR3(reg) movl %cr3,reg; -#define LLDT(arg) lldt arg; -#define CLI cli -#endif /* !XEN */ - - #endif /* LOCORE */ #ifdef __STDC__ diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h index f80a898..3bc25d4 100644 --- a/sys/i386/include/cpufunc.h +++ b/sys/i386/include/cpufunc.h @@ -42,17 +42,6 @@ #error this file needs sys/cdefs.h as a prerequisite #endif -#ifdef XEN -extern void xen_cli(void); -extern void xen_sti(void); -extern u_int xen_rcr2(void); -extern void xen_load_cr3(u_int data); -extern void xen_tlb_flush(void); -extern void xen_invlpg(u_int addr); -extern void write_eflags(u_int eflags); -extern u_int read_eflags(void); -#endif - struct region_descriptor; #define readb(va) (*(volatile uint8_t *) (va)) @@ -106,11 +95,8 @@ clts(void) static __inline void disable_intr(void) { -#ifdef XEN - xen_cli(); -#else + __asm __volatile("cli" : : : "memory"); -#endif } static __inline void @@ -132,11 +118,8 @@ cpuid_count(u_int ax, u_int cx, u_int *p) static __inline void enable_intr(void) { -#ifdef XEN - xen_sti(); -#else + __asm __volatile("sti"); -#endif } static __inline void @@ -325,11 +308,7 @@ ia32_pause(void) } static __inline u_int -#ifdef XEN -_read_eflags(void) -#else read_eflags(void) -#endif { u_int ef; @@ -389,11 +368,7 @@ wbinvd(void) } static __inline void -#ifdef XEN -_write_eflags(u_int ef) -#else write_eflags(u_int ef) -#endif { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } @@ -425,9 +400,6 @@ rcr2(void) { u_int data; -#ifdef XEN - return (xen_rcr2()); -#endif __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } @@ -435,11 +407,8 @@ rcr2(void) static __inline void load_cr3(u_int data) { -#ifdef XEN - xen_load_cr3(data); -#else + __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory"); -#endif } static __inline u_int @@ -491,11 +460,8 @@ load_xcr(u_int reg, uint64_t val) static __inline void invltlb(void) { -#ifdef XEN - xen_tlb_flush(); -#else + load_cr3(rcr3()); -#endif } /* @@ -506,11 +472,7 @@ static __inline void invlpg(u_int addr) { -#ifdef XEN - xen_invlpg(addr); -#else __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -#endif } static __inline u_short diff --git a/sys/i386/include/intr_machdep.h b/sys/i386/include/intr_machdep.h index 082b649..96ac06a 100644 --- a/sys/i386/include/intr_machdep.h +++ b/sys/i386/include/intr_machdep.h @@ -58,13 +58,7 @@ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) -#elif defined(XEN) -#include <xen/xen-os.h> -#define NUM_EVTCHN_INTS NR_EVENT_CHANNELS -#define FIRST_EVTCHN_INT 0 -#define LAST_EVTCHN_INT \ - (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) -#else /* !XEN && !XENHVM */ +#else /* !XENHVM */ #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h index dc29b6d..231f80f 100644 --- a/sys/i386/include/pcpu.h +++ b/sys/i386/include/pcpu.h @@ -44,34 +44,6 @@ * other processors" */ -#if defined(XEN) - -/* These are peridically updated in shared_info, and then copied here. */ -struct shadow_time_info { - uint64_t tsc_timestamp; /* TSC at last update of time vals. */ - uint64_t system_timestamp; /* Time, in nanosecs, since boot. */ - uint32_t tsc_to_nsec_mul; - uint32_t tsc_to_usec_mul; - int tsc_shift; - uint32_t version; -}; - -#define PCPU_XEN_FIELDS \ - ; \ - u_int pc_cr3; /* track cr3 for R1/R3*/ \ - vm_paddr_t *pc_pdir_shadow; \ - uint64_t pc_processed_system_time; \ - struct shadow_time_info pc_shadow_time; \ - char __pad[185] - -#else /* !XEN */ - -#define PCPU_XEN_FIELDS \ - ; \ - char __pad[233] - -#endif - #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ @@ -85,8 +57,8 @@ struct shadow_time_info { u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ - u_int pc_vcpu_id /* Xen vCPU ID */ \ - PCPU_XEN_FIELDS + u_int pc_vcpu_id; /* Xen vCPU ID */ \ + char __pad[233] #ifdef _KERNEL diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 0d8057f..76822b1 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -219,76 +219,6 @@ extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ */ #define vtophys(va) pmap_kextract((vm_offset_t)(va)) -#if defined(XEN) -#include <sys/param.h> - -#include <xen/xen-os.h> - -#include <machine/xen/xenvar.h> -#include <machine/xen/xenpmap.h> - -extern pt_entry_t pg_nx; - -#define PG_KERNEL (PG_V | PG_A | PG_RW | PG_M) - -#define MACH_TO_VM_PAGE(ma) PHYS_TO_VM_PAGE(xpmap_mtop((ma))) -#define VM_PAGE_TO_MACH(m) xpmap_ptom(VM_PAGE_TO_PHYS((m))) - -#define VTOM(va) xpmap_ptom(VTOP(va)) - -static __inline vm_paddr_t -pmap_kextract_ma(vm_offset_t va) -{ - vm_paddr_t ma; - if ((ma = PTD[va >> PDRSHIFT]) & PG_PS) { - ma = (ma & ~(NBPDR - 1)) | (va & (NBPDR - 1)); - } else { - ma = (*vtopte(va) & PG_FRAME) | (va & PAGE_MASK); - } - return ma; -} - -static __inline vm_paddr_t -pmap_kextract(vm_offset_t va) -{ - return xpmap_mtop(pmap_kextract_ma(va)); -} -#define vtomach(va) pmap_kextract_ma(((vm_offset_t) (va))) - -vm_paddr_t pmap_extract_ma(struct pmap *pmap, vm_offset_t va); - -void pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa); -void pmap_map_readonly(struct pmap *pmap, vm_offset_t va, int len); -void pmap_map_readwrite(struct pmap *pmap, vm_offset_t va, int len); - -static __inline pt_entry_t -pte_load_store(pt_entry_t *ptep, pt_entry_t v) -{ - pt_entry_t r; - - r = *ptep; - PT_SET_VA(ptep, v, TRUE); - return (r); -} - -static __inline pt_entry_t -pte_load_store_ma(pt_entry_t *ptep, pt_entry_t v) -{ - pt_entry_t r; - - r = *ptep; - PT_SET_VA_MA(ptep, v, TRUE); - return (r); -} - -#define pte_load_clear(ptep) pte_load_store((ptep), (pt_entry_t)0ULL) - -#define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) -#define pte_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) -#define pde_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) - -#elif !defined(XEN) - /* * KPTmap is a linear mapping of the kernel page table. It differs from the * recursive mapping in two ways: (1) it only provides access to kernel page @@ -328,13 +258,8 @@ pmap_kextract(vm_offset_t va) } return (pa); } -#endif - -#if !defined(XEN) -#define PT_UPDATES_FLUSH() -#endif -#if (defined(PAE) || defined(PAE_TABLES)) && !defined(XEN) +#if (defined(PAE) || defined(PAE_TABLES)) #define pde_cmpset(pdep, old, new) atomic_cmpset_64_i586(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_64_i586(ptep, pte) @@ -343,7 +268,7 @@ pmap_kextract(vm_offset_t va) extern pt_entry_t pg_nx; -#elif !defined(PAE) && !defined(PAE_TABLES) && !defined(XEN) +#else /* !(PAE || PAE_TABLES) */ #define pde_cmpset(pdep, old, new) atomic_cmpset_int(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_int(ptep, pte) @@ -352,7 +277,7 @@ extern pt_entry_t pg_nx; *(u_int *)(ptep) = (u_int)(pte); \ } while (0) -#endif /* PAE */ +#endif /* !(PAE || PAE_TABLES) */ #define pte_clear(ptep) pte_store(ptep, 0) diff --git a/sys/i386/include/segments.h b/sys/i386/include/segments.h index d67f2e0..635dffc 100644 --- a/sys/i386/include/segments.h +++ b/sys/i386/include/segments.h @@ -82,14 +82,8 @@ struct region_descriptor { #ifdef _KERNEL extern int _default_ldt; -#ifdef XEN -extern struct proc_ldt default_proc_ldt; -extern union descriptor *gdt; -extern union descriptor *ldt; -#else extern union descriptor gdt[]; extern union descriptor ldt[NLDT]; -#endif extern struct soft_segment_descriptor gdt_segs[]; extern struct gate_descriptor *idt; extern struct region_descriptor r_gdt, r_idt; diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index 0abbca4..71c830e 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -36,6 +36,37 @@ extern int mp_naps; extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; +extern int bootAP; +extern void *dpcpu; +extern char *bootSTK; +extern int bootAP; +extern void *bootstacks[]; +extern volatile u_int cpu_ipi_pending[]; +extern volatile int aps_ready; +extern struct mtx ap_boot_mtx; +extern int cpu_logical; +extern int cpu_cores; +extern volatile int smp_tlb_wait; +extern u_int xhits_gbl[]; +extern u_int xhits_pg[]; +extern u_int xhits_rng[]; +extern u_int ipi_global; +extern u_int ipi_page; +extern u_int ipi_range; +extern u_int ipi_range_size; +extern u_int ipi_masked_global; +extern u_int ipi_masked_page; +extern u_int ipi_masked_range; +extern u_int ipi_masked_range_size; + +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; + int cpu_disabled:1; + int cpu_hyperthread:1; +}; +extern struct cpu_info cpu_info[]; + #ifdef COUNT_IPIS extern u_long *ipi_invltlb_counts[MAXCPU]; extern u_long *ipi_invlrng_counts[MAXCPU]; @@ -56,11 +87,11 @@ inthand_t IDTVEC(rendezvous); /* handle CPU rendezvous */ /* functions in mp_machdep.c */ +void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); -#ifndef XEN void cpususpend_handler(void); -#endif +void init_secondary_tail(void); void invltlb_handler(void); void invlpg_handler(void); void invlrng_handler(void); @@ -68,13 +99,12 @@ void invlcache_handler(void); void init_secondary(void); void ipi_startup(int apic_id, int vector); void ipi_all_but_self(u_int ipi); -#ifndef XEN void ipi_bitmap_handler(struct trapframe frame); -#endif void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); +void set_interrupt_apic_ids(void); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr); @@ -83,10 +113,10 @@ void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, vm_offset_t endva); void smp_invltlb(void); void smp_masked_invltlb(cpuset_t mask); +void mem_range_AP_init(void); +void topo_probe(void); +void ipi_send_cpu(int cpu, u_int ipi); -#ifdef XEN -void ipi_to_irq_init(void); -#endif #endif /* !LOCORE */ #endif /* SMP */ diff --git a/sys/i386/include/vm.h b/sys/i386/include/vm.h index 6573e37..22d2eca 100644 --- a/sys/i386/include/vm.h +++ b/sys/i386/include/vm.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/i386/include/vmparam.h b/sys/i386/include/vmparam.h index 7775669..cb23b05 100644 --- a/sys/i386/include/vmparam.h +++ b/sys/i386/include/vmparam.h @@ -135,11 +135,7 @@ * Kernel physical load address. */ #ifndef KERNLOAD -#if defined(XEN) && !defined(XEN_PRIVILEGED_GUEST) -#define KERNLOAD 0 -#else #define KERNLOAD (1 << PDRSHIFT) -#endif #endif /* !defined(KERNLOAD) */ /* @@ -149,11 +145,7 @@ * messy at times, but hey, we'll do anything to save a page :-) */ -#ifdef XEN -#define VM_MAX_KERNEL_ADDRESS HYPERVISOR_VIRT_START -#else #define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1) -#endif #define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI) diff --git a/sys/i386/include/xen/features.h b/sys/i386/include/xen/features.h deleted file mode 100644 index fb4f680..0000000 --- a/sys/i386/include/xen/features.h +++ /dev/null @@ -1,22 +0,0 @@ -/****************************************************************************** - * features.h - * - * Query the features reported by Xen. - * - * Copyright (c) 2006, Ian Campbell - * - * $FreeBSD$ - */ - -#ifndef __ASM_XEN_FEATURES_H__ -#define __ASM_XEN_FEATURES_H__ - -#include <xen/interface/version.h> - -extern void setup_xen_features(void); - -extern uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; - -#define xen_feature(flag) (xen_features[flag]) - -#endif /* __ASM_XEN_FEATURES_H__ */ diff --git a/sys/i386/include/xen/hypercall.h b/sys/i386/include/xen/hypercall.h index c7e2a00..1c4d529 100644 --- a/sys/i386/include/xen/hypercall.h +++ b/sys/i386/include/xen/hypercall.h @@ -246,14 +246,8 @@ HYPERVISOR_memory_op( return _hypercall2(int, memory_op, cmd, arg); } -#if defined(XEN) -int HYPERVISOR_multicall(multicall_entry_t *, int); -static inline int -_HYPERVISOR_multicall( -#else /* XENHVM */ static inline int HYPERVISOR_multicall( -#endif void *call_list, int nr_calls) { return _hypercall2(int, multicall, call_list, nr_calls); diff --git a/sys/i386/include/xen/xen-os.h b/sys/i386/include/xen/xen-os.h index 3d1ef04..9b9b63f 100644 --- a/sys/i386/include/xen/xen-os.h +++ b/sys/i386/include/xen/xen-os.h @@ -44,105 +44,6 @@ static inline void rep_nop(void) } #define cpu_relax() rep_nop() -#ifndef XENHVM - -#ifdef SMP -extern int gdtset; - -#include <sys/time.h> /* XXX for pcpu.h */ -#include <sys/pcpu.h> /* XXX for PCPU_GET */ -static inline int -smp_processor_id(void) -{ - if (__predict_true(gdtset)) - return PCPU_GET(cpuid); - return 0; -} - -#else -#define smp_processor_id() 0 -#endif - -#ifndef PANIC_IF -#define PANIC_IF(exp) if (__predict_false(exp)) {printf("panic - %s: %s:%d\n",#exp, __FILE__, __LINE__); panic("%s: %s:%d", #exp, __FILE__, __LINE__);} -#endif - -/* - * Crude memory allocator for memory allocation early in boot. - */ -void *bootmem_alloc(unsigned int size); -void bootmem_free(void *ptr, unsigned int size); - -/* - * STI/CLI equivalents. These basically set and clear the virtual - * event_enable flag in the shared_info structure. Note that when - * the enable bit is set, there may be pending events to be handled. - * We may therefore call into do_hypervisor_callback() directly. - */ - -#define __cli() \ -do { \ - vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - -#define __sti() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if (__predict_false(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ -} while (0) - -#define __restore_flags(x) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ - barrier(); /* unmask then check (avoid races) */ \ - if (__predict_false(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ - } \ -} while (0) - -/* - * Add critical_{enter, exit}? - * - */ -#define __save_and_cli(x) \ -do { \ - vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ - (x) = _vcpu->evtchn_upcall_mask; \ - _vcpu->evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - - -#define cli() __cli() -#define sti() __sti() -#define save_flags(x) __save_flags(x) -#define restore_flags(x) __restore_flags(x) -#define save_and_cli(x) __save_and_cli(x) - -#define local_irq_save(x) __save_and_cli(x) -#define local_irq_restore(x) __restore_flags(x) -#define local_irq_disable() __cli() -#define local_irq_enable() __sti() - -#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));} -#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); } -#define spin_lock_irqsave mtx_lock_irqsave -#define spin_unlock_irqrestore mtx_unlock_irqrestore - -#endif /* !XENHVM */ - /* This is a barrier for the compiler only, NOT the processor! */ #define barrier() __asm__ __volatile__("": : :"memory") diff --git a/sys/i386/include/xen/xenfunc.h b/sys/i386/include/xen/xenfunc.h index f02ee12..f48b1f1 100644 --- a/sys/i386/include/xen/xenfunc.h +++ b/sys/i386/include/xen/xenfunc.h @@ -34,7 +34,6 @@ #include <vm/pmap.h> -#include <machine/xen/xenpmap.h> #include <machine/segments.h> #include <sys/pcpu.h> diff --git a/sys/i386/include/xen/xenpmap.h b/sys/i386/include/xen/xenpmap.h deleted file mode 100644 index 8287e72..0000000 --- a/sys/i386/include/xen/xenpmap.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004,2005 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * $FreeBSD$ - */ - -#ifndef _XEN_XENPMAP_H_ -#define _XEN_XENPMAP_H_ - -#if defined(XEN) -void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); -void xen_pt_switch(vm_paddr_t); -void xen_set_ldt(vm_paddr_t, unsigned long); -void xen_pgdpt_pin(vm_paddr_t); -void xen_pgd_pin(vm_paddr_t); -void xen_pgd_unpin(vm_paddr_t); -void xen_pt_pin(vm_paddr_t); -void xen_pt_unpin(vm_paddr_t); -void xen_flush_queue(void); -void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); -void pmap_suspend(void); -void pmap_resume(void); -void xen_check_queue(void); - -#ifdef INVARIANTS -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) -#else -#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) -#endif - - -#include <sys/param.h> -#include <sys/pcpu.h> - -#ifdef PMAP_DEBUG -#define PMAP_REF pmap_ref -#define PMAP_DEC_REF_PAGE pmap_dec_ref_page -#define PMAP_MARK_PRIV pmap_mark_privileged -#define PMAP_MARK_UNPRIV pmap_mark_unprivileged -#else -#define PMAP_MARK_PRIV(a) -#define PMAP_MARK_UNPRIV(a) -#define PMAP_REF(a, b) -#define PMAP_DEC_REF_PAGE(a) -#endif - -#define ALWAYS_SYNC 0 - -#ifdef PT_DEBUG -#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) -#else -#define PT_LOG() -#endif - -#define INVALID_P2M_ENTRY (~0UL) - -#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ - -#define SH_PD_SET_VA 1 -#define SH_PD_SET_VA_MA 2 -#define SH_PD_SET_VA_CLEAR 3 - -struct pmap; -void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); -#ifdef notyet -static vm_paddr_t -vptetomachpte(vm_paddr_t *pte) -{ - vm_offset_t offset, ppte; - vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; - int pgindex; - - ppte = (vm_offset_t)pte; - pgoffset = (ppte & PAGE_MASK); - offset = ppte - (vm_offset_t)PTmap; - pgindex = ppte >> PDRSHIFT; - - pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); - retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; - return (retval); -} -#endif -#define PT_GET(_ptp) \ - (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : (0)) - -#ifdef WRITABLE_PAGETABLES - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - PT_LOG(); \ - *(_ptp) = xpmap_ptom((_npte)); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - PT_LOG(); \ - *(_ptp) = (_npte); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - PT_LOG(); \ - *(_ptp) = 0; \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#else /* !WRITABLE_PAGETABLES */ - -#define PT_SET_VA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - xen_queue_pt_update(vtomach(_ptp), \ - xpmap_ptom(_npte)); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - xen_queue_pt_update(vtomach(_ptp), _npte); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PT_CLEAR_VA(_ptp, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - xen_queue_pt_update(vtomach(_ptp), 0); \ - if (sync || ALWAYS_SYNC) \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), xpmap_ptom(_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ - PMAP_REF((_ptp), (_npte)); \ - pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) -#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ - PMAP_REF((pt_entry_t *)(_ptp), 0); \ - pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ - if (sync || ALWAYS_SYNC) xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -#endif - -#define PT_SET_MA(_va, _ma) \ -do { \ - PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ - (_ma), \ - UVMF_INVLPG| UVMF_ALL) < 0); \ -} while (/*CONSTCOND*/0) - -#define PT_UPDATES_FLUSH() do { \ - xen_flush_queue(); \ -} while (/*CONSTCOND*/0) - -static __inline vm_paddr_t -xpmap_mtop(vm_paddr_t mpa) -{ - vm_paddr_t tmp = (mpa & PG_FRAME); - - return machtophys(tmp) | (mpa & ~PG_FRAME); -} - -static __inline vm_paddr_t -xpmap_ptom(vm_paddr_t ppa) -{ - vm_paddr_t tmp = (ppa & PG_FRAME); - - return phystomach(tmp) | (ppa & ~PG_FRAME); -} - -static __inline void -set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ -#ifdef notyet - PANIC_IF(max_mapnr && pfn >= max_mapnr); -#endif - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef notyet - PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); -#endif - return; - } - xen_phys_machine[pfn] = mfn; -} - -static __inline int -phys_to_machine_mapping_valid(unsigned long pfn) -{ - return xen_phys_machine[pfn] != INVALID_P2M_ENTRY; -} - -#endif /* !XEN */ - -#endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/i386/include/xen/xenstored.h b/sys/i386/include/xen/xenstored.h deleted file mode 100644 index e584fa5..0000000 --- a/sys/i386/include/xen/xenstored.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Simple prototyle Xen Store Daemon providing simple tree-like database. - * Copyright (C) 2005 Rusty Russell IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _XENSTORED_H -#define _XENSTORED_H - -enum xsd_sockmsg_type -{ - XS_DEBUG, - XS_SHUTDOWN, - XS_DIRECTORY, - XS_READ, - XS_GET_PERMS, - XS_WATCH, - XS_WATCH_ACK, - XS_UNWATCH, - XS_TRANSACTION_START, - XS_TRANSACTION_END, - XS_OP_READ_ONLY = XS_TRANSACTION_END, - XS_INTRODUCE, - XS_RELEASE, - XS_GETDOMAINPATH, - XS_WRITE, - XS_MKDIR, - XS_RM, - XS_SET_PERMS, - XS_WATCH_EVENT, - XS_ERROR, -}; - -#define XS_WRITE_NONE "NONE" -#define XS_WRITE_CREATE "CREATE" -#define XS_WRITE_CREATE_EXCL "CREATE|EXCL" - -/* We hand errors as strings, for portability. */ -struct xsd_errors -{ - int errnum; - const char *errstring; -}; -#define XSD_ERROR(x) { x, #x } -static struct xsd_errors xsd_errors[] __attribute__((unused)) = { - XSD_ERROR(EINVAL), - XSD_ERROR(EACCES), - XSD_ERROR(EEXIST), - XSD_ERROR(EISDIR), - XSD_ERROR(ENOENT), - XSD_ERROR(ENOMEM), - XSD_ERROR(ENOSPC), - XSD_ERROR(EIO), - XSD_ERROR(ENOTEMPTY), - XSD_ERROR(ENOSYS), - XSD_ERROR(EROFS), - XSD_ERROR(EBUSY), - XSD_ERROR(ETIMEDOUT), - XSD_ERROR(EISCONN), -}; -struct xsd_sockmsg -{ - uint32_t type; - uint32_t len; /* Length of data following this. */ - - /* Generally followed by nul-terminated string(s). */ -}; - -#endif /* _XENSTORED_H */ diff --git a/sys/i386/include/xen/xenvar.h b/sys/i386/include/xen/xenvar.h index 5694607..484c279 100644 --- a/sys/i386/include/xen/xenvar.h +++ b/sys/i386/include/xen/xenvar.h @@ -29,91 +29,8 @@ #ifndef XENVAR_H_ #define XENVAR_H_ -#include <machine/xen/features.h> - -#if defined(XEN) - -#define XBOOTUP 0x1 -#define XPMAP 0x2 -extern int xendebug_flags; -#ifndef NOXENDEBUG -/* Print directly to the Xen console during debugging. */ -#define XENPRINTF xc_printf -#else -#define XENPRINTF printf -#endif - -extern xen_pfn_t *xen_phys_machine; -extern xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; -extern xen_pfn_t *xen_pfn_to_mfn_frame_list_list; - -#if 0 -#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__) -#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__) -#define TRACE_DEBUG(argflags, _f, _a...) \ -if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a); -#else -#define TRACE_ENTER -#define TRACE_EXIT -#define TRACE_DEBUG(argflags, _f, _a...) -#endif - -extern xen_pfn_t *xen_machine_phys; -/* Xen starts physical pages after the 4MB ISA hole - - * FreeBSD doesn't - */ - - -#undef ADD_ISA_HOLE /* XXX */ - -#ifdef ADD_ISA_HOLE -#define ISA_INDEX_OFFSET 1024 -#define ISA_PDR_OFFSET 1 -#else -#define ISA_INDEX_OFFSET 0 -#define ISA_PDR_OFFSET 0 -#endif - - -#define PFNTOMFN(i) (xen_phys_machine[(i)]) -#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) - -#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) -#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) - -#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) -#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) - -#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) - -#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) -#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) - - -void xpq_init(void); - -#define BITS_PER_LONG 32 -#define NR_CPUS XEN_LEGACY_MAX_VCPUS - -#define BITS_TO_LONGS(bits) \ - (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) -#define DECLARE_BITMAP(name,bits) \ - unsigned long name[BITS_TO_LONGS(bits)] - -int xen_create_contiguous_region(vm_page_t pages, int npages); - -void xen_destroy_contiguous_region(void * addr, int npages); - -#elif defined(XENHVM) +#include <xen/features.h> #define vtomach(va) pmap_kextract((vm_offset_t) (va)) -#define PFNTOMFN(pa) (pa) -#define MFNTOPFN(ma) (ma) - -#define set_phys_to_machine(pfn, mfn) ((void)0) -#define phys_to_machine_mapping_valid(pfn) (TRUE) - -#endif /* !XEN && !XENHVM */ #endif diff --git a/sys/i386/isa/npx.c b/sys/i386/isa/npx.c index b0ea4e7..a7113e2 100644 --- a/sys/i386/isa/npx.c +++ b/sys/i386/isa/npx.c @@ -69,10 +69,6 @@ __FBSDID("$FreeBSD$"); #include <machine/ucontext.h> #include <machine/intr_machdep.h> -#ifdef XEN -#include <xen/xen-os.h> -#include <xen/hypervisor.h> -#endif #ifdef DEV_ISA #include <isa/isavar.h> @@ -157,13 +153,8 @@ void xsaveopt(char *addr, uint64_t mask); #endif /* __GNUCLIKE_ASM && !lint */ -#ifdef XEN -#define start_emulating() (HYPERVISOR_fpu_taskswitch(1)) -#define stop_emulating() (HYPERVISOR_fpu_taskswitch(0)) -#else #define start_emulating() load_cr0(rcr0() | CR0_TS) #define stop_emulating() clts() -#endif #ifdef CPU_ENABLE_SSE #define GET_FPU_CW(thread) \ diff --git a/sys/i386/pci/pci_cfgreg.c b/sys/i386/pci/pci_cfgreg.c index 5d57e89..2716a7a 100644 --- a/sys/i386/pci/pci_cfgreg.c +++ b/sys/i386/pci/pci_cfgreg.c @@ -93,9 +93,7 @@ static uint32_t pci_docfgregread(int bus, int slot, int func, int reg, int bytes); static int pcireg_cfgread(int bus, int slot, int func, int reg, int bytes); static void pcireg_cfgwrite(int bus, int slot, int func, int reg, int data, int bytes); -#ifndef XEN static int pcireg_cfgopen(void); -#endif static int pciereg_cfgread(int bus, unsigned slot, unsigned func, unsigned reg, unsigned bytes); static void pciereg_cfgwrite(int bus, unsigned slot, unsigned func, @@ -116,7 +114,6 @@ pci_i386_map_intline(int line) return (line); } -#ifndef XEN static u_int16_t pcibios_get_version(void) { @@ -137,7 +134,6 @@ pcibios_get_version(void) } return (args.ebx & 0xffff); } -#endif /* * Initialise access to PCI configuration space @@ -145,9 +141,6 @@ pcibios_get_version(void) int pci_cfgregopen(void) { -#ifdef XEN - return (0); -#else static int opened = 0; uint64_t pciebar; u_int16_t vid, did; @@ -202,7 +195,6 @@ pci_cfgregopen(void) } return(1); -#endif } static uint32_t @@ -390,7 +382,6 @@ pcireg_cfgwrite(int bus, int slot, int func, int reg, int data, int bytes) mtx_unlock_spin(&pcicfg_mtx); } -#ifndef XEN /* check whether the configuration mechanism has been correctly identified */ static int pci_cfgcheck(int maxdev) @@ -607,7 +598,6 @@ pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus) return (1); } -#endif /* !XEN */ #define PCIE_PADDR(base, reg, bus, slot, func) \ ((base) + \ diff --git a/sys/i386/pci/pci_pir.c b/sys/i386/pci/pci_pir.c index 0d64cab..6aeaae3 100644 --- a/sys/i386/pci/pci_pir.c +++ b/sys/i386/pci/pci_pir.c @@ -137,9 +137,6 @@ pci_pir_open(void) int i; uint8_t ck, *cv; -#ifdef XEN - return; -#else /* Don't try if we've already found a table. */ if (pci_route_table != NULL) return; @@ -150,7 +147,7 @@ pci_pir_open(void) sigaddr = bios_sigsearch(0, "_PIR", 4, 16, 0); if (sigaddr == 0) return; -#endif + /* If we found something, check the checksum and length. */ /* XXX - Use pmap_mapdev()? */ pt = (struct PIR_table *)(uintptr_t)BIOS_PADDRTOVADDR(sigaddr); @@ -481,11 +478,7 @@ pci_pir_biosroute(int bus, int device, int func, int pin, int irq) args.eax = PCIBIOS_ROUTE_INTERRUPT; args.ebx = (bus << 8) | (device << 3) | func; args.ecx = (irq << 8) | (0xa + pin); -#ifdef XEN - return (0); -#else return (bios32(&args, PCIbios.ventry, GSEL(GCODE_SEL, SEL_KPL))); -#endif } diff --git a/sys/i386/xen/clock.c b/sys/i386/xen/clock.c deleted file mode 100644 index ffb436e..0000000 --- a/sys/i386/xen/clock.c +++ /dev/null @@ -1,570 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz and Don Ahn. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* #define DELAYDEBUG */ -/* - * Routines to handle clock hardware. - */ - -#include "opt_ddb.h" -#include "opt_clock.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/bus.h> -#include <sys/clock.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/proc.h> -#include <sys/time.h> -#include <sys/timeet.h> -#include <sys/timetc.h> -#include <sys/kernel.h> -#include <sys/limits.h> -#include <sys/sysctl.h> -#include <sys/cons.h> -#include <sys/power.h> - -#include <machine/clock.h> -#include <machine/cputypes.h> -#include <machine/frame.h> -#include <machine/intr_machdep.h> -#include <machine/md_var.h> -#include <machine/psl.h> -#include <machine/pvclock.h> -#if defined(SMP) -#include <machine/smp.h> -#endif -#include <machine/specialreg.h> -#include <machine/timerreg.h> - -#include <x86/isa/icu.h> -#include <isa/isareg.h> -#include <isa/rtc.h> - -#include <vm/vm.h> -#include <vm/pmap.h> -#include <machine/pmap.h> -#include <xen/hypervisor.h> -#include <xen/xen-os.h> -#include <machine/xen/xenfunc.h> -#include <xen/interface/vcpu.h> -#include <machine/cpu.h> -#include <xen/xen_intr.h> - -/* - * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we - * can use a simple formula for leap years. - */ -#define LEAPYEAR(y) (!((y) % 4)) -#define DAYSPERYEAR (28+30*4+31*7) - -#ifndef TIMER_FREQ -#define TIMER_FREQ 1193182 -#endif - -#ifdef CYC2NS_SCALE_FACTOR -#undef CYC2NS_SCALE_FACTOR -#endif -#define CYC2NS_SCALE_FACTOR 10 - -/* Values for timerX_state: */ -#define RELEASED 0 -#define RELEASE_PENDING 1 -#define ACQUIRED 2 -#define ACQUIRE_PENDING 3 - -struct mtx clock_lock; -#define RTC_LOCK_INIT \ - mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE) -#define RTC_LOCK mtx_lock_spin(&clock_lock) -#define RTC_UNLOCK mtx_unlock_spin(&clock_lock) -#define NS_PER_TICK (1000000000ULL/hz) - -int adjkerntz; /* local offset from UTC in seconds */ -int clkintr_pending; -int pscnt = 1; -int psdiv = 1; -int wall_cmos_clock; -u_int timer_freq = TIMER_FREQ; -static u_long cyc2ns_scale; -static uint64_t processed_system_time; /* stime (ns) at last processing. */ - -#define do_div(n,base) ({ \ - unsigned long __upper, __low, __high, __mod, __base; \ - __base = (base); \ - __asm("":"=a" (__low), "=d" (__high):"A" (n)); \ - __upper = __high; \ - if (__high) { \ - __upper = __high % (__base); \ - __high = __high / (__base); \ - } \ - __asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ - __asm("":"=A" (n):"a" (__low),"d" (__high)); \ - __mod; \ -}) - - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return ((cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR); -} - -static uint32_t -getit(void) -{ - return (pvclock_get_last_cycles()); -} - - -/* - * XXX: timer needs more SMP work. - */ -void -i8254_init(void) -{ - - RTC_LOCK_INIT; -} - -/* - * Wait "n" microseconds. - * Relies on timer 1 counting down from (timer_freq / hz) - * Note: timer had better have been programmed before this is first used! - */ -void -i8254_delay(int n) -{ - int delta, ticks_left; - uint32_t tick, prev_tick; -#ifdef DELAYDEBUG - int getit_calls = 1; - int n1; - static int state = 0; - - if (state == 0) { - state = 1; - for (n1 = 1; n1 <= 10000000; n1 *= 10) - DELAY(n1); - state = 2; - } - if (state == 1) - printf("DELAY(%d)...", n); -#endif - /* - * Read the counter first, so that the rest of the setup overhead is - * counted. Guess the initial overhead is 20 usec (on most systems it - * takes about 1.5 usec for each of the i/o's in getit(). The loop - * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The - * multiplications and divisions to scale the count take a while). - * - * However, if ddb is active then use a fake counter since reading - * the i8254 counter involves acquiring a lock. ddb must not go - * locking for many reasons, but it calls here for at least atkbd - * input. - */ - prev_tick = getit(); - - n -= 0; /* XXX actually guess no initial overhead */ - /* - * Calculate (n * (timer_freq / 1e6)) without using floating point - * and without any avoidable overflows. - */ - if (n <= 0) - ticks_left = 0; - else if (n < 256) - /* - * Use fixed point to avoid a slow division by 1000000. - * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. - * 2^15 is the first power of 2 that gives exact results - * for n between 0 and 256. - */ - ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; - else - /* - * Don't bother using fixed point, although gcc-2.7.2 - * generates particularly poor code for the long long - * division, since even the slow way will complete long - * before the delay is up (unless we're interrupted). - */ - ticks_left = ((u_int)n * (long long)timer_freq + 999999) - / 1000000; - - while (ticks_left > 0) { - tick = getit(); -#ifdef DELAYDEBUG - ++getit_calls; -#endif - delta = tick - prev_tick; - prev_tick = tick; - if (delta < 0) { - /* - * Guard against timer0_max_count being wrong. - * This shouldn't happen in normal operation, - * but it may happen if set_timer_freq() is - * traced. - */ - /* delta += timer0_max_count; ??? */ - if (delta < 0) - delta = 0; - } - ticks_left -= delta; - } -#ifdef DELAYDEBUG - if (state == 1) - printf(" %d calls to getit() at %d usec each\n", - getit_calls, (n + 5) / getit_calls); -#endif -} - -void -startrtclock() -{ - uint64_t __cpu_khz; - uint32_t cpu_khz; - struct vcpu_time_info *info; - - __cpu_khz = 1000000ULL << 32; - info = &HYPERVISOR_shared_info->vcpu_info[0].time; - - (void)do_div(__cpu_khz, info->tsc_to_system_mul); - if ( info->tsc_shift < 0 ) - cpu_khz = __cpu_khz << -info->tsc_shift; - else - cpu_khz = __cpu_khz >> info->tsc_shift; - - printf("Xen reported: %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = - (2^32 * 1 / (clocks/us)) */ - - set_cyc2ns_scale(cpu_khz/1000); - tsc_freq = cpu_khz * 1000; -} - -/* - * RTC support routines - */ - - -static __inline int -readrtc(int port) -{ - return(bcd2bin(rtcin(port))); -} - - -#ifdef XEN_PRIVILEGED_GUEST - -/* - * Initialize the time of day register, based on the time base which is, e.g. - * from a filesystem. - */ -static void -domu_inittodr(time_t base) -{ - unsigned long sec; - int s, y; - struct timespec ts; - - update_wallclock(); - add_uptime_to_wallclock(); - - RTC_LOCK; - - if (base) { - ts.tv_sec = base; - ts.tv_nsec = 0; - tc_setclock(&ts); - } - - sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - y = time_second - shadow_tv.tv_sec; - if (y <= -2 || y >= 2) { - /* badly off, adjust it */ - tc_setclock(&shadow_tv); - } - RTC_UNLOCK; -} - -/* - * Write system time back to RTC. - */ -static void -domu_resettodr(void) -{ - unsigned long tm; - int s; - dom0_op_t op; - struct shadow_time_info *shadow; - struct pcpu *pc; - - pc = pcpu_find(smp_processor_id()); - shadow = &pc->pc_shadow_time; - if (xen_disable_rtc_set) - return; - - s = splclock(); - tm = time_second; - splx(s); - - tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - if ((xen_start_info->flags & SIF_INITDOMAIN) && - !independent_wallclock) - { - op.cmd = DOM0_SETTIME; - op.u.settime.secs = tm; - op.u.settime.nsecs = 0; - op.u.settime.system_time = shadow->system_timestamp; - HYPERVISOR_dom0_op(&op); - update_wallclock(); - add_uptime_to_wallclock(); - } else if (independent_wallclock) { - /* notyet */ - ; - } -} - -/* - * Initialize the time of day register, based on the time base which is, e.g. - * from a filesystem. - */ -void -inittodr(time_t base) -{ - unsigned long sec, days; - int year, month; - int y, m, s; - struct timespec ts; - - if (!(xen_start_info->flags & SIF_INITDOMAIN)) { - domu_inittodr(base); - return; - } - - if (base) { - s = splclock(); - ts.tv_sec = base; - ts.tv_nsec = 0; - tc_setclock(&ts); - splx(s); - } - - /* Look if we have a RTC present and the time is valid */ - if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) - goto wrong_time; - - /* wait for time update to complete */ - /* If RTCSA_TUP is zero, we have at least 244us before next update */ - s = splhigh(); - while (rtcin(RTC_STATUSA) & RTCSA_TUP) { - splx(s); - s = splhigh(); - } - - days = 0; -#ifdef USE_RTC_CENTURY - year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100; -#else - year = readrtc(RTC_YEAR) + 1900; - if (year < 1970) - year += 100; -#endif - if (year < 1970) { - splx(s); - goto wrong_time; - } - month = readrtc(RTC_MONTH); - for (m = 1; m < month; m++) - days += daysinmonth[m-1]; - if ((month > 2) && LEAPYEAR(year)) - days ++; - days += readrtc(RTC_DAY) - 1; - for (y = 1970; y < year; y++) - days += DAYSPERYEAR + LEAPYEAR(y); - sec = ((( days * 24 + - readrtc(RTC_HRS)) * 60 + - readrtc(RTC_MIN)) * 60 + - readrtc(RTC_SEC)); - /* sec now contains the number of seconds, since Jan 1 1970, - in the local time zone */ - - sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - y = time_second - sec; - if (y <= -2 || y >= 2) { - /* badly off, adjust it */ - ts.tv_sec = sec; - ts.tv_nsec = 0; - tc_setclock(&ts); - } - splx(s); - return; - - wrong_time: - printf("Invalid time in real time clock.\n"); - printf("Check and reset the date immediately!\n"); -} - - -/* - * Write system time back to RTC - */ -void -resettodr() -{ - unsigned long tm; - int y, m, s; - - if (!(xen_start_info->flags & SIF_INITDOMAIN)) { - domu_resettodr(); - return; - } - - if (xen_disable_rtc_set) - return; - - s = splclock(); - tm = time_second; - splx(s); - - /* Disable RTC updates and interrupts. */ - writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); - - /* Calculate local time to put in RTC */ - - tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ - writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ - writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ - - /* We have now the days since 01-01-1970 in tm */ - writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ - for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); - tm >= m; - y++, m = DAYSPERYEAR + LEAPYEAR(y)) - tm -= m; - - /* Now we have the years in y and the day-of-the-year in tm */ - writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ -#ifdef USE_RTC_CENTURY - writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... and Century */ -#endif - for (m = 0; ; m++) { - int ml; - - ml = daysinmonth[m]; - if (m == 1 && LEAPYEAR(y)) - ml++; - if (tm < ml) - break; - tm -= ml; - } - - writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ - writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ - - /* Reenable RTC updates and interrupts. */ - writertc(RTC_STATUSB, RTCSB_24HR); - rtcin(RTC_INTR); -} -#endif - -/* - * Start clocks running. - */ -void -cpu_initclocks(void) -{ - cpu_initclocks_bsp(); -} - -/* Return system time offset by ticks */ -uint64_t -get_system_time(int ticks) -{ - return (processed_system_time + (ticks * NS_PER_TICK)); -} - -int -timer_spkr_acquire(void) -{ - - return (0); -} - -int -timer_spkr_release(void) -{ - - return (0); -} - -void -timer_spkr_setfreq(int freq) -{ - -} - diff --git a/sys/i386/xen/exception.s b/sys/i386/xen/exception.s deleted file mode 100644 index 95f1c0e..0000000 --- a/sys/i386/xen/exception.s +++ /dev/null @@ -1,494 +0,0 @@ -/*- - * Copyright (c) 1989, 1990 William F. Jolitz. - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#include "opt_apic.h" -#include "opt_npx.h" - -#include <machine/asmacros.h> -#include <machine/psl.h> -#include <machine/trap.h> - -#include "assym.s" - -#define SEL_RPL_MASK 0x0002 -#define __HYPERVISOR_iret 23 - -/* Offsets into shared_info_t. */ - -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 6 - - -#ifdef SMP -#define GET_VCPU_INFO(reg) movl PCPU(CPUID),reg ; \ - shl $sizeof_vcpu_shift,reg ; \ - addl HYPERVISOR_shared_info,reg -#else -#define GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg -#endif - -#define __DISABLE_INTERRUPTS(reg) movb $1,evtchn_upcall_mask(reg) -#define __ENABLE_INTERRUPTS(reg) movb $0,evtchn_upcall_mask(reg) -#define DISABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ - __DISABLE_INTERRUPTS(reg) -#define ENABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ - __ENABLE_INTERRUPTS(reg) -#define __TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) - -#define POPA \ - popl %edi; \ - popl %esi; \ - popl %ebp; \ - popl %ebx; \ - popl %ebx; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - - .text - -/*****************************************************************************/ -/* Trap handling */ -/*****************************************************************************/ -/* - * Trap and fault vector routines. - * - * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on - * the stack that mostly looks like an interrupt, but does not disable - * interrupts. A few of the traps we are use are interrupt gates, - * SDT_SYS386IGT, which are nearly the same thing except interrupts are - * disabled on entry. - * - * The cpu will push a certain amount of state onto the kernel stack for - * the current process. The amount of state depends on the type of trap - * and whether the trap crossed rings or not. See i386/include/frame.h. - * At the very least the current EFLAGS (status register, which includes - * the interrupt disable state prior to the trap), the code segment register, - * and the return instruction pointer are pushed by the cpu. The cpu - * will also push an 'error' code for certain traps. We push a dummy - * error code for those traps where the cpu doesn't in order to maintain - * a consistent frame. We also push a contrived 'trap number'. - * - * The cpu does not push the general registers, we must do that, and we - * must restore them prior to calling 'iret'. The cpu adjusts the %cs and - * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we - * must load them with appropriate values for supervisor mode operation. - */ - -MCOUNT_LABEL(user) -MCOUNT_LABEL(btrap) - -#define TRAP(a) pushl $(a) ; jmp alltraps - -IDTVEC(div) - pushl $0; TRAP(T_DIVIDE) -IDTVEC(dbg) - pushl $0; TRAP(T_TRCTRAP) -IDTVEC(nmi) - pushl $0; TRAP(T_NMI) -IDTVEC(bpt) - pushl $0; TRAP(T_BPTFLT) -IDTVEC(ofl) - pushl $0; TRAP(T_OFLOW) -IDTVEC(bnd) - pushl $0; TRAP(T_BOUND) -IDTVEC(ill) - pushl $0; TRAP(T_PRIVINFLT) -IDTVEC(dna) - pushl $0; TRAP(T_DNA) -IDTVEC(fpusegm) - pushl $0; TRAP(T_FPOPFLT) -IDTVEC(tss) - TRAP(T_TSSFLT) -IDTVEC(missing) - TRAP(T_SEGNPFLT) -IDTVEC(stk) - TRAP(T_STKFLT) -IDTVEC(prot) - TRAP(T_PROTFLT) -IDTVEC(page) - TRAP(T_PAGEFLT) -IDTVEC(mchk) - pushl $0; TRAP(T_MCHK) -IDTVEC(rsvd) - pushl $0; TRAP(T_RESERVED) -IDTVEC(fpu) - pushl $0; TRAP(T_ARITHTRAP) -IDTVEC(align) - TRAP(T_ALIGNFLT) -IDTVEC(xmm) - pushl $0; TRAP(T_XMMFLT) - -IDTVEC(hypervisor_callback) - pushl $0; - pushl $0; - pushal - pushl %ds - pushl %es - pushl %fs -upcall_with_regs_pushed: - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) -call_evtchn_upcall: - movl TF_EIP(%esp),%eax - cmpl $scrit,%eax - jb 10f - cmpl $ecrit,%eax - jb critical_region_fixup - -10: pushl %esp - call xen_intr_handle_upcall - addl $4,%esp - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - - -hypervisor_callback_pending: - DISABLE_INTERRUPTS(%esi) /* cli */ - jmp 10b - /* - * alltraps entry point. Interrupts are enabled if this was a trap - * gate (TGT), else disabled if this was an interrupt gate (IGT). - * Note that int0x80_syscall is a trap gate. Only page faults - * use an interrupt gate. - */ - SUPERALIGN_TEXT - .globl alltraps - .type alltraps,@function -alltraps: - pushal - pushl %ds - pushl %es - pushl %fs - -alltraps_with_regs_pushed: - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - -calltrap: - push %esp - call trap - add $4, %esp - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - -/* - * SYSCALL CALL GATE (old entry point for a.out binaries) - * - * The intersegment call has been set up to specify one dummy parameter. - * - * This leaves a place to put eflags so that the call frame can be - * converted to a trap frame. Note that the eflags is (semi-)bogusly - * pushed into (what will be) tf_err and then copied later into the - * final spot. It has to be done this way because esp can't be just - * temporarily altered for the pushfl - an interrupt might come in - * and clobber the saved cs/eip. - */ - SUPERALIGN_TEXT -IDTVEC(lcall_syscall) - pushfl /* save eflags */ - popl 8(%esp) /* shuffle into tf_eflags */ - pushl $7 /* sizeof "lcall 7,0" */ - subl $4,%esp /* skip over tf_trapno */ - pushal - pushl %ds - pushl %es - pushl %fs - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti - -/* - * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) - * - * Even though the name says 'int0x80', this is actually a TGT (trap gate) - * rather then an IGT (interrupt gate). Thus interrupts are enabled on - * entry just as they are for a normal syscall. - */ - SUPERALIGN_TEXT -IDTVEC(int0x80_syscall) - pushl $2 /* sizeof "int 0x80" */ - pushl $0xBEEF /* for debug */ - pushal - pushl %ds - pushl %es - pushl %fs - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti - -ENTRY(fork_trampoline) - pushl %esp /* trapframe pointer */ - pushl %ebx /* arg1 */ - pushl %esi /* function */ - call fork_exit - addl $12,%esp - /* cut from syscall */ - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - - -/* - * To efficiently implement classification of trap and interrupt handlers - * for profiling, there must be only trap handlers between the labels btrap - * and bintr, and only interrupt handlers between the labels bintr and - * eintr. This is implemented (partly) by including files that contain - * some of the handlers. Before including the files, set up a normal asm - * environment so that the included files doen't need to know that they are - * included. - */ - - .data - .p2align 4 - .text - SUPERALIGN_TEXT -MCOUNT_LABEL(bintr) - -#ifdef DEV_APIC - .data - .p2align 4 - .text - SUPERALIGN_TEXT - -#include <i386/i386/apic_vector.s> -#endif - - .data - .p2align 4 - .text - SUPERALIGN_TEXT -#include <i386/i386/vm86bios.s> - - .text -MCOUNT_LABEL(eintr) - -/* - * void doreti(struct trapframe) - * - * Handle return from interrupts, traps and syscalls. - */ - .text - SUPERALIGN_TEXT - .type doreti,@function -doreti: - FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ -doreti_next: -#ifdef notyet - /* - * Check if ASTs can be handled now. PSL_VM must be checked first - * since segment registers only have an RPL in non-VM86 mode. - */ - testl $PSL_VM,TF_EFLAGS(%esp) /* are we in vm86 mode? */ - jz doreti_notvm86 - movl PCPU(CURPCB),%ecx - testl $PCB_VM86CALL,PCB_FLAGS(%ecx) /* are we in a vm86 call? */ - jz doreti_ast /* can handle ASTS now if not */ - jmp doreti_exit - -doreti_notvm86: -#endif - testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ - jz doreti_exit /* can't handle ASTs now if not */ - -doreti_ast: - /* - * Check for ASTs atomically with returning. Disabling CPU - * interrupts provides sufficient locking even in the SMP case, - * since we will be informed of any new ASTs by an IPI. - */ - DISABLE_INTERRUPTS(%esi) /* cli */ - movl PCPU(CURTHREAD),%eax - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) - je doreti_exit - ENABLE_INTERRUPTS(%esi) /* sti */ - pushl %esp /* pass a pointer to the trapframe */ - call ast - add $4,%esp - jmp doreti_ast - - /* - * doreti_exit: pop registers, iret. - * - * The segment register pop is a special case, since it may - * fault if (for example) a sigreturn specifies bad segment - * registers. The fault is handled in trap.c. - */ -doreti_exit: - ENABLE_INTERRUPTS(%esi) # reenable event callbacks (sti) - - .globl scrit -scrit: - __TEST_PENDING(%esi) - jnz hypervisor_callback_pending /* More to go */ - - MEXITCOUNT - - .globl doreti_popl_fs -doreti_popl_fs: - popl %fs - .globl doreti_popl_es -doreti_popl_es: - popl %es - .globl doreti_popl_ds -doreti_popl_ds: - popl %ds - - /* - * This is important: as nothing is atomic over here (we can get - * interrupted any time), we use the critical_region_fixup() in - * order to figure out where out stack is. Therefore, do NOT use - * 'popal' here without fixing up the table! - */ - POPA - addl $8,%esp - .globl doreti_iret -doreti_iret: - jmp hypercall_page + (__HYPERVISOR_iret * 32) - .globl ecrit -ecrit: - /* - * doreti_iret_fault and friends. Alternative return code for - * the case where we get a fault in the doreti_exit code - * above. trap() (i386/i386/trap.c) catches this specific - * case, sends the process a signal and continues in the - * corresponding place in the code below. - */ - ALIGN_TEXT - .globl doreti_iret_fault -doreti_iret_fault: - subl $8,%esp - pushal - pushl %ds - .globl doreti_popl_ds_fault -doreti_popl_ds_fault: - pushl %es - .globl doreti_popl_es_fault -doreti_popl_es_fault: - pushl %fs - .globl doreti_popl_fs_fault -doreti_popl_fs_fault: - movl $0,TF_ERR(%esp) /* XXX should be the error code */ - movl $T_PROTFLT,TF_TRAPNO(%esp) - jmp alltraps_with_regs_pushed - - /* -# [How we do the fixup]. We want to merge the current stack frame with the -# just-interrupted frame. How we do this depends on where in the critical -# region the interrupted handler was executing, and so how many saved -# registers are in each frame. We do this quickly using the lookup table -# 'critical_fixup_table'. For each byte offset in the critical region, it -# provides the number of bytes which have already been popped from the -# interrupted stack frame. -*/ - -.globl critical_region_fixup -critical_region_fixup: - addl $critical_fixup_table-scrit,%eax - movzbl (%eax),%eax # %eax contains num bytes popped - movl %esp,%esi - add %eax,%esi # %esi points at end of src region - movl %esp,%edi - add $0x40,%edi # %edi points at end of dst region - movl %eax,%ecx - shr $2,%ecx # convert bytes to words - je 16f # skip loop if nothing to copy -15: subl $4,%esi # pre-decrementing copy loop - subl $4,%edi - movl (%esi),%eax - movl %eax,(%edi) - loop 15b -16: movl %edi,%esp # final %edi is top of merged stack - jmp hypervisor_callback_pending - - -critical_fixup_table: -.byte 0x0,0x0,0x0 #testb $0x1,(%esi) -.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea -.byte 0x0,0x0 #pop %fs -.byte 0x04 #pop %es -.byte 0x08 #pop %ds -.byte 0x0c #pop %edi -.byte 0x10 #pop %esi -.byte 0x14 #pop %ebp -.byte 0x18 #pop %ebx -.byte 0x1c #pop %ebx -.byte 0x20 #pop %edx -.byte 0x24 #pop %ecx -.byte 0x28 #pop %eax -.byte 0x2c,0x2c,0x2c #add $0x8,%esp -#if 0 - .byte 0x34 #iret -#endif -.byte 0x34,0x34,0x34,0x34,0x34 #HYPERVISOR_iret - - -/* # Hypervisor uses this for application faults while it executes.*/ -ENTRY(failsafe_callback) - pushal - call xen_failsafe_handler -/*# call install_safe_pf_handler */ - movl 28(%esp),%ebx -1: movl %ebx,%ds - movl 32(%esp),%ebx -2: movl %ebx,%es - movl 36(%esp),%ebx -3: movl %ebx,%fs - movl 40(%esp),%ebx -4: movl %ebx,%gs -/*# call install_normal_pf_handler */ - popal - addl $12,%esp - iret - - diff --git a/sys/i386/xen/locore.s b/sys/i386/xen/locore.s deleted file mode 100644 index 7e67684..0000000 --- a/sys/i386/xen/locore.s +++ /dev/null @@ -1,360 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 - * $FreeBSD$ - * - * originally from: locore.s, by William F. Jolitz - * - * Substantially rewritten by David Greenman, Rod Grimes, - * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp - * and many others. - */ - -#include "opt_bootp.h" -#include "opt_compat.h" -#include "opt_nfsroot.h" -#include "opt_pmap.h" - -#include <sys/syscall.h> -#include <sys/reboot.h> - -#include <machine/asmacros.h> -#include <machine/cputypes.h> -#include <machine/psl.h> -#include <machine/pmap.h> -#include <machine/specialreg.h> - -#define __ASSEMBLY__ -#include <xen/interface/elfnote.h> - -/* The defines below have been lifted out of <machine/xen-public/arch-x86_32.h> */ -#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ -#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ -#define KERNEL_CS FLAT_RING1_CS -#define KERNEL_DS FLAT_RING1_DS - -#include "assym.s" - -.section __xen_guest - .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" - .byte 0 - - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") - ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) - ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, XEN_HYPERVISOR_VIRT_START) -#if 0 - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") -#endif - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") - -#ifdef PAE - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") - ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") - ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) -#endif - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") - ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) - - - -/* - * XXX - * - * Note: This version greatly munged to avoid various assembler errors - * that may be fixed in newer versions of gas. Perhaps newer versions - * will have more pleasant appearance. - */ - -/* - * PTmap is recursive pagemap at top of virtual address space. - * Within PTmap, the page directory can be found (third indirection). - */ - .globl PTmap,PTD,PTDpde - .set PTmap,(PTDPTDI << PDRSHIFT) - .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) - .set PTDpde,PTD + (PTDPTDI * PDESIZE) - -/* - * Compiled KERNBASE location and the kernel load address - */ - .globl kernbase - .set kernbase,KERNBASE - .globl kernload - .set kernload,KERNLOAD - -/* - * Globals - */ - .data - ALIGN_DATA /* just to be sure */ - - .space 0x2000 /* space for tmpstk - temporary stack */ -tmpstk: - - .globl bootinfo -bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ - - .globl KERNend -KERNend: .long 0 /* phys addr end of kernel (just after bss) */ - .globl physfree -physfree: .long 0 /* phys addr of next free page */ - - .globl IdlePTD -IdlePTD: .long 0 /* phys addr of kernel PTD */ - -#ifdef PAE - .globl IdlePDPT -IdlePDPT: .long 0 /* phys addr of kernel PDPT */ -#endif - -#ifdef SMP - .globl KPTphys -#endif -KPTphys: .long 0 /* phys addr of kernel page tables */ - .globl gdtset -gdtset: .long 0 /* GDT is valid */ - - .globl proc0kstack -proc0kstack: .long 0 /* address of proc 0 kstack space */ -p0kpa: .long 0 /* phys addr of proc0's STACK */ - -vm86phystk: .long 0 /* PA of vm86/bios stack */ - - .globl vm86paddr, vm86pa -vm86paddr: .long 0 /* address of vm86 region */ -vm86pa: .long 0 /* phys addr of vm86 region */ - -#ifdef PC98 - .globl pc98_system_parameter -pc98_system_parameter: - .space 0x240 -#endif - - .globl avail_space -avail_space: .long 0 - -/********************************************************************** - * - * Some handy macros - * - */ - -/* - * We're already in protected mode, so no remapping is needed. - */ -#define R(foo) (foo) - -#define ALLOCPAGES(foo) \ - movl R(physfree), %esi ; \ - movl $((foo)*PAGE_SIZE), %eax ; \ - addl %esi, %eax ; \ - movl %eax, R(physfree) ; \ - movl %esi, %edi ; \ - movl $((foo)*PAGE_SIZE),%ecx ; \ - xorl %eax,%eax ; \ - cld ; \ - rep ; \ - stosb - -/* - * fillkpt - * eax = page frame address - * ebx = index into page table - * ecx = how many pages to map - * base = base address of page dir/table - * prot = protection bits - */ -#define fillkpt(base, prot) \ - shll $PTESHIFT,%ebx ; \ - addl base,%ebx ; \ - orl $PG_V,%eax ; \ - orl prot,%eax ; \ -1: movl %eax,(%ebx) ; \ - addl $PAGE_SIZE,%eax ; /* increment physical address */ \ - addl $PTESIZE,%ebx ; /* next pte */ \ - loop 1b - -/* - * fillkptphys(prot) - * eax = physical address - * ecx = how many pages to map - * prot = protection bits - */ -#define fillkptphys(prot) \ - movl %eax, %ebx ; \ - shrl $PAGE_SHIFT, %ebx ; \ - fillkpt(R(KPTphys), prot) - -/* Temporary stack */ -.space 8192 -tmpstack: - .long tmpstack, KERNEL_DS - - .text - -.p2align 12, 0x90 - -#define HYPERCALL_PAGE_OFFSET 0x1000 -.org HYPERCALL_PAGE_OFFSET -ENTRY(hypercall_page) - .cfi_startproc - .skip 0x1000 - .cfi_endproc - -/********************************************************************** - * - * This is where the bootblocks start us, set the ball rolling... - * - */ -NON_GPROF_ENTRY(btext) - /* At the end of our stack, we shall have free space - so store it */ - movl %esp,%ebx - movl %ebx,R(avail_space) - - lss tmpstack,%esp - - pushl %esi - call initvalues - popl %esi - - /* Store the CPUID information */ - xorl %eax,%eax - cpuid # cpuid 0 - movl %eax,R(cpu_high) # highest capability - movl %ebx,R(cpu_vendor) # store vendor string - movl %edx,R(cpu_vendor+4) - movl %ecx,R(cpu_vendor+8) - movb $0,R(cpu_vendor+12) - - movl $1,%eax - cpuid # cpuid 1 - movl %eax,R(cpu_id) # store cpu_id - movl %ebx,R(cpu_procinfo) # store cpu_procinfo - movl %edx,R(cpu_feature) # store cpu_feature - movl %ecx,R(cpu_feature2) # store cpu_feature2 - rorl $8,%eax # extract family type - andl $15,%eax - cmpl $5,%eax - movl $CPU_686,R(cpu) - - movl proc0kstack,%eax - leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp - xorl %ebp,%ebp /* mark end of frames */ -#ifdef PAE - movl IdlePDPT,%esi -#else - movl IdlePTD,%esi -#endif - movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) - pushl physfree - call init386 - addl $4, %esp - call mi_startup - /* NOTREACHED */ - int $3 - -/* - * Signal trampoline, copied to top of user stack - */ -NON_GPROF_ENTRY(sigcode) - calll *SIGF_HANDLER(%esp) - leal SIGF_UC(%esp),%eax /* get ucontext */ - pushl %eax - testl $PSL_VM,UC_EFLAGS(%eax) - jne 1f - mov UC_GS(%eax), %gs /* restore %gs */ -1: - movl $SYS_sigreturn,%eax - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ - /* on stack */ -1: - jmp 1b - -#ifdef COMPAT_FREEBSD4 - ALIGN_TEXT -freebsd4_sigcode: - calll *SIGF_HANDLER(%esp) - leal SIGF_UC4(%esp),%eax /* get ucontext */ - pushl %eax - testl $PSL_VM,UC4_EFLAGS(%eax) - jne 1f - mov UC4_GS(%eax),%gs /* restore %gs */ -1: - movl $344,%eax /* 4.x SYS_sigreturn */ - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ - /* on stack */ -1: - jmp 1b -#endif - -#ifdef COMPAT_43 - ALIGN_TEXT -osigcode: - call *SIGF_HANDLER(%esp) /* call signal handler */ - lea SIGF_SC(%esp),%eax /* get sigcontext */ - pushl %eax - testl $PSL_VM,SC_PS(%eax) - jne 9f - movl SC_GS(%eax),%gs /* restore %gs */ -9: - movl $103,%eax /* 3.x SYS_sigreturn */ - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ -0: jmp 0b -#endif /* COMPAT_43 */ - - ALIGN_TEXT -esigcode: - - .data - .globl szsigcode -szsigcode: - .long esigcode-sigcode -#ifdef COMPAT_FREEBSD4 - .globl szfreebsd4_sigcode -szfreebsd4_sigcode: - .long esigcode-freebsd4_sigcode -#endif -#ifdef COMPAT_43 - .globl szosigcode -szosigcode: - .long esigcode-osigcode -#endif diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c deleted file mode 100644 index 329bba1..0000000 --- a/sys/i386/xen/mp_machdep.c +++ /dev/null @@ -1,1300 +0,0 @@ -/*- - * Copyright (c) 1996, by Steve Passe - * Copyright (c) 2008, by Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the developer may NOT be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_apic.h" -#include "opt_cpu.h" -#include "opt_kstack_pages.h" -#include "opt_mp_watchdog.h" -#include "opt_pmap.h" -#include "opt_sched.h" -#include "opt_smp.h" - -#if !defined(lint) -#if !defined(SMP) -#error How did you get here? -#endif - -#ifndef DEV_APIC -#error The apic device is required for SMP, add "device apic" to your config file. -#endif -#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) -#error SMP not supported with CPU_DISABLE_CMPXCHG -#endif -#endif /* not lint */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/bus.h> -#include <sys/cons.h> /* cngetc() */ -#include <sys/cpuset.h> -#ifdef GPROF -#include <sys/gmon.h> -#endif -#include <sys/kernel.h> -#include <sys/ktr.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/memrange.h> -#include <sys/mutex.h> -#include <sys/pcpu.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/sched.h> -#include <sys/smp.h> -#include <sys/sysctl.h> - -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/pmap.h> -#include <vm/vm_kern.h> -#include <vm/vm_extern.h> -#include <vm/vm_page.h> - -#include <x86/apicreg.h> -#include <machine/md_var.h> -#include <machine/mp_watchdog.h> -#include <machine/pcb.h> -#include <machine/psl.h> -#include <machine/smp.h> -#include <machine/specialreg.h> -#include <machine/pcpu.h> - -#include <xen/xen-os.h> -#include <xen/evtchn.h> -#include <xen/xen_intr.h> -#include <xen/hypervisor.h> -#include <xen/interface/vcpu.h> - -/*---------------------------- Extern Declarations ---------------------------*/ -extern struct pcpu __pcpu[]; - -extern void Xhypervisor_callback(void); -extern void failsafe_callback(void); - -/*--------------------------- Forward Declarations ---------------------------*/ -static driver_filter_t smp_reschedule_interrupt; -static driver_filter_t smp_call_function_interrupt; -static void assign_cpu_ids(void); -static void set_interrupt_apic_ids(void); -static int start_all_aps(void); -static int start_ap(int apic_id); -static void release_aps(void *dummy); - -/*---------------------------------- Macros ----------------------------------*/ -#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) - -/*-------------------------------- Local Types -------------------------------*/ -typedef void call_data_func_t(uintptr_t , uintptr_t); - -struct cpu_info { - int cpu_present:1; - int cpu_bsp:1; - int cpu_disabled:1; -}; - -struct xen_ipi_handler -{ - driver_filter_t *filter; - const char *description; -}; - -enum { - RESCHEDULE_VECTOR, - CALL_FUNCTION_VECTOR, -}; - -/*-------------------------------- Global Data -------------------------------*/ -static u_int hyperthreading_cpus; -static cpuset_t hyperthreading_cpus_mask; - -int mp_naps; /* # of Applications processors */ -int boot_cpu_id = -1; /* designated BSP */ - -static int bootAP; -static union descriptor *bootAPgdt; - -/* Free these after use */ -void *bootstacks[MAXCPU]; - -struct pcb stoppcbs[MAXCPU]; - -/* Variables needed for SMP tlb shootdown. */ -vm_offset_t smp_tlb_addr1; -vm_offset_t smp_tlb_addr2; -volatile int smp_tlb_wait; - -static u_int logical_cpus; -static volatile cpuset_t ipi_nmi_pending; - -/* used to hold the AP's until we are ready to release them */ -static struct mtx ap_boot_mtx; - -/* Set to 1 once we're ready to let the APs out of the pen. */ -static volatile int aps_ready = 0; - -/* - * Store data from cpu_add() until later in the boot when we actually setup - * the APs. - */ -static struct cpu_info cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; -int apic_cpuids[MAX_APIC_ID + 1]; - -/* Holds pending bitmap based IPIs per CPU */ -static volatile u_int cpu_ipi_pending[MAXCPU]; - -static int cpu_logical; -static int cpu_cores; - -static const struct xen_ipi_handler xen_ipis[] = -{ - [RESCHEDULE_VECTOR] = { smp_reschedule_interrupt, "resched" }, - [CALL_FUNCTION_VECTOR] = { smp_call_function_interrupt,"callfunc" } -}; - -/*------------------------------- Per-CPU Data -------------------------------*/ -DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); -DPCPU_DEFINE(struct vcpu_info *, vcpu_info); - -/*------------------------------ Implementation ------------------------------*/ -struct cpu_group * -cpu_topo(void) -{ - if (cpu_cores == 0) - cpu_cores = 1; - if (cpu_logical == 0) - cpu_logical = 1; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); - } - /* - * No multi-core or hyper-threaded. - */ - if (cpu_logical * cpu_cores == 1) - return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, - CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); -} - -/* - * Calculate usable address in base memory for AP trampoline code. - */ -u_int -mp_bootaddress(u_int basemem) -{ - - return (basemem); -} - -void -cpu_add(u_int apic_id, char boot_cpu) -{ - - if (apic_id > MAX_APIC_ID) { - panic("SMP: APIC ID %d too high", apic_id); - return; - } - KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", - apic_id)); - cpu_info[apic_id].cpu_present = 1; - if (boot_cpu) { - KASSERT(boot_cpu_id == -1, - ("CPU %d claims to be BSP, but CPU %d already is", apic_id, - boot_cpu_id)); - boot_cpu_id = apic_id; - cpu_info[apic_id].cpu_bsp = 1; - } - if (mp_ncpus < MAXCPU) - mp_ncpus++; - if (bootverbose) - printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : - "AP"); -} - -void -cpu_mp_setmaxid(void) -{ - - mp_maxid = MAXCPU - 1; -} - -int -cpu_mp_probe(void) -{ - - /* - * Always record BSP in CPU map so that the mbuf init code works - * correctly. - */ - CPU_SETOF(0, &all_cpus); - if (mp_ncpus == 0) { - /* - * No CPUs were found, so this must be a UP system. Setup - * the variables to represent a system with a single CPU - * with an id of 0. - */ - mp_ncpus = 1; - return (0); - } - - /* At least one CPU was found. */ - if (mp_ncpus == 1) { - /* - * One CPU was found, so this must be a UP system with - * an I/O APIC. - */ - return (0); - } - - /* At least two CPUs were found. */ - return (1); -} - -/* - * Initialize the IPI handlers and start up the AP's. - */ -void -cpu_mp_start(void) -{ - int i; - - /* Initialize the logical ID to APIC ID table. */ - for (i = 0; i < MAXCPU; i++) { - cpu_apic_ids[i] = -1; - cpu_ipi_pending[i] = 0; - } - - /* Set boot_cpu_id if needed. */ - if (boot_cpu_id == -1) { - boot_cpu_id = PCPU_GET(apic_id); - cpu_info[boot_cpu_id].cpu_bsp = 1; - } else - KASSERT(boot_cpu_id == PCPU_GET(apic_id), - ("BSP's APIC ID doesn't match boot_cpu_id")); - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - - assign_cpu_ids(); - - /* Start each Application Processor */ - start_all_aps(); - - /* Setup the initial logical CPUs info. */ - logical_cpus = 0; - CPU_ZERO(&logical_cpus_mask); - if (cpu_feature & CPUID_HTT) - logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; - - set_interrupt_apic_ids(); -} - - -static void -iv_rendezvous(uintptr_t a, uintptr_t b) -{ - smp_rendezvous_action(); -} - -static void -iv_invltlb(uintptr_t a, uintptr_t b) -{ - xen_tlb_flush(); -} - -static void -iv_invlpg(uintptr_t a, uintptr_t b) -{ - xen_invlpg(a); -} - -static void -iv_invlrng(uintptr_t a, uintptr_t b) -{ - vm_offset_t start = (vm_offset_t)a; - vm_offset_t end = (vm_offset_t)b; - - while (start < end) { - xen_invlpg(start); - start += PAGE_SIZE; - } -} - - -static void -iv_invlcache(uintptr_t a, uintptr_t b) -{ - - wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); -} - -/* - * These start from "IPI offset" APIC_IPI_INTS - */ -static call_data_func_t *ipi_vectors[5] = -{ - iv_rendezvous, - iv_invltlb, - iv_invlpg, - iv_invlrng, - iv_invlcache, -}; - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -static int -smp_reschedule_interrupt(void *unused) -{ - int cpu = PCPU_GET(cpuid); - u_int ipi_bitmap; - - ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); - - if (ipi_bitmap & (1 << IPI_PREEMPT)) { -#ifdef COUNT_IPIS - (*ipi_preempt_counts[cpu])++; -#endif - sched_preempt(curthread); - } - - if (ipi_bitmap & (1 << IPI_AST)) { -#ifdef COUNT_IPIS - (*ipi_ast_counts[cpu])++; -#endif - /* Nothing to do for AST */ - } - return (FILTER_HANDLED); -} - -struct _call_data { - uint16_t func_id; - uint16_t wait; - uintptr_t arg1; - uintptr_t arg2; - atomic_t started; - atomic_t finished; -}; - -static struct _call_data *call_data; - -static int -smp_call_function_interrupt(void *unused) -{ - call_data_func_t *func; - uintptr_t arg1 = call_data->arg1; - uintptr_t arg2 = call_data->arg2; - int wait = call_data->wait; - atomic_t *started = &call_data->started; - atomic_t *finished = &call_data->finished; - - /* We only handle function IPIs, not bitmap IPIs */ - if (call_data->func_id < APIC_IPI_INTS || - call_data->func_id > IPI_BITMAP_VECTOR) - panic("invalid function id %u", call_data->func_id); - - func = ipi_vectors[IPI_TO_IDX(call_data->func_id)]; - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - (*func)(arg1, arg2); - - if (wait) { - mb(); - atomic_inc(finished); - } - atomic_add_int(&smp_tlb_wait, 1); - return (FILTER_HANDLED); -} - -/* - * Print various information about the SMP system hardware and setup. - */ -void -cpu_mp_announce(void) -{ - int i, x; - - /* List CPUs */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) - continue; - if (cpu_info[x].cpu_disabled) - printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); - else { - KASSERT(i < mp_ncpus, - ("mp_ncpus and actual cpus are out of whack")); - printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); - } - } -} - -static int -xen_smp_cpu_init(unsigned int cpu) -{ - xen_intr_handle_t *ipi_handle; - const struct xen_ipi_handler *ipi; - int idx, rc; - - ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); - for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { - - /* - * The PCPU variable pc_device is not initialized on i386 PV, - * so we have to use the root_bus device in order to setup - * the IPIs. - */ - rc = xen_intr_alloc_and_bind_ipi(root_bus, cpu, - ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); - if (rc != 0) { - printf("Unable to allocate a XEN IPI port. " - "Error %d\n", rc); - break; - } - xen_intr_describe(ipi_handle[idx], "%s", ipi->description); - } - - for (;idx < nitems(xen_ipis); idx++) - ipi_handle[idx] = NULL; - - if (rc == 0) - return (0); - - /* Either all are successfully mapped, or none at all. */ - for (idx = 0; idx < nitems(xen_ipis); idx++) { - if (ipi_handle[idx] == NULL) - continue; - - xen_intr_unbind(ipi_handle[idx]); - ipi_handle[idx] = NULL; - } - - return (rc); -} - -static void -xen_smp_intr_init_cpus(void *unused) -{ - int i; - - for (i = 0; i < mp_ncpus; i++) - xen_smp_cpu_init(i); -} - -static void -xen_smp_intr_setup_cpus(void *unused) -{ - int i; - - for (i = 0; i < mp_ncpus; i++) - DPCPU_ID_SET(i, vcpu_info, - &HYPERVISOR_shared_info->vcpu_info[i]); -} - -#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) - -/* - * AP CPU's call this to initialize themselves. - */ -void -init_secondary(void) -{ - vm_offset_t addr; - u_int cpuid; - int gsel_tss; - - - /* bootAP is set in start_ap() to our ID. */ - PCPU_SET(currentldt, _default_ldt); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); -#if 0 - gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; -#endif - PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); -#if 0 - PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); - - PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); -#endif - PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - - /* - * Set to a known state: - * Set by mpboot.s: CR0_PG, CR0_PE - * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM - */ - /* - * signal our startup to the BSP. - */ - mp_naps++; - - /* Spin until the BSP releases the AP's. */ - while (!aps_ready) - ia32_pause(); - - /* BSP may have changed PTD while we were waiting */ - invltlb(); - for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) - invlpg(addr); - -#if 0 - /* set up SSE/NX */ - initializecpu(); -#endif - - /* set up FPU state on the AP */ - npxinit(false); -#if 0 - /* A quick check from sanity claus */ - if (PCPU_GET(apic_id) != lapic_id()) { - printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); - printf("SMP: actual apic_id = %d\n", lapic_id()); - printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - panic("cpuid mismatch! boom!!"); - } -#endif - - /* Initialize curthread. */ - KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); - PCPU_SET(curthread, PCPU_GET(idlethread)); - - mtx_lock_spin(&ap_boot_mtx); -#if 0 - - /* Init local apic for irq's */ - lapic_setup(1); -#endif - smp_cpus++; - - cpuid = PCPU_GET(cpuid); - CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); - printf("SMP: AP CPU #%d Launched!\n", cpuid); - - /* Determine if we are a logical CPU. */ - if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) - CPU_SET(cpuid, &logical_cpus_mask); - - /* Determine if we are a hyperthread. */ - if (hyperthreading_cpus > 1 && - PCPU_GET(apic_id) % hyperthreading_cpus != 0) - CPU_SET(cpuid, &hyperthreading_cpus_mask); -#if 0 - if (bootverbose) - lapic_dump("AP"); -#endif - if (smp_cpus == mp_ncpus) { - /* enable IPI's, tlb shootdown, freezes etc */ - atomic_store_rel_int(&smp_started, 1); - } - - mtx_unlock_spin(&ap_boot_mtx); - - /* wait until all the AP's are up */ - while (smp_started == 0) - ia32_pause(); - - PCPU_SET(curthread, PCPU_GET(idlethread)); - - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); - - /* enter the scheduler */ - sched_throw(NULL); - - panic("scheduler returned us to %s", __func__); - /* NOTREACHED */ -} - -/******************************************************************* - * local functions and data - */ - -/* - * We tell the I/O APIC code about all the CPUs we want to receive - * interrupts. If we don't want certain CPUs to receive IRQs we - * can simply not tell the I/O APIC code about them in this function. - * We also do not tell it about the BSP since it tells itself about - * the BSP internally to work with UP kernels and on UP machines. - */ -static void -set_interrupt_apic_ids(void) -{ - u_int i, apic_id; - - for (i = 0; i < MAXCPU; i++) { - apic_id = cpu_apic_ids[i]; - if (apic_id == -1) - continue; - if (cpu_info[apic_id].cpu_bsp) - continue; - if (cpu_info[apic_id].cpu_disabled) - continue; - - /* Don't let hyperthreads service interrupts. */ - if (hyperthreading_cpus > 1 && - apic_id % hyperthreading_cpus != 0) - continue; - - intr_add_cpu(i); - } -} - -/* - * Assign logical CPU IDs to local APICs. - */ -static void -assign_cpu_ids(void) -{ - u_int i; - - /* Check for explicitly disabled CPUs. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 has already been assigned to the BSP, - * so we only have to assign IDs for APs. - */ - mp_ncpus = 1; - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} - -/* - * start each AP in our list - */ -/* Lowest 1MB is already mapped: don't touch*/ -#define TMPMAP_START 1 -int -start_all_aps(void) -{ - int x,apic_id, cpu; - struct pcpu *pc; - - mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); - - /* set up temporary P==V mapping for AP boot */ - /* XXX this is a hack, we should boot the AP on its own stack/PTD */ - - /* start each AP */ - for (cpu = 1; cpu < mp_ncpus; cpu++) { - apic_id = cpu_apic_ids[cpu]; - - - bootAP = cpu; - bootAPgdt = gdt + (512*cpu); - - /* Get per-cpu data */ - pc = &__pcpu[bootAP]; - pcpu_init(pc, bootAP, sizeof(struct pcpu)); - dpcpu_init((void *)kmem_malloc(kernel_arena, DPCPU_SIZE, - M_WAITOK | M_ZERO), bootAP); - pc->pc_apic_id = cpu_apic_ids[bootAP]; - pc->pc_vcpu_id = cpu_apic_ids[bootAP]; - pc->pc_prvspace = pc; - pc->pc_curthread = 0; - - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; - - PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW); - bzero(bootAPgdt, PAGE_SIZE); - for (x = 0; x < NGDT; x++) - ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); - PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); -#ifdef notyet - - if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { - apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); - acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); -#ifdef CONFIG_ACPI - if (acpiid != 0xff) - x86_acpiid_to_apicid[acpiid] = apicid; -#endif - } -#endif - - /* attempt to start the Application Processor */ - if (!start_ap(cpu)) { - printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); - /* better panic as the AP may be running loose */ - printf("panic y/n? [y] "); - if (cngetc() != 'n') - panic("bye-bye"); - } - - CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ - } - - - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); - - /* number of APs actually started */ - return (mp_naps); -} - -extern uint8_t *pcpu_boot_stack; -extern trap_info_t trap_table[]; - -static void -smp_trap_init(trap_info_t *trap_ctxt) -{ - const trap_info_t *t = trap_table; - - for (t = trap_table; t->address; t++) { - trap_ctxt[t->vector].flags = t->flags; - trap_ctxt[t->vector].cs = t->cs; - trap_ctxt[t->vector].address = t->address; - } -} - -extern struct rwlock pvh_global_lock; -extern int nkpt; -static void -cpu_initialize_context(unsigned int cpu) -{ - /* vcpu_guest_context_t is too large to allocate on the stack. - * Hence we allocate statically and protect it with a lock */ - vm_page_t m[NPGPTD + 2]; - static vcpu_guest_context_t ctxt; - vm_offset_t boot_stack; - vm_offset_t newPTD; - vm_paddr_t ma[NPGPTD]; - int i; - - /* - * Page 0,[0-3] PTD - * Page 1, [4] boot stack - * Page [5] PDPT - * - */ - for (i = 0; i < NPGPTD + 2; i++) { - m[i] = vm_page_alloc(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - - pmap_zero_page(m[i]); - - } - boot_stack = kva_alloc(PAGE_SIZE); - newPTD = kva_alloc(NPGPTD * PAGE_SIZE); - ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V; - -#ifdef PAE - pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); - for (i = 0; i < NPGPTD; i++) { - ((vm_paddr_t *)boot_stack)[i] = - ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V; - } -#endif - - /* - * Copy cpu0 IdlePTD to new IdlePTD - copying only - * kernel mappings - */ - pmap_qenter(newPTD, m, 4); - - memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), - (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), - nkpt*sizeof(vm_paddr_t)); - - pmap_qremove(newPTD, 4); - kva_free(newPTD, 4 * PAGE_SIZE); - /* - * map actual idle stack to boot_stack - */ - pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); - - - xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1])); - rw_wlock(&pvh_global_lock); - for (i = 0; i < 4; i++) { - int pdir = (PTDPTDI + i) / NPDEPG; - int curoffset = (PTDPTDI + i) % NPDEPG; - - xen_queue_pt_update((vm_paddr_t) - ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), - ma[i]); - } - PT_UPDATES_FLUSH(); - rw_wunlock(&pvh_global_lock); - - memset(&ctxt, 0, sizeof(ctxt)); - ctxt.flags = VGCF_IN_KERNEL; - ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); - ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.eip = (unsigned long)init_secondary; - ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ - - memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); - - smp_trap_init(ctxt.trap_ctxt); - - ctxt.ldt_ents = 0; - ctxt.gdt_frames[0] = - (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); - ctxt.gdt_ents = 512; - -#ifdef __i386__ - ctxt.user_regs.esp = boot_stack + PAGE_SIZE; - - ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.kernel_sp = boot_stack + PAGE_SIZE; - - ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; - ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; - - ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]); -#else /* __x86_64__ */ - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); - ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.kernel_sp = idle->thread.rsp0; - - ctxt.event_callback_eip = (unsigned long)hypervisor_callback; - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; - ctxt.syscall_callback_eip = (unsigned long)system_call; - - ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); - - ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); -#endif - - printf("gdtpfn=%lx pdptpfn=%lx\n", - ctxt.gdt_frames[0], - ctxt.ctrlreg[3] >> PAGE_SHIFT); - - PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); - DELAY(3000); - PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); -} - -/* - * This function starts the AP (application processor) identified - * by the APIC ID 'physicalCpu'. It does quite a "song and dance" - * to accomplish this. This is necessary because of the nuances - * of the different hardware we might encounter. It isn't pretty, - * but it seems to work. - */ - -int cpus; -static int -start_ap(int apic_id) -{ - int ms; - - /* used as a watchpoint to signal AP startup */ - cpus = mp_naps; - - cpu_initialize_context(apic_id); - - /* Wait up to 5 seconds for it to start. */ - for (ms = 0; ms < 5000; ms++) { - if (mp_naps > cpus) - return (1); /* return SUCCESS */ - DELAY(1000); - } - return (0); /* return FAILURE */ -} - -static void -ipi_pcpu(int cpu, u_int ipi) -{ - KASSERT((ipi <= nitems(xen_ipis)), ("invalid IPI")); - xen_intr_signal(DPCPU_ID_GET(cpu, ipi_handle[ipi])); -} - -/* - * send an IPI to a specific CPU. - */ -static void -ipi_send_cpu(int cpu, u_int ipi) -{ - u_int bitmap, old_pending, new_pending; - - if (IPI_IS_BITMAPED(ipi)) { - bitmap = 1 << ipi; - ipi = IPI_BITMAP_VECTOR; - do { - old_pending = cpu_ipi_pending[cpu]; - new_pending = old_pending | bitmap; - } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], - old_pending, new_pending)); - if (!old_pending) - ipi_pcpu(cpu, RESCHEDULE_VECTOR); - } else { - KASSERT(call_data != NULL, ("call_data not set")); - ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); - } -} - -/* - * Flush the TLB on all other CPU's - */ -static void -smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) -{ - u_int ncpu; - struct _call_data data; - - ncpu = mp_ncpus - 1; /* does not shootdown self */ - if (ncpu < 1) - return; /* no other cpus */ - if (!(read_eflags() & PSL_I)) - panic("%s: interrupts disabled", __func__); - mtx_lock_spin(&smp_ipi_mtx); - KASSERT(call_data == NULL, ("call_data isn't null?!")); - call_data = &data; - call_data->func_id = vector; - call_data->arg1 = addr1; - call_data->arg2 = addr2; - atomic_store_rel_int(&smp_tlb_wait, 0); - ipi_all_but_self(vector); - while (smp_tlb_wait < ncpu) - ia32_pause(); - call_data = NULL; - mtx_unlock_spin(&smp_ipi_mtx); -} - -static void -smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, - vm_offset_t addr2) -{ - int cpu, ncpu, othercpus; - struct _call_data data; - - othercpus = mp_ncpus - 1; - if (CPU_ISFULLSET(&mask)) { - if (othercpus < 1) - return; - } else { - CPU_CLR(PCPU_GET(cpuid), &mask); - if (CPU_EMPTY(&mask)) - return; - } - if (!(read_eflags() & PSL_I)) - panic("%s: interrupts disabled", __func__); - mtx_lock_spin(&smp_ipi_mtx); - KASSERT(call_data == NULL, ("call_data isn't null?!")); - call_data = &data; - call_data->func_id = vector; - call_data->arg1 = addr1; - call_data->arg2 = addr2; - atomic_store_rel_int(&smp_tlb_wait, 0); - if (CPU_ISFULLSET(&mask)) { - ncpu = othercpus; - ipi_all_but_self(vector); - } else { - ncpu = 0; - while ((cpu = CPU_FFS(&mask)) != 0) { - cpu--; - CPU_CLR(cpu, &mask); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, - vector); - ipi_send_cpu(cpu, vector); - ncpu++; - } - } - while (smp_tlb_wait < ncpu) - ia32_pause(); - call_data = NULL; - mtx_unlock_spin(&smp_ipi_mtx); -} - -void -smp_cache_flush(void) -{ - - if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); -} - -void -smp_invltlb(void) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLTLB, 0, 0); - } -} - -void -smp_invlpg(vm_offset_t addr) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLPG, addr, 0); - } -} - -void -smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); - } -} - -void -smp_masked_invltlb(cpuset_t mask) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); - } -} - -void -smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); - } -} - -void -smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); - } -} - -/* - * send an IPI to a set of cpus. - */ -void -ipi_selected(cpuset_t cpus, u_int ipi) -{ - int cpu; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); - - while ((cpu = CPU_FFS(&cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &cpus); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); - } -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_cpu(int cpu, u_int ipi) -{ - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); - - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); -} - -/* - * send an IPI to all CPUs EXCEPT myself - */ -void -ipi_all_but_self(u_int ipi) -{ - cpuset_t other_cpus; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); - - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - ipi_selected(other_cpus, ipi); -} - -int -ipi_nmi_handler() -{ - u_int cpuid; - - /* - * As long as there is not a simple way to know about a NMI's - * source, if the bitmask for the current CPU is present in - * the global pending bitword an IPI_STOP_HARD has been issued - * and should be handled. - */ - cpuid = PCPU_GET(cpuid); - if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) - return (1); - - CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); - cpustop_handler(); - return (0); -} - -/* - * Handle an IPI_STOP by saving our current context and spinning until we - * are resumed. - */ -void -cpustop_handler(void) -{ - int cpu; - - cpu = PCPU_GET(cpuid); - - savectx(&stoppcbs[cpu]); - - /* Indicate that we are stopped */ - CPU_SET_ATOMIC(cpu, &stopped_cpus); - - /* Wait for restart */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - CPU_CLR_ATOMIC(cpu, &stopped_cpus); - - if (cpu == 0 && cpustop_restartfunc != NULL) { - cpustop_restartfunc(); - cpustop_restartfunc = NULL; - } -} - -/* - * Handlers for TLB related IPIs - * - * On i386 Xen PV this are no-ops since this port doesn't support SMP. - */ -void -invltlb_handler(void) -{ -} - -void -invlpg_handler(void) -{ -} - -void -invlrng_handler(void) -{ -} - -void -invlcache_handler(void) -{ -} - -/* - * This is called once the rest of the system is up and running and we're - * ready to let the AP's out of the pen. - */ -static void -release_aps(void *dummy __unused) -{ - - if (mp_ncpus == 1) - return; - atomic_store_rel_int(&aps_ready, 1); - while (smp_started == 0) - ia32_pause(); -} -SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); -SYSINIT(start_ipis, SI_SUB_SMP, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); -SYSINIT(start_cpu, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_setup_cpus, NULL); diff --git a/sys/i386/xen/mptable.c b/sys/i386/xen/mptable.c deleted file mode 100644 index 81d7c1b..0000000 --- a/sys/i386/xen/mptable.c +++ /dev/null @@ -1,109 +0,0 @@ -/*- - * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> - * Copyright (c) 1996, by Steve Passe - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the developer may NOT be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/bus.h> -#include <sys/kernel.h> - -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/pmap.h> - -#include <machine/frame.h> -#include <machine/intr_machdep.h> -#include <x86/apicvar.h> - -#include <xen/hypervisor.h> -#include <xen/xen-os.h> -#include <machine/smp.h> -#include <xen/interface/vcpu.h> - - -static int mptable_probe(void); -static int mptable_probe_cpus(void); -static void mptable_register(void *dummy); -static int mptable_setup_local(void); -static int mptable_setup_io(void); - -static struct apic_enumerator mptable_enumerator = { - "MPTable", - mptable_probe, - mptable_probe_cpus, - mptable_setup_local, - mptable_setup_io -}; - -static int -mptable_probe(void) -{ - - return (-100); -} - -static int -mptable_probe_cpus(void) -{ - int i, rc; - - for (i = 0; i < MAXCPU; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) - cpu_add(i, (i == 0)); - } - - return (0); -} - -/* - * Initialize the local APIC on the BSP. - */ -static int -mptable_setup_local(void) -{ - - PCPU_SET(apic_id, 0); - PCPU_SET(vcpu_id, 0); - return (0); -} - -static int -mptable_setup_io(void) -{ - - return (0); -} - -static void -mptable_register(void *dummy __unused) -{ - - apic_register_enumerator(&mptable_enumerator); -} -SYSINIT(mptable_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, mptable_register, - NULL); diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c deleted file mode 100644 index 757fc36..0000000 --- a/sys/i386/xen/pmap.c +++ /dev/null @@ -1,4420 +0,0 @@ -/*- - * Copyright (c) 1991 Regents of the University of California. - * All rights reserved. - * Copyright (c) 1994 John S. Dyson - * All rights reserved. - * Copyright (c) 1994 David Greenman - * All rights reserved. - * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * the Systems Programming Group of the University of Utah Computer - * Science Department and William Jolitz of UUNET Technologies Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - */ -/*- - * Copyright (c) 2003 Networks Associates Technology, Inc. - * All rights reserved. - * - * This software was developed for the FreeBSD Project by Jake Burkholder, - * Safeport Network Services, and Network Associates Laboratories, the - * Security Research Division of Network Associates, Inc. under - * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA - * CHATS research program. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Manages physical address maps. - * - * Since the information managed by this module is - * also stored by the logical address mapping module, - * this module may throw away valid virtual-to-physical - * mappings at almost any time. However, invalidations - * of virtual-to-physical mappings must be done as - * requested. - * - * In order to cope with hardware architectures which - * make virtual-to-physical map invalidates expensive, - * this module may delay invalidate or reduced protection - * operations until such time as they are actually - * necessary. This module is given full information as - * to which processors are currently using which maps, - * and to when physical maps must be made correct. - */ - -#include "opt_cpu.h" -#include "opt_pmap.h" -#include "opt_smp.h" -#include "opt_xbox.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/ktr.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/mman.h> -#include <sys/msgbuf.h> -#include <sys/mutex.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/sf_buf.h> -#include <sys/sx.h> -#include <sys/vmmeter.h> -#include <sys/sched.h> -#include <sys/sysctl.h> -#ifdef SMP -#include <sys/smp.h> -#else -#include <sys/cpuset.h> -#endif - -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/vm_kern.h> -#include <vm/vm_page.h> -#include <vm/vm_map.h> -#include <vm/vm_object.h> -#include <vm/vm_extern.h> -#include <vm/vm_pageout.h> -#include <vm/vm_pager.h> -#include <vm/uma.h> - -#include <machine/cpu.h> -#include <machine/cputypes.h> -#include <machine/md_var.h> -#include <machine/pcb.h> -#include <machine/specialreg.h> -#ifdef SMP -#include <machine/smp.h> -#endif - -#ifdef XBOX -#include <machine/xbox.h> -#endif - -#include <xen/interface/xen.h> -#include <xen/hypervisor.h> -#include <machine/xen/hypercall.h> -#include <machine/xen/xenvar.h> -#include <machine/xen/xenfunc.h> - -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - -#ifndef PMAP_SHPGPERPROC -#define PMAP_SHPGPERPROC 200 -#endif - -#define DIAGNOSTIC - -#if !defined(DIAGNOSTIC) -#ifdef __GNUC_GNU_INLINE__ -#define PMAP_INLINE __attribute__((__gnu_inline__)) inline -#else -#define PMAP_INLINE extern inline -#endif -#else -#define PMAP_INLINE -#endif - -#ifdef PV_STATS -#define PV_STAT(x) do { x ; } while (0) -#else -#define PV_STAT(x) do { } while (0) -#endif - -/* - * Get PDEs and PTEs for user/kernel address space - */ -#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) -#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) - -#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) -#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) -#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) -#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) -#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) - -#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) - -#define HAMFISTED_LOCKING -#ifdef HAMFISTED_LOCKING -static struct mtx createdelete_lock; -#endif - -struct pmap kernel_pmap_store; -LIST_HEAD(pmaplist, pmap); -static struct pmaplist allpmaps; -static struct mtx allpmaps_lock; - -vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ -vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ -int pgeflag = 0; /* PG_G or-in */ -int pseflag = 0; /* PG_PS or-in */ - -int nkpt; -vm_offset_t kernel_vm_end; -extern u_int32_t KERNend; - -#ifdef PAE -pt_entry_t pg_nx; -#endif - -static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); - -static int pat_works; /* Is page attribute table sane? */ - -/* - * This lock is defined as static in other pmap implementations. It cannot, - * however, be defined as static here, because it is (ab)used to serialize - * queued page table changes in other sources files. - */ -struct rwlock pvh_global_lock; - -/* - * Data for the pv entry allocation mechanism - */ -static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); -static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; -static int shpgperproc = PMAP_SHPGPERPROC; - -struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ -int pv_maxchunks; /* How many chunks we have KVA for */ -vm_offset_t pv_vafree; /* freelist stored in the PTE */ - -/* - * All those kernel PT submaps that BSD is so fond of - */ -struct sysmaps { - struct mtx lock; - pt_entry_t *CMAP1; - pt_entry_t *CMAP2; - caddr_t CADDR1; - caddr_t CADDR2; -}; -static struct sysmaps sysmaps_pcpu[MAXCPU]; -pt_entry_t *CMAP3; -caddr_t ptvmmap = 0; -caddr_t CADDR3; -struct msgbuf *msgbufp = 0; - -/* - * Crashdump maps. - */ -static caddr_t crashdumpmap; - -static pt_entry_t *PMAP1 = 0, *PMAP2; -static pt_entry_t *PADDR1 = 0, *PADDR2; -#ifdef SMP -static int PMAP1cpu; -static int PMAP1changedcpu; -SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, - &PMAP1changedcpu, 0, - "Number of times pmap_pte_quick changed CPU with same PMAP1"); -#endif -static int PMAP1changed; -SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, - &PMAP1changed, 0, - "Number of times pmap_pte_quick changed PMAP1"); -static int PMAP1unchanged; -SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, - &PMAP1unchanged, 0, - "Number of times pmap_pte_quick didn't change PMAP1"); -static struct mtx PMAP2mutex; - -static void free_pv_chunk(struct pv_chunk *pc); -static void free_pv_entry(pmap_t pmap, pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); -static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); -static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, - vm_offset_t va); - -static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va, - vm_page_t m, vm_prot_t prot, vm_page_t mpte); -static void pmap_flush_page(vm_page_t m); -static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); -static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, - vm_page_t *free); -static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, - vm_page_t *free); -static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, - vm_offset_t va); -static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, - vm_page_t m); - -static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); - -static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); -static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free); -static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); -static void pmap_pte_release(pt_entry_t *pte); -static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); -static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr); - -static __inline void pagezero(void *page); - -CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); -CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); - -/* - * If you get an error here, then you set KVA_PAGES wrong! See the - * description of KVA_PAGES in sys/i386/include/pmap.h. It must be - * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. - */ -CTASSERT(KERNBASE % (1 << 24) == 0); - -void -pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) -{ - vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); - - switch (type) { - case SH_PD_SET_VA: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, - xpmap_ptom(val & ~(PG_RW))); -#endif - xen_queue_pt_update(pdir_ma, - xpmap_ptom(val)); - break; - case SH_PD_SET_VA_MA: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, - val & ~(PG_RW)); -#endif - xen_queue_pt_update(pdir_ma, val); - break; - case SH_PD_SET_VA_CLEAR: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, 0); -#endif - xen_queue_pt_update(pdir_ma, 0); - break; - } -} - -/* - * Bootstrap the system enough to run with virtual memory. - * - * On the i386 this is called after mapping has already been enabled - * and just syncs the pmap module with what has already been done. - * [We can't call it easily with mapping off since the kernel is not - * mapped with PA == VA, hence we would have to relocate every address - * from the linked base (virtual) address "KERNBASE" to the actual - * (physical) address starting relative to 0] - */ -void -pmap_bootstrap(vm_paddr_t firstaddr) -{ - vm_offset_t va; - pt_entry_t *pte, *unused; - struct sysmaps *sysmaps; - int i; - - /* - * Initialize the first available kernel virtual address. However, - * using "firstaddr" may waste a few pages of the kernel virtual - * address space, because locore may not have mapped every physical - * page that it allocated. Preferably, locore would provide a first - * unused virtual address in addition to "firstaddr". - */ - virtual_avail = (vm_offset_t) KERNBASE + firstaddr; - - virtual_end = VM_MAX_KERNEL_ADDRESS; - - /* - * Initialize the kernel pmap (which is statically allocated). - */ - PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); -#ifdef PAE - kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); -#endif - CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ - TAILQ_INIT(&kernel_pmap->pm_pvchunk); - - /* - * Initialize the global pv list lock. - */ - rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE); - - LIST_INIT(&allpmaps); - mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - if (nkpt == 0) - nkpt = NKPT; - - /* - * Reserve some special page table entries/VA space for temporary - * mapping of pages. - */ -#define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); - - va = virtual_avail; - pte = vtopte(va); - - /* - * CMAP1/CMAP2 are used for zeroing and copying pages. - * CMAP3 is used for the idle process page zeroing. - */ - for (i = 0; i < MAXCPU; i++) { - sysmaps = &sysmaps_pcpu[i]; - mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); - SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) - SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - } - SYSMAP(caddr_t, CMAP3, CADDR3, 1) - PT_SET_MA(CADDR3, 0); - - /* - * Crashdump maps. - */ - SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) - - /* - * ptvmmap is used for reading arbitrary physical pages via /dev/mem. - */ - SYSMAP(caddr_t, unused, ptvmmap, 1) - - /* - * msgbufp is used to map the system message buffer. - */ - SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) - - /* - * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), - * respectively. - */ - SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) - SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) - - mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); - - virtual_avail = va; - - /* - * Leave in place an identity mapping (virt == phys) for the low 1 MB - * physical memory region that is used by the ACPI wakeup code. This - * mapping must not have PG_G set. - */ -#ifndef XEN - /* - * leave here deliberately to show that this is not supported - */ -#ifdef XBOX - /* FIXME: This is gross, but needed for the XBOX. Since we are in such - * an early stadium, we cannot yet neatly map video memory ... :-( - * Better fixes are very welcome! */ - if (!arch_i386_is_xbox) -#endif - for (i = 1; i < NKPT; i++) - PTD[i] = 0; - - /* Initialize the PAT MSR if present. */ - pmap_init_pat(); - - /* Turn on PG_G on kernel page(s) */ - pmap_set_pg(); -#endif - -#ifdef HAMFISTED_LOCKING - mtx_init(&createdelete_lock, "pmap create/delete", NULL, MTX_DEF); -#endif -} - -/* - * Setup the PAT MSR. - */ -void -pmap_init_pat(void) -{ - uint64_t pat_msr; - - /* Bail if this CPU doesn't implement PAT. */ - if (!(cpu_feature & CPUID_PAT)) - return; - - if (cpu_vendor_id != CPU_VENDOR_INTEL || - (CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) { - /* - * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. - * Program 4 and 5 as WP and WC. - * Leave 6 and 7 as UC and UC-. - */ - pat_msr = rdmsr(MSR_PAT); - pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); - pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | - PAT_VALUE(5, PAT_WRITE_COMBINING); - pat_works = 1; - } else { - /* - * Due to some Intel errata, we can only safely use the lower 4 - * PAT entries. Thus, just replace PAT Index 2 with WC instead - * of UC-. - * - * Intel Pentium III Processor Specification Update - * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B - * or Mode C Paging) - * - * Intel Pentium IV Processor Specification Update - * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) - */ - pat_msr = rdmsr(MSR_PAT); - pat_msr &= ~PAT_MASK(2); - pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); - pat_works = 0; - } - wrmsr(MSR_PAT, pat_msr); -} - -/* - * Initialize a vm_page's machine-dependent fields. - */ -void -pmap_page_init(vm_page_t m) -{ - - TAILQ_INIT(&m->md.pv_list); - m->md.pat_mode = PAT_WRITE_BACK; -} - -/* - * ABuse the pte nodes for unmapped kva to thread a kva freelist through. - * Requirements: - * - Must deal with pages in order to ensure that none of the PG_* bits - * are ever set, PG_V in particular. - * - Assumes we can write to ptes without pte_store() atomic ops, even - * on PAE systems. This should be ok. - * - Assumes nothing will ever test these addresses for 0 to indicate - * no mapping instead of correctly checking PG_V. - * - Assumes a vm_offset_t will fit in a pte (true for i386). - * Because PG_V is never set, there can be no mappings to invalidate. - */ -static int ptelist_count = 0; -static vm_offset_t -pmap_ptelist_alloc(vm_offset_t *head) -{ - vm_offset_t va; - vm_offset_t *phead = (vm_offset_t *)*head; - - if (ptelist_count == 0) { - printf("out of memory!!!!!!\n"); - return (0); /* Out of memory */ - } - ptelist_count--; - va = phead[ptelist_count]; - return (va); -} - -static void -pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) -{ - vm_offset_t *phead = (vm_offset_t *)*head; - - phead[ptelist_count++] = va; -} - -static void -pmap_ptelist_init(vm_offset_t *head, void *base, int npages) -{ - int i, nstackpages; - vm_offset_t va; - vm_page_t m; - - nstackpages = (npages + PAGE_SIZE/sizeof(vm_offset_t) - 1)/ (PAGE_SIZE/sizeof(vm_offset_t)); - for (i = 0; i < nstackpages; i++) { - va = (vm_offset_t)base + i * PAGE_SIZE; - m = vm_page_alloc(NULL, i, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - pmap_qenter(va, &m, 1); - } - - *head = (vm_offset_t)base; - for (i = npages - 1; i >= nstackpages; i--) { - va = (vm_offset_t)base + i * PAGE_SIZE; - pmap_ptelist_free(head, va); - } -} - - -/* - * Initialize the pmap module. - * Called by vm_init, to initialize any structures that the pmap - * system needs to map virtual memory. - */ -void -pmap_init(void) -{ - - /* - * Initialize the address space (zone) for the pv entries. Set a - * high water mark so that the system can recover from excessive - * numbers of pv entries. - */ - TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; - TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); - pv_entry_max = roundup(pv_entry_max, _NPCPV); - pv_entry_high_water = 9 * (pv_entry_max / 10); - - pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); - pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); - if (pv_chunkbase == NULL) - panic("pmap_init: not enough kvm for pv chunks"); - pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); -} - - -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, - "Max number of PV entries"); -SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, - "Page share factor per proc"); - -static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, - "2/4MB page mapping counters"); - -static u_long pmap_pde_mappings; -SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, - &pmap_pde_mappings, 0, "2/4MB page mappings"); - -/*************************************************** - * Low level helper routines..... - ***************************************************/ - -/* - * Determine the appropriate bits to set in a PTE or PDE for a specified - * caching mode. - */ -int -pmap_cache_bits(int mode, boolean_t is_pde) -{ - int pat_flag, pat_index, cache_bits; - - /* The PAT bit is different for PTE's and PDE's. */ - pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; - - /* If we don't support PAT, map extended modes to older ones. */ - if (!(cpu_feature & CPUID_PAT)) { - switch (mode) { - case PAT_UNCACHEABLE: - case PAT_WRITE_THROUGH: - case PAT_WRITE_BACK: - break; - case PAT_UNCACHED: - case PAT_WRITE_COMBINING: - case PAT_WRITE_PROTECTED: - mode = PAT_UNCACHEABLE; - break; - } - } - - /* Map the caching mode to a PAT index. */ - if (pat_works) { - switch (mode) { - case PAT_UNCACHEABLE: - pat_index = 3; - break; - case PAT_WRITE_THROUGH: - pat_index = 1; - break; - case PAT_WRITE_BACK: - pat_index = 0; - break; - case PAT_UNCACHED: - pat_index = 2; - break; - case PAT_WRITE_COMBINING: - pat_index = 5; - break; - case PAT_WRITE_PROTECTED: - pat_index = 4; - break; - default: - panic("Unknown caching mode %d\n", mode); - } - } else { - switch (mode) { - case PAT_UNCACHED: - case PAT_UNCACHEABLE: - case PAT_WRITE_PROTECTED: - pat_index = 3; - break; - case PAT_WRITE_THROUGH: - pat_index = 1; - break; - case PAT_WRITE_BACK: - pat_index = 0; - break; - case PAT_WRITE_COMBINING: - pat_index = 2; - break; - default: - panic("Unknown caching mode %d\n", mode); - } - } - - /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ - cache_bits = 0; - if (pat_index & 0x4) - cache_bits |= pat_flag; - if (pat_index & 0x2) - cache_bits |= PG_NC_PCD; - if (pat_index & 0x1) - cache_bits |= PG_NC_PWT; - return (cache_bits); -} -#ifdef SMP -/* - * For SMP, these functions have to use the IPI mechanism for coherence. - * - * N.B.: Before calling any of the following TLB invalidation functions, - * the calling processor must ensure that all stores updating a non- - * kernel page table are globally performed. Otherwise, another - * processor could cache an old, pre-update entry without being - * invalidated. This can happen one of two ways: (1) The pmap becomes - * active on another processor after its pm_active field is checked by - * one of the following functions but before a store updating the page - * table is globally performed. (2) The pmap becomes active on another - * processor before its pm_active field is checked but due to - * speculative loads one of the following functions stills reads the - * pmap as inactive on the other processor. - * - * The kernel page table is exempt because its pm_active field is - * immutable. The kernel page table is always active on every - * processor. - */ -void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) -{ - cpuset_t other_cpus; - u_int cpuid; - - CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", - pmap, va); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invlpg(va); - smp_invlpg(va); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invlpg(va); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg(other_cpus, va); - } - sched_unpin(); - PT_UPDATES_FLUSH(); -} - -void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - cpuset_t other_cpus; - vm_offset_t addr; - u_int cpuid; - - CTR3(KTR_PMAP, "pmap_invalidate_page: pmap=%p eva=0x%x sva=0x%x", - pmap, sva, eva); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - smp_invlpg_range(sva, eva); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg_range(other_cpus, sva, eva); - } - sched_unpin(); - PT_UPDATES_FLUSH(); -} - -void -pmap_invalidate_all(pmap_t pmap) -{ - cpuset_t other_cpus; - u_int cpuid; - - CTR1(KTR_PMAP, "pmap_invalidate_page: pmap=%p", pmap); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invltlb(); - smp_invltlb(); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invltlb(); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invltlb(other_cpus); - } - sched_unpin(); -} - -void -pmap_invalidate_cache(void) -{ - - sched_pin(); - wbinvd(); - smp_cache_flush(); - sched_unpin(); -} -#else /* !SMP */ -/* - * Normal, non-SMP, 486+ invalidation functions. - * We inline these within pmap.c for speed. - */ -PMAP_INLINE void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) -{ - CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", - pmap, va); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invlpg(va); - PT_UPDATES_FLUSH(); -} - -PMAP_INLINE void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t addr; - - if (eva - sva > PAGE_SIZE) - CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", - pmap, sva, eva); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - PT_UPDATES_FLUSH(); -} - -PMAP_INLINE void -pmap_invalidate_all(pmap_t pmap) -{ - - CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invltlb(); -} - -PMAP_INLINE void -pmap_invalidate_cache(void) -{ - - wbinvd(); -} -#endif /* !SMP */ - -#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) - -void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) -{ - - if (force) { - sva &= ~(vm_offset_t)cpu_clflush_line_size; - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); - } - - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported, do nothing. */ - else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { - - /* - * Otherwise, do per-cache line flush. Use the mfence - * instruction to insure that previous stores are - * included in the write-back. The processor - * propagates flush to other processors in the cache - * coherence domain. - */ - mfence(); - for (; sva < eva; sva += cpu_clflush_line_size) - clflush(sva); - mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); - } -} - -void -pmap_invalidate_cache_pages(vm_page_t *pages, int count) -{ - int i; - - if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || - (cpu_feature & CPUID_CLFSH) == 0) { - pmap_invalidate_cache(); - } else { - for (i = 0; i < count; i++) - pmap_flush_page(pages[i]); - } -} - -/* - * Are we current address space or kernel? N.B. We return FALSE when - * a pmap's page table is in use because a kernel thread is borrowing - * it. The borrowed page table can change spontaneously, making any - * dependence on its continued use subject to a race condition. - */ -static __inline int -pmap_is_current(pmap_t pmap) -{ - - return (pmap == kernel_pmap || - (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && - (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); -} - -/* - * If the given pmap is not the current or kernel pmap, the returned pte must - * be released by passing it to pmap_pte_release(). - */ -pt_entry_t * -pmap_pte(pmap_t pmap, vm_offset_t va) -{ - pd_entry_t newpf; - pd_entry_t *pde; - - pde = pmap_pde(pmap, va); - if (*pde & PG_PS) - return (pde); - if (*pde != 0) { - /* are we current address space or kernel? */ - if (pmap_is_current(pmap)) - return (vtopte(va)); - mtx_lock(&PMAP2mutex); - newpf = *pde & PG_FRAME; - if ((*PMAP2 & PG_FRAME) != newpf) { - PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); - CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", - pmap, va, (*PMAP2 & 0xffffffff)); - } - return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); - } - return (NULL); -} - -/* - * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte - * being NULL. - */ -static __inline void -pmap_pte_release(pt_entry_t *pte) -{ - - if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { - CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", - *PMAP2); - rw_wlock(&pvh_global_lock); - PT_SET_VA(PMAP2, 0, TRUE); - rw_wunlock(&pvh_global_lock); - mtx_unlock(&PMAP2mutex); - } -} - -static __inline void -invlcaddr(void *caddr) -{ - - invlpg((u_int)caddr); - PT_UPDATES_FLUSH(); -} - -/* - * Super fast pmap_pte routine best used when scanning - * the pv lists. This eliminates many coarse-grained - * invltlb calls. Note that many of the pv list - * scans are across different pmaps. It is very wasteful - * to do an entire invltlb for checking a single mapping. - * - * If the given pmap is not the current pmap, pvh_global_lock - * must be held and curthread pinned to a CPU. - */ -static pt_entry_t * -pmap_pte_quick(pmap_t pmap, vm_offset_t va) -{ - pd_entry_t newpf; - pd_entry_t *pde; - - pde = pmap_pde(pmap, va); - if (*pde & PG_PS) - return (pde); - if (*pde != 0) { - /* are we current address space or kernel? */ - if (pmap_is_current(pmap)) - return (vtopte(va)); - rw_assert(&pvh_global_lock, RA_WLOCKED); - KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); - newpf = *pde & PG_FRAME; - if ((*PMAP1 & PG_FRAME) != newpf) { - PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M); - CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x", - pmap, va, (u_long)*PMAP1); - -#ifdef SMP - PMAP1cpu = PCPU_GET(cpuid); -#endif - PMAP1changed++; - } else -#ifdef SMP - if (PMAP1cpu != PCPU_GET(cpuid)) { - PMAP1cpu = PCPU_GET(cpuid); - invlcaddr(PADDR1); - PMAP1changedcpu++; - } else -#endif - PMAP1unchanged++; - return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); - } - return (0); -} - -/* - * Routine: pmap_extract - * Function: - * Extract the physical page address associated - * with the given map/virtual_address pair. - */ -vm_paddr_t -pmap_extract(pmap_t pmap, vm_offset_t va) -{ - vm_paddr_t rtval; - pt_entry_t *pte; - pd_entry_t pde; - pt_entry_t pteval; - - rtval = 0; - PMAP_LOCK(pmap); - pde = pmap->pm_pdir[va >> PDRSHIFT]; - if (pde != 0) { - if ((pde & PG_PS) != 0) { - rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK); - PMAP_UNLOCK(pmap); - return rtval; - } - pte = pmap_pte(pmap, va); - pteval = *pte ? xpmap_mtop(*pte) : 0; - rtval = (pteval & PG_FRAME) | (va & PAGE_MASK); - pmap_pte_release(pte); - } - PMAP_UNLOCK(pmap); - return (rtval); -} - -/* - * Routine: pmap_extract_ma - * Function: - * Like pmap_extract, but returns machine address - */ -vm_paddr_t -pmap_extract_ma(pmap_t pmap, vm_offset_t va) -{ - vm_paddr_t rtval; - pt_entry_t *pte; - pd_entry_t pde; - - rtval = 0; - PMAP_LOCK(pmap); - pde = pmap->pm_pdir[va >> PDRSHIFT]; - if (pde != 0) { - if ((pde & PG_PS) != 0) { - rtval = (pde & ~PDRMASK) | (va & PDRMASK); - PMAP_UNLOCK(pmap); - return rtval; - } - pte = pmap_pte(pmap, va); - rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); - pmap_pte_release(pte); - } - PMAP_UNLOCK(pmap); - return (rtval); -} - -/* - * Routine: pmap_extract_and_hold - * Function: - * Atomically extract and hold the physical page - * with the given pmap and virtual address pair - * if that mapping permits the given protection. - */ -vm_page_t -pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) -{ - pd_entry_t pde; - pt_entry_t pte, *ptep; - vm_page_t m; - vm_paddr_t pa; - - pa = 0; - m = NULL; - PMAP_LOCK(pmap); -retry: - pde = PT_GET(pmap_pde(pmap, va)); - if (pde != 0) { - if (pde & PG_PS) { - if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { - if (vm_page_pa_tryrelock(pmap, (pde & - PG_PS_FRAME) | (va & PDRMASK), &pa)) - goto retry; - m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | - (va & PDRMASK)); - vm_page_hold(m); - } - } else { - ptep = pmap_pte(pmap, va); - pte = PT_GET(ptep); - pmap_pte_release(ptep); - if (pte != 0 && - ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { - if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, - &pa)) - goto retry; - m = PHYS_TO_VM_PAGE(pte & PG_FRAME); - vm_page_hold(m); - } - } - } - PA_UNLOCK_COND(pa); - PMAP_UNLOCK(pmap); - return (m); -} - -/*************************************************** - * Low level mapping routines..... - ***************************************************/ - -/* - * Add a wired page to the kva. - * Note: not SMP coherent. - * - * This function may be used before pmap_bootstrap() is called. - */ -void -pmap_kenter(vm_offset_t va, vm_paddr_t pa) -{ - - PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag); -} - -void -pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma) -{ - pt_entry_t *pte; - - pte = vtopte(va); - pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag); -} - -static __inline void -pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) -{ - - PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); -} - -/* - * Remove a page from the kernel pagetables. - * Note: not SMP coherent. - * - * This function may be used before pmap_bootstrap() is called. - */ -PMAP_INLINE void -pmap_kremove(vm_offset_t va) -{ - pt_entry_t *pte; - - pte = vtopte(va); - PT_CLEAR_VA(pte, FALSE); -} - -/* - * Used to map a range of physical addresses into kernel - * virtual address space. - * - * The value passed in '*virt' is a suggested virtual address for - * the mapping. Architectures which can support a direct-mapped - * physical to virtual region can return the appropriate address - * within that region, leaving '*virt' unchanged. Other - * architectures should map the pages starting at '*virt' and - * update '*virt' with the first usable address after the mapped - * region. - */ -vm_offset_t -pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) -{ - vm_offset_t va, sva; - - va = sva = *virt; - CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x", - va, start, end, prot); - while (start < end) { - pmap_kenter(va, start); - va += PAGE_SIZE; - start += PAGE_SIZE; - } - pmap_invalidate_range(kernel_pmap, sva, va); - *virt = va; - return (sva); -} - - -/* - * Add a list of wired pages to the kva - * this routine is only used for temporary - * kernel mappings that do not need to have - * page modification or references recorded. - * Note that old mappings are simply written - * over. The page *must* be wired. - * Note: SMP coherent. Uses a ranged shootdown IPI. - */ -void -pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) -{ - pt_entry_t *endpte, *pte; - vm_paddr_t pa; - vm_offset_t va = sva; - int mclcount = 0; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - int error; - - CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count); - pte = vtopte(sva); - endpte = pte + count; - while (pte < endpte) { - pa = VM_PAGE_TO_MACH(*ma) | pgeflag | PG_RW | PG_V | PG_M | PG_A; - - mclp->op = __HYPERVISOR_update_va_mapping; - mclp->args[0] = va; - mclp->args[1] = (uint32_t)(pa & 0xffffffff); - mclp->args[2] = (uint32_t)(pa >> 32); - mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0; - - va += PAGE_SIZE; - pte++; - ma++; - mclp++; - mclcount++; - if (mclcount == 16) { - error = HYPERVISOR_multicall(mcl, mclcount); - mclp = mcl; - mclcount = 0; - KASSERT(error == 0, ("bad multicall %d", error)); - } - } - if (mclcount) { - error = HYPERVISOR_multicall(mcl, mclcount); - KASSERT(error == 0, ("bad multicall %d", error)); - } - -#ifdef INVARIANTS - for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++) - KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE)); -#endif -} - -/* - * This routine tears out page mappings from the - * kernel -- it is meant only for temporary mappings. - * Note: SMP coherent. Uses a ranged shootdown IPI. - */ -void -pmap_qremove(vm_offset_t sva, int count) -{ - vm_offset_t va; - - CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count); - va = sva; - rw_wlock(&pvh_global_lock); - critical_enter(); - while (count-- > 0) { - pmap_kremove(va); - va += PAGE_SIZE; - } - PT_UPDATES_FLUSH(); - pmap_invalidate_range(kernel_pmap, sva, va); - critical_exit(); - rw_wunlock(&pvh_global_lock); -} - -/*************************************************** - * Page table page management routines..... - ***************************************************/ -static __inline void -pmap_free_zero_pages(vm_page_t free) -{ - vm_page_t m; - - while (free != NULL) { - m = free; - free = (void *)m->object; - m->object = NULL; - vm_page_free_zero(m); - } -} - -/* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count - * drops to zero, then the page table page is unmapped. Returns TRUE if the - * page table page was unmapped and FALSE otherwise. - */ -static inline boolean_t -pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) -{ - - --m->wire_count; - if (m->wire_count == 0) { - _pmap_unwire_ptp(pmap, m, free); - return (TRUE); - } else - return (FALSE); -} - -static void -_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) -{ - vm_offset_t pteva; - - PT_UPDATES_FLUSH(); - /* - * unmap the page table page - */ - xen_pt_unpin(pmap->pm_pdir[m->pindex]); - /* - * page *might* contain residual mapping :-/ - */ - PD_CLEAR_VA(pmap, m->pindex, TRUE); - pmap_zero_page(m); - --pmap->pm_stats.resident_count; - - /* - * This is a release store so that the ordinary store unmapping - * the page table page is globally performed before TLB shoot- - * down is begun. - */ - atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); - - /* - * Do an invltlb to make the invalidated mapping - * take effect immediately. - */ - pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); - pmap_invalidate_page(pmap, pteva); - - /* - * Put page on a list so that it is released after - * *ALL* TLB shootdown is done - */ - m->object = (void *)*free; - *free = m; -} - -/* - * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. - */ -static int -pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) -{ - pd_entry_t ptepde; - vm_page_t mpte; - - if (va >= VM_MAXUSER_ADDRESS) - return (0); - ptepde = PT_GET(pmap_pde(pmap, va)); - mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); - return (pmap_unwire_ptp(pmap, mpte, free)); -} - -/* - * Initialize the pmap for the swapper process. - */ -void -pmap_pinit0(pmap_t pmap) -{ - - PMAP_LOCK_INIT(pmap); - /* - * Since the page table directory is shared with the kernel pmap, - * which is already included in the list "allpmaps", this pmap does - * not need to be inserted into that list. - */ - pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); -#ifdef PAE - pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); -#endif - CPU_ZERO(&pmap->pm_active); - PCPU_SET(curpmap, pmap); - TAILQ_INIT(&pmap->pm_pvchunk); - bzero(&pmap->pm_stats, sizeof pmap->pm_stats); -} - -/* - * Initialize a preallocated and zeroed pmap structure, - * such as one in a vmspace structure. - */ -int -pmap_pinit(pmap_t pmap) -{ - vm_page_t m, ptdpg[NPGPTD + 1]; - int npgptd = NPGPTD + 1; - int i; - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - /* - * No need to allocate page table space yet but we do need a valid - * page directory table. - */ - if (pmap->pm_pdir == NULL) { - pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); - if (pmap->pm_pdir == NULL) { -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif - return (0); - } -#ifdef PAE - pmap->pm_pdpt = (pd_entry_t *)kva_alloc(1); -#endif - } - - /* - * allocate the page directory page(s) - */ - for (i = 0; i < npgptd;) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (m == NULL) - VM_WAIT; - else { - ptdpg[i++] = m; - } - } - - pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); - - for (i = 0; i < NPGPTD; i++) - if ((ptdpg[i]->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pdir + (i * NPDEPG)); - - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); - /* Copy the kernel page table directory entries. */ - bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); - mtx_unlock_spin(&allpmaps_lock); - -#ifdef PAE - pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1); - if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0) - bzero(pmap->pm_pdpt, PAGE_SIZE); - for (i = 0; i < NPGPTD; i++) { - vm_paddr_t ma; - - ma = VM_PAGE_TO_MACH(ptdpg[i]); - pmap->pm_pdpt[i] = ma | PG_V; - - } -#endif - for (i = 0; i < NPGPTD; i++) { - pt_entry_t *pd; - vm_paddr_t ma; - - ma = VM_PAGE_TO_MACH(ptdpg[i]); - pd = pmap->pm_pdir + (i * NPDEPG); - PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW)); -#if 0 - xen_pgd_pin(ma); -#endif - } - -#ifdef PAE - PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW); -#endif - rw_wlock(&pvh_global_lock); - xen_flush_queue(); - xen_pgdpt_pin(VM_PAGE_TO_MACH(ptdpg[NPGPTD])); - for (i = 0; i < NPGPTD; i++) { - vm_paddr_t ma = VM_PAGE_TO_MACH(ptdpg[i]); - PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE); - } - xen_flush_queue(); - rw_wunlock(&pvh_global_lock); - CPU_ZERO(&pmap->pm_active); - TAILQ_INIT(&pmap->pm_pvchunk); - bzero(&pmap->pm_stats, sizeof pmap->pm_stats); - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif - return (1); -} - -/* - * this routine is called if the page table page is not - * mapped correctly. - */ -static vm_page_t -_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) -{ - vm_paddr_t ptema; - vm_page_t m; - - /* - * Allocate a page table page. - */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { - if ((flags & PMAP_ENTER_NOSLEEP) == 0) { - PMAP_UNLOCK(pmap); - rw_wunlock(&pvh_global_lock); - VM_WAIT; - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - } - - /* - * Indicate the need to retry. While waiting, the page table - * page may have been allocated. - */ - return (NULL); - } - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); - - /* - * Map the pagetable page into the process address space, if - * it isn't already there. - */ - - pmap->pm_stats.resident_count++; - - ptema = VM_PAGE_TO_MACH(m); - xen_pt_pin(ptema); - PT_SET_VA_MA(&pmap->pm_pdir[ptepindex], - (ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE); - - KASSERT(pmap->pm_pdir[ptepindex], - ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex)); - return (m); -} - -static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) -{ - u_int ptepindex; - pd_entry_t ptema; - vm_page_t m; - - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; -retry: - /* - * Get the page directory entry - */ - ptema = pmap->pm_pdir[ptepindex]; - - /* - * This supports switching from a 4MB page to a - * normal 4K page. - */ - if (ptema & PG_PS) { - /* - * XXX - */ - pmap->pm_pdir[ptepindex] = 0; - ptema = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_invalidate_all(kernel_pmap); - } - - /* - * If the page table page is mapped, we just increment the - * hold count, and activate it. - */ - if (ptema & PG_V) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); - m->wire_count++; - } else { - /* - * Here if the pte page isn't mapped, or if it has - * been deallocated. - */ - CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x", - pmap, va, flags); - m = _pmap_allocpte(pmap, ptepindex, flags); - if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) - goto retry; - - KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex)); - } - return (m); -} - - -/*************************************************** -* Pmap allocation/deallocation routines. - ***************************************************/ - - -/* - * Release any resources held by the given physical map. - * Called when a pmap initialized by pmap_pinit is being released. - * Should only be called if the map contains no valid mappings. - */ -void -pmap_release(pmap_t pmap) -{ - vm_page_t m, ptdpg[2*NPGPTD+1]; - vm_paddr_t ma; - int i; -#ifdef PAE - int npgptd = NPGPTD + 1; -#else - int npgptd = NPGPTD; -#endif - - KASSERT(pmap->pm_stats.resident_count == 0, - ("pmap_release: pmap resident count %ld != 0", - pmap->pm_stats.resident_count)); - PT_UPDATES_FLUSH(); - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - KASSERT(CPU_EMPTY(&pmap->pm_active), - ("releasing active pmap %p", pmap)); - mtx_lock_spin(&allpmaps_lock); - LIST_REMOVE(pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - - for (i = 0; i < NPGPTD; i++) - ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME); - pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); -#ifdef PAE - ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt)); -#endif - - for (i = 0; i < npgptd; i++) { - m = ptdpg[i]; - ma = VM_PAGE_TO_MACH(m); - /* unpinning L1 and L2 treated the same */ -#if 0 - xen_pgd_unpin(ma); -#else - if (i == NPGPTD) - xen_pgd_unpin(ma); -#endif -#ifdef PAE - if (i < NPGPTD) - KASSERT(VM_PAGE_TO_MACH(m) == (pmap->pm_pdpt[i] & PG_FRAME), - ("pmap_release: got wrong ptd page")); -#endif - m->wire_count--; - atomic_subtract_int(&vm_cnt.v_wire_count, 1); - vm_page_free(m); - } -#ifdef PAE - pmap_qremove((vm_offset_t)pmap->pm_pdpt, 1); -#endif - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif -} - -static int -kvm_size(SYSCTL_HANDLER_ARGS) -{ - unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; - - return (sysctl_handle_long(oidp, &ksize, 0, req)); -} -SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, - 0, 0, kvm_size, "IU", "Size of KVM"); - -static int -kvm_free(SYSCTL_HANDLER_ARGS) -{ - unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; - - return (sysctl_handle_long(oidp, &kfree, 0, req)); -} -SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, - 0, 0, kvm_free, "IU", "Amount of KVM free"); - -/* - * grow the number of kernel page table entries, if needed - */ -void -pmap_growkernel(vm_offset_t addr) -{ - struct pmap *pmap; - vm_paddr_t ptppaddr; - vm_page_t nkpg; - pd_entry_t newpdir; - - mtx_assert(&kernel_map->system_mtx, MA_OWNED); - if (kernel_vm_end == 0) { - kernel_vm_end = KERNBASE; - nkpt = 0; - while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); - nkpt++; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - } - } - addr = roundup2(addr, NBPDR); - if (addr - 1 >= kernel_map->max_offset) - addr = kernel_map->max_offset; - while (kernel_vm_end < addr) { - if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - continue; - } - - nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - if (nkpg == NULL) - panic("pmap_growkernel: no memory to grow kernel"); - - nkpt++; - - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); - ptppaddr = VM_PAGE_TO_PHYS(nkpg); - newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); - rw_wlock(&pvh_global_lock); - PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); - mtx_lock_spin(&allpmaps_lock); - LIST_FOREACH(pmap, &allpmaps, pm_list) - PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); - - mtx_unlock_spin(&allpmaps_lock); - rw_wunlock(&pvh_global_lock); - - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - } -} - - -/*************************************************** - * page management routines. - ***************************************************/ - -CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); -CTASSERT(_NPCM == 11); -CTASSERT(_NPCPV == 336); - -static __inline struct pv_chunk * -pv_to_chunk(pv_entry_t pv) -{ - - return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); -} - -#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) - -#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ -#define PC_FREE10 0x0000fffful /* Free values for index 10 */ - -static const uint32_t pc_freemask[_NPCM] = { - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE10 -}; - -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, - "Current number of pv entries"); - -#ifdef PV_STATS -static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; - -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, - "Current number of pv entry chunks"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, - "Current number of pv entry chunks allocated"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, - "Current number of pv entry chunks frees"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, - "Number of times tried to get a chunk page but failed."); - -static long pv_entry_frees, pv_entry_allocs; -static int pv_entry_spare; - -SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, - "Current number of pv entry frees"); -SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, - "Current number of pv entry allocs"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, - "Current number of spare pv entries"); -#endif - -/* - * We are in a serious low memory condition. Resort to - * drastic measures to free some pages so we can allocate - * another pv entry chunk. - */ -static vm_page_t -pmap_pv_reclaim(pmap_t locked_pmap) -{ - struct pch newtail; - struct pv_chunk *pc; - pmap_t pmap; - pt_entry_t *pte, tpte; - pv_entry_t pv; - vm_offset_t va; - vm_page_t free, m, m_pc; - uint32_t inuse; - int bit, field, freed; - - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - pmap = NULL; - free = m_pc = NULL; - TAILQ_INIT(&newtail); - while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || - free == NULL)) { - TAILQ_REMOVE(&pv_chunks, pc, pc_lru); - if (pmap != pc->pc_pmap) { - if (pmap != NULL) { - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - } - pmap = pc->pc_pmap; - /* Avoid deadlock and lock recursion. */ - if (pmap > locked_pmap) - PMAP_LOCK(pmap); - else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { - pmap = NULL; - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - continue; - } - } - - /* - * Destroy every non-wired, 4 KB page mapping in the chunk. - */ - freed = 0; - for (field = 0; field < _NPCM; field++) { - for (inuse = ~pc->pc_map[field] & pc_freemask[field]; - inuse != 0; inuse &= ~(1UL << bit)) { - bit = bsfl(inuse); - pv = &pc->pc_pventry[field * 32 + bit]; - va = pv->pv_va; - pte = pmap_pte(pmap, va); - tpte = *pte; - if ((tpte & PG_W) == 0) - tpte = pte_load_clear(pte); - pmap_pte_release(pte); - if ((tpte & PG_W) != 0) - continue; - KASSERT(tpte != 0, - ("pmap_pv_reclaim: pmap %p va %x zero pte", - pmap, va)); - if ((tpte & PG_G) != 0) - pmap_invalidate_page(pmap, va); - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - if ((tpte & PG_A) != 0) - vm_page_aflag_set(m, PGA_REFERENCED); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); - pc->pc_map[field] |= 1UL << bit; - pmap_unuse_pt(pmap, va, &free); - freed++; - } - } - if (freed == 0) { - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - continue; - } - /* Every freed mapping is for a 4 KB page. */ - pmap->pm_stats.resident_count -= freed; - PV_STAT(pv_entry_frees += freed); - PV_STAT(pv_entry_spare += freed); - pv_entry_count -= freed; - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - for (field = 0; field < _NPCM; field++) - if (pc->pc_map[field] != pc_freemask[field]) { - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, - pc_list); - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - - /* - * One freed pv entry in locked_pmap is - * sufficient. - */ - if (pmap == locked_pmap) - goto out; - break; - } - if (field == _NPCM) { - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); - /* Entire chunk is free; return it. */ - m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); - pmap_qremove((vm_offset_t)pc, 1); - pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); - break; - } - } -out: - TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); - if (pmap != NULL) { - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - } - if (m_pc == NULL && pv_vafree != 0 && free != NULL) { - m_pc = free; - free = (void *)m_pc->object; - /* Recycle a freed page table page. */ - m_pc->wire_count = 1; - atomic_add_int(&vm_cnt.v_wire_count, 1); - } - pmap_free_zero_pages(free); - return (m_pc); -} - -/* - * free the pv_entry back to the free list - */ -static void -free_pv_entry(pmap_t pmap, pv_entry_t pv) -{ - struct pv_chunk *pc; - int idx, field, bit; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(pv_entry_frees++); - PV_STAT(pv_entry_spare++); - pv_entry_count--; - pc = pv_to_chunk(pv); - idx = pv - &pc->pc_pventry[0]; - field = idx / 32; - bit = idx % 32; - pc->pc_map[field] |= 1ul << bit; - for (idx = 0; idx < _NPCM; idx++) - if (pc->pc_map[idx] != pc_freemask[idx]) { - /* - * 98% of the time, pc is already at the head of the - * list. If it isn't already, move it to the head. - */ - if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != - pc)) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, - pc_list); - } - return; - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - free_pv_chunk(pc); -} - -static void -free_pv_chunk(struct pv_chunk *pc) -{ - vm_page_t m; - - TAILQ_REMOVE(&pv_chunks, pc, pc_lru); - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); - /* entire chunk is free, return it */ - m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); - pmap_qremove((vm_offset_t)pc, 1); - vm_page_unwire(m, PQ_INACTIVE); - vm_page_free(m); - pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); -} - -/* - * get a new pv_entry, allocating a block from the system - * when needed. - */ -static pv_entry_t -get_pv_entry(pmap_t pmap, boolean_t try) -{ - static const struct timeval printinterval = { 60, 0 }; - static struct timeval lastprint; - int bit, field; - pv_entry_t pv; - struct pv_chunk *pc; - vm_page_t m; - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - rw_assert(&pvh_global_lock, RA_WLOCKED); - PV_STAT(pv_entry_allocs++); - pv_entry_count++; - if (pv_entry_count > pv_entry_high_water) - if (ratecheck(&lastprint, &printinterval)) - printf("Approaching the limit on PV entries, consider " - "increasing either the vm.pmap.shpgperproc or the " - "vm.pmap.pv_entry_max tunable.\n"); -retry: - pc = TAILQ_FIRST(&pmap->pm_pvchunk); - if (pc != NULL) { - for (field = 0; field < _NPCM; field++) { - if (pc->pc_map[field]) { - bit = bsfl(pc->pc_map[field]); - break; - } - } - if (field < _NPCM) { - pv = &pc->pc_pventry[field * 32 + bit]; - pc->pc_map[field] &= ~(1ul << bit); - /* If this was the last item, move it to tail */ - for (field = 0; field < _NPCM; field++) - if (pc->pc_map[field] != 0) { - PV_STAT(pv_entry_spare--); - return (pv); /* not full, return */ - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); - PV_STAT(pv_entry_spare--); - return (pv); - } - } - /* - * Access to the ptelist "pv_vafree" is synchronized by the page - * queues lock. If "pv_vafree" is currently non-empty, it will - * remain non-empty until pmap_ptelist_alloc() completes. - */ - if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { - if (try) { - pv_entry_count--; - PV_STAT(pc_chunk_tryfail++); - return (NULL); - } - m = pmap_pv_reclaim(pmap); - if (m == NULL) - goto retry; - } - PV_STAT(pc_chunk_count++); - PV_STAT(pc_chunk_allocs++); - pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); - pmap_qenter((vm_offset_t)pc, &m, 1); - if ((m->flags & PG_ZERO) == 0) - pagezero(pc); - pc->pc_pmap = pmap; - pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ - for (field = 1; field < _NPCM; field++) - pc->pc_map[field] = pc_freemask[field]; - TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); - pv = &pc->pc_pventry[0]; - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - PV_STAT(pv_entry_spare += _NPCPV - 1); - return (pv); -} - -static __inline pv_entry_t -pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) -{ - pv_entry_t pv; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { - if (pmap == PV_PMAP(pv) && va == pv->pv_va) { - TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); - break; - } - } - return (pv); -} - -static void -pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) -{ - pv_entry_t pv; - - pv = pmap_pvh_remove(pvh, pmap, va); - KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); - free_pv_entry(pmap, pv); -} - -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) -{ - - rw_assert(&pvh_global_lock, RA_WLOCKED); - pmap_pvh_free(&m->md, pmap, va); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); -} - -/* - * Conditionally create a pv entry. - */ -static boolean_t -pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) -{ - pv_entry_t pv; - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - rw_assert(&pvh_global_lock, RA_WLOCKED); - if (pv_entry_count < pv_entry_high_water && - (pv = get_pv_entry(pmap, TRUE)) != NULL) { - pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - return (TRUE); - } else - return (FALSE); -} - -/* - * pmap_remove_pte: do the things to unmap a page in a process - */ -static int -pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) -{ - pt_entry_t oldpte; - vm_page_t m; - - CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x", - pmap, (u_long)*ptq, va); - - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - oldpte = *ptq; - PT_SET_VA_MA(ptq, 0, TRUE); - KASSERT(oldpte != 0, - ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); - if (oldpte & PG_W) - pmap->pm_stats.wired_count -= 1; - /* - * Machines that don't support invlpg, also don't support - * PG_G. - */ - if (oldpte & PG_G) - pmap_invalidate_page(kernel_pmap, va); - pmap->pm_stats.resident_count -= 1; - if (oldpte & PG_MANAGED) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME); - if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - if (oldpte & PG_A) - vm_page_aflag_set(m, PGA_REFERENCED); - pmap_remove_entry(pmap, m, va); - } - return (pmap_unuse_pt(pmap, va, free)); -} - -/* - * Remove a single page from a process address space - */ -static void -pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) -{ - pt_entry_t *pte; - - CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x", - pmap, va); - - rw_assert(&pvh_global_lock, RA_WLOCKED); - KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0) - return; - pmap_remove_pte(pmap, pte, va, free); - pmap_invalidate_page(pmap, va); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - -} - -/* - * Remove the given range of addresses from the specified map. - * - * It is assumed that the start and end are properly - * rounded to the page size. - */ -void -pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t pdnxt; - pd_entry_t ptpaddr; - pt_entry_t *pte; - vm_page_t free = NULL; - int anyvalid; - - CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x", - pmap, sva, eva); - - /* - * Perform an unsynchronized read. This is, however, safe. - */ - if (pmap->pm_stats.resident_count == 0) - return; - - anyvalid = 0; - - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - - /* - * special handling of removing one page. a very - * common operation and easy to short circuit some - * code. - */ - if ((sva + PAGE_SIZE == eva) && - ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { - pmap_remove_page(pmap, sva, &free); - goto out; - } - - for (; sva < eva; sva = pdnxt) { - u_int pdirindex; - - /* - * Calculate index for next page table. - */ - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - if (pmap->pm_stats.resident_count == 0) - break; - - pdirindex = sva >> PDRSHIFT; - ptpaddr = pmap->pm_pdir[pdirindex]; - - /* - * Weed out invalid mappings. Note: we assume that the page - * directory table is always allocated, and in kernel virtual. - */ - if (ptpaddr == 0) - continue; - - /* - * Check for large page. - */ - if ((ptpaddr & PG_PS) != 0) { - PD_CLEAR_VA(pmap, pdirindex, TRUE); - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - anyvalid = 1; - continue; - } - - /* - * Limit our scan to either the end of the va represented - * by the current page table page, or to the end of the - * range being removed. - */ - if (pdnxt > eva) - pdnxt = eva; - - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & PG_V) == 0) - continue; - - /* - * The TLB entry for a PG_G mapping is invalidated - * by pmap_remove_pte(). - */ - if ((*pte & PG_G) == 0) - anyvalid = 1; - if (pmap_remove_pte(pmap, pte, sva, &free)) - break; - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); -out: - if (anyvalid) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - pmap_free_zero_pages(free); -} - -/* - * Routine: pmap_remove_all - * Function: - * Removes this physical page from - * all physical maps in which it resides. - * Reflects back modify bits to the pager. - * - * Notes: - * Original versions of this routine were very - * inefficient because they iteratively called - * pmap_remove (slow...) - */ - -void -pmap_remove_all(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t *pte, tpte; - vm_page_t free; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_remove_all: page %p is not managed", m)); - free = NULL; - rw_wlock(&pvh_global_lock); - sched_pin(); - while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pmap->pm_stats.resident_count--; - pte = pmap_pte_quick(pmap, pv->pv_va); - tpte = *pte; - PT_SET_VA_MA(pte, 0, TRUE); - KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", - pmap, pv->pv_va)); - if (tpte & PG_W) - pmap->pm_stats.wired_count--; - if (tpte & PG_A) - vm_page_aflag_set(m, PGA_REFERENCED); - - /* - * Update the vm_page_t clean and reference bits. - */ - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - pmap_unuse_pt(pmap, pv->pv_va, &free); - pmap_invalidate_page(pmap, pv->pv_va); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - free_pv_entry(pmap, pv); - PMAP_UNLOCK(pmap); - } - vm_page_aflag_clear(m, PGA_WRITEABLE); - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - pmap_free_zero_pages(free); -} - -/* - * Set the physical protection on the - * specified range of this map as requested. - */ -void -pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) -{ - vm_offset_t pdnxt; - pd_entry_t ptpaddr; - pt_entry_t *pte; - int anychanged; - - CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x", - pmap, sva, eva, prot); - - if ((prot & VM_PROT_READ) == VM_PROT_NONE) { - pmap_remove(pmap, sva, eva); - return; - } - -#ifdef PAE - if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == - (VM_PROT_WRITE|VM_PROT_EXECUTE)) - return; -#else - if (prot & VM_PROT_WRITE) - return; -#endif - - anychanged = 0; - - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pt_entry_t obits, pbits; - u_int pdirindex; - - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - - pdirindex = sva >> PDRSHIFT; - ptpaddr = pmap->pm_pdir[pdirindex]; - - /* - * Weed out invalid mappings. Note: we assume that the page - * directory table is always allocated, and in kernel virtual. - */ - if (ptpaddr == 0) - continue; - - /* - * Check for large page. - */ - if ((ptpaddr & PG_PS) != 0) { - if ((prot & VM_PROT_WRITE) == 0) - pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pmap->pm_pdir[pdirindex] |= pg_nx; -#endif - anychanged = 1; - continue; - } - - if (pdnxt > eva) - pdnxt = eva; - - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - vm_page_t m; - -retry: - /* - * Regardless of whether a pte is 32 or 64 bits in - * size, PG_RW, PG_A, and PG_M are among the least - * significant 32 bits. - */ - obits = pbits = *pte; - if ((pbits & PG_V) == 0) - continue; - - if ((prot & VM_PROT_WRITE) == 0) { - if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == - (PG_MANAGED | PG_M | PG_RW)) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & - PG_FRAME); - vm_page_dirty(m); - } - pbits &= ~(PG_RW | PG_M); - } -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pbits |= pg_nx; -#endif - - if (pbits != obits) { - obits = *pte; - PT_SET_VA_MA(pte, pbits, TRUE); - if (*pte != pbits) - goto retry; - if (obits & PG_G) - pmap_invalidate_page(pmap, sva); - else - anychanged = 1; - } - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - if (anychanged) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte can not be reclaimed. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. - */ -int -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - u_int flags, int8_t psind __unused) -{ - pd_entry_t *pde; - pt_entry_t *pte; - pt_entry_t newpte, origpte; - pv_entry_t pv; - vm_paddr_t opa, pa; - vm_page_t mpte, om; - boolean_t invlva, wired; - - CTR5(KTR_PMAP, - "pmap_enter: pmap=%08p va=0x%08x ma=0x%08x prot=0x%x flags=0x%x", - pmap, va, VM_PAGE_TO_MACH(m), prot, flags); - va = trunc_page(va); - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); - KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, - ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", - va)); - if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) - VM_OBJECT_ASSERT_LOCKED(m->object); - - mpte = NULL; - wired = (flags & PMAP_ENTER_WIRED) != 0; - - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - sched_pin(); - - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ - if (va < VM_MAXUSER_ADDRESS) { - mpte = pmap_allocpte(pmap, va, flags); - if (mpte == NULL) { - KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, - ("pmap_allocpte failed with sleep allowed")); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - return (KERN_RESOURCE_SHORTAGE); - } - } - - pde = pmap_pde(pmap, va); - if ((*pde & PG_PS) != 0) - panic("pmap_enter: attempted pmap_enter on 4MB page"); - pte = pmap_pte_quick(pmap, va); - - /* - * Page Directory table entry not valid, we need a new PT page - */ - if (pte == NULL) { - panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", - (uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va); - } - - pa = VM_PAGE_TO_PHYS(m); - om = NULL; - opa = origpte = 0; - -#if 0 - KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx", - pte, *pte)); -#endif - origpte = *pte; - if (origpte) - origpte = xpmap_mtop(origpte); - opa = origpte & PG_FRAME; - - /* - * Mapping has not changed, must be protection or wiring change. - */ - if (origpte && (opa == pa)) { - /* - * Wiring change, just update stats. We don't worry about - * wiring PT pages as they remain resident as long as there - * are valid mappings in them. Hence, if a user page is wired, - * the PT page will be also. - */ - if (wired && ((origpte & PG_W) == 0)) - pmap->pm_stats.wired_count++; - else if (!wired && (origpte & PG_W)) - pmap->pm_stats.wired_count--; - - /* - * Remove extra pte reference - */ - if (mpte) - mpte->wire_count--; - - if (origpte & PG_MANAGED) { - om = m; - pa |= PG_MANAGED; - } - goto validate; - } - - pv = NULL; - - /* - * Mapping has changed, invalidate old range and fall through to - * handle validating new mapping. - */ - if (opa) { - if (origpte & PG_W) - pmap->pm_stats.wired_count--; - if (origpte & PG_MANAGED) { - om = PHYS_TO_VM_PAGE(opa); - pv = pmap_pvh_remove(&om->md, pmap, va); - } else if (va < VM_MAXUSER_ADDRESS) - printf("va=0x%x is unmanaged :-( \n", va); - - if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, - ("pmap_enter: missing reference to page table page," - " va: 0x%x", va)); - } - } else - pmap->pm_stats.resident_count++; - - /* - * Enter on the PV list if part of our managed memory. - */ - if ((m->oflags & VPO_UNMANAGED) == 0) { - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, - ("pmap_enter: managed mapping within the clean submap")); - if (pv == NULL) - pv = get_pv_entry(pmap, FALSE); - pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - pa |= PG_MANAGED; - } else if (pv != NULL) - free_pv_entry(pmap, pv); - - /* - * Increment counters - */ - if (wired) - pmap->pm_stats.wired_count++; - -validate: - /* - * Now validate mapping with desired protection/wiring. - */ - newpte = (pt_entry_t)(pa | PG_V); - if ((prot & VM_PROT_WRITE) != 0) { - newpte |= PG_RW; - if ((newpte & PG_MANAGED) != 0) - vm_page_aflag_set(m, PGA_WRITEABLE); - } -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - newpte |= pg_nx; -#endif - if (wired) - newpte |= PG_W; - if (va < VM_MAXUSER_ADDRESS) - newpte |= PG_U; - if (pmap == kernel_pmap) - newpte |= pgeflag; - - critical_enter(); - /* - * if the mapping or permission bits are different, we need - * to update the pte. - */ - if ((origpte & ~(PG_M|PG_A)) != newpte) { - if (origpte) { - invlva = FALSE; - origpte = *pte; - PT_SET_VA(pte, newpte | PG_A, FALSE); - if (origpte & PG_A) { - if (origpte & PG_MANAGED) - vm_page_aflag_set(om, PGA_REFERENCED); - if (opa != VM_PAGE_TO_PHYS(m)) - invlva = TRUE; -#ifdef PAE - if ((origpte & PG_NX) == 0 && - (newpte & PG_NX) != 0) - invlva = TRUE; -#endif - } - if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - if ((origpte & PG_MANAGED) != 0) - vm_page_dirty(om); - if ((prot & VM_PROT_WRITE) == 0) - invlva = TRUE; - } - if ((origpte & PG_MANAGED) != 0 && - TAILQ_EMPTY(&om->md.pv_list)) - vm_page_aflag_clear(om, PGA_WRITEABLE); - if (invlva) - pmap_invalidate_page(pmap, va); - } else{ - PT_SET_VA(pte, newpte | PG_A, FALSE); - } - - } - PT_UPDATES_FLUSH(); - critical_exit(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - return (KERN_SUCCESS); -} - -/* - * Maps a sequence of resident pages belonging to the same object. - * The sequence begins with the given page m_start. This page is - * mapped at the given virtual address start. Each subsequent page is - * mapped at a virtual address that is offset from start by the same - * amount as the page is offset from m_start within the object. The - * last page in the sequence is the page with the largest offset from - * m_start that can be mapped at a virtual address less than the given - * virtual address end. Not every virtual page between start and end - * is mapped; only those for which a resident page exists with the - * corresponding offset from m_start are mapped. - */ -void -pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, - vm_page_t m_start, vm_prot_t prot) -{ - vm_page_t m, mpte; - vm_pindex_t diff, psize; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - int error, count = 0; - - VM_OBJECT_ASSERT_LOCKED(m_start->object); - - psize = atop(end - start); - mpte = NULL; - m = m_start; - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m, - prot, mpte); - m = TAILQ_NEXT(m, listq); - if (count == 16) { - error = HYPERVISOR_multicall(mcl, count); - KASSERT(error == 0, ("bad multicall %d", error)); - mclp = mcl; - count = 0; - } - } - if (count) { - error = HYPERVISOR_multicall(mcl, count); - KASSERT(error == 0, ("bad multicall %d", error)); - } - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * this code makes some *MAJOR* assumptions: - * 1. Current pmap & pmap exists. - * 2. Not wired. - * 3. Read access. - * 4. No page table pages. - * but is *MUCH* faster than pmap_enter... - */ - -void -pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) -{ - multicall_entry_t mcl, *mclp; - int count = 0; - mclp = &mcl; - - CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x", - pmap, va, m, prot); - - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - (void)pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL); - if (count) - HYPERVISOR_multicall(&mcl, count); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -#ifdef notyet -void -pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count) -{ - int i, error, index = 0; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - - PMAP_LOCK(pmap); - for (i = 0; i < count; i++, addrs++, pages++, prots++) { - if (!pmap_is_prefaultable_locked(pmap, *addrs)) - continue; - - (void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL); - if (index == 16) { - error = HYPERVISOR_multicall(mcl, index); - mclp = mcl; - index = 0; - KASSERT(error == 0, ("bad multicall %d", error)); - } - } - if (index) { - error = HYPERVISOR_multicall(mcl, index); - KASSERT(error == 0, ("bad multicall %d", error)); - } - - PMAP_UNLOCK(pmap); -} -#endif - -static vm_page_t -pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, vm_page_t mpte) -{ - pt_entry_t *pte; - vm_paddr_t pa; - vm_page_t free; - multicall_entry_t *mcl = *mclpp; - - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || - (m->oflags & VPO_UNMANAGED) != 0, - ("pmap_enter_quick_locked: managed mapping within the clean submap")); - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ - if (va < VM_MAXUSER_ADDRESS) { - u_int ptepindex; - pd_entry_t ptema; - - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; - if (mpte && (mpte->pindex == ptepindex)) { - mpte->wire_count++; - } else { - /* - * Get the page directory entry - */ - ptema = pmap->pm_pdir[ptepindex]; - - /* - * If the page table page is mapped, we just increment - * the hold count, and activate it. - */ - if (ptema & PG_V) { - if (ptema & PG_PS) - panic("pmap_enter_quick: unexpected mapping into 4MB page"); - mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); - mpte->wire_count++; - } else { - mpte = _pmap_allocpte(pmap, ptepindex, - PMAP_ENTER_NOSLEEP); - if (mpte == NULL) - return (mpte); - } - } - } else { - mpte = NULL; - } - - /* - * This call to vtopte makes the assumption that we are - * entering the page into the current pmap. In order to support - * quick entry into any pmap, one would likely use pmap_pte_quick. - * But that isn't as quick as vtopte. - */ - KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap")); - pte = vtopte(va); - if (*pte & PG_V) { - if (mpte != NULL) { - mpte->wire_count--; - mpte = NULL; - } - return (mpte); - } - - /* - * Enter on the PV list if part of our managed memory. - */ - if ((m->oflags & VPO_UNMANAGED) == 0 && - !pmap_try_insert_pv_entry(pmap, va, m)) { - if (mpte != NULL) { - free = NULL; - if (pmap_unwire_ptp(pmap, mpte, &free)) { - pmap_invalidate_page(pmap, va); - pmap_free_zero_pages(free); - } - - mpte = NULL; - } - return (mpte); - } - - /* - * Increment counters - */ - pmap->pm_stats.resident_count++; - - pa = VM_PAGE_TO_PHYS(m); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pa |= pg_nx; -#endif - -#if 0 - /* - * Now validate mapping with RO protection - */ - if ((m->oflags & VPO_UNMANAGED) != 0) - pte_store(pte, pa | PG_V | PG_U); - else - pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); -#else - /* - * Now validate mapping with RO protection - */ - if ((m->oflags & VPO_UNMANAGED) != 0) - pa = xpmap_ptom(pa | PG_V | PG_U); - else - pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED); - - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; - mcl->args[1] = (uint32_t)(pa & 0xffffffff); - mcl->args[2] = (uint32_t)(pa >> 32); - mcl->args[3] = 0; - *mclpp = mcl + 1; - *count = *count + 1; -#endif - return (mpte); -} - -/* - * Make a temporary mapping for a physical address. This is only intended - * to be used for panic dumps. - */ -void * -pmap_kenter_temporary(vm_paddr_t pa, int i) -{ - vm_offset_t va; - vm_paddr_t ma = xpmap_ptom(pa); - - va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); - PT_SET_MA(va, (ma & ~PAGE_MASK) | PG_V | pgeflag); - invlpg(va); - return ((void *)crashdumpmap); -} - -/* - * This code maps large physical mmap regions into the - * processor address space. Note that some shortcuts - * are taken, but the code works. - */ -void -pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, - vm_pindex_t pindex, vm_size_t size) -{ - pd_entry_t *pde; - vm_paddr_t pa, ptepa; - vm_page_t p; - int pat_mode; - - VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, - ("pmap_object_init_pt: non-device object")); - if (pseflag && - (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { - if (!vm_object_populate(object, pindex, pindex + atop(size))) - return; - p = vm_page_lookup(object, pindex); - KASSERT(p->valid == VM_PAGE_BITS_ALL, - ("pmap_object_init_pt: invalid page %p", p)); - pat_mode = p->md.pat_mode; - - /* - * Abort the mapping if the first page is not physically - * aligned to a 2/4MB page boundary. - */ - ptepa = VM_PAGE_TO_PHYS(p); - if (ptepa & (NBPDR - 1)) - return; - - /* - * Skip the first page. Abort the mapping if the rest of - * the pages are not physically contiguous or have differing - * memory attributes. - */ - p = TAILQ_NEXT(p, listq); - for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; - pa += PAGE_SIZE) { - KASSERT(p->valid == VM_PAGE_BITS_ALL, - ("pmap_object_init_pt: invalid page %p", p)); - if (pa != VM_PAGE_TO_PHYS(p) || - pat_mode != p->md.pat_mode) - return; - p = TAILQ_NEXT(p, listq); - } - - /* - * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and - * "size" is a multiple of 2/4M, adding the PAT setting to - * "pa" will not affect the termination of this loop. - */ - PMAP_LOCK(pmap); - for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + - size; pa += NBPDR) { - pde = pmap_pde(pmap, addr); - if (*pde == 0) { - pde_store(pde, pa | PG_PS | PG_M | PG_A | - PG_U | PG_RW | PG_V); - pmap->pm_stats.resident_count += NBPDR / - PAGE_SIZE; - pmap_pde_mappings++; - } - /* Else continue on if the PDE is already valid. */ - addr += NBPDR; - } - PMAP_UNLOCK(pmap); - } -} - -/* - * Clear the wired attribute from the mappings for the specified range of - * addresses in the given pmap. Every valid mapping within that range - * must have the wired attribute set. In contrast, invalid mappings - * cannot have the wired attribute set, so they are ignored. - * - * The wired attribute of the page table entry is not a hardware feature, - * so there is no need to invalidate any TLB entries. - */ -void -pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t pdnxt; - pd_entry_t *pde; - pt_entry_t *pte; - - CTR3(KTR_PMAP, "pmap_unwire: pmap=%p sva=0x%x eva=0x%x", pmap, sva, - eva); - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - pde = pmap_pde(pmap, sva); - if ((*pde & PG_V) == 0) - continue; - if ((*pde & PG_PS) != 0) - panic("pmap_unwire: unexpected PG_PS in pde %#jx", - (uintmax_t)*pde); - if (pdnxt > eva) - pdnxt = eva; - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & PG_V) == 0) - continue; - if ((*pte & PG_W) == 0) - panic("pmap_unwire: pte %#jx is missing PG_W", - (uintmax_t)*pte); - PT_SET_VA_MA(pte, *pte & ~PG_W, FALSE); - pmap->pm_stats.wired_count--; - } - } - if (*PMAP1) - PT_CLEAR_VA(PMAP1, FALSE); - PT_UPDATES_FLUSH(); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - - -/* - * Copy the range specified by src_addr/len - * from the source map to the range dst_addr/len - * in the destination map. - * - * This routine is only advisory and need not do anything. - */ - -void -pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, - vm_offset_t src_addr) -{ - vm_page_t free; - vm_offset_t addr; - vm_offset_t end_addr = src_addr + len; - vm_offset_t pdnxt; - - if (dst_addr != src_addr) - return; - - if (!pmap_is_current(src_pmap)) { - CTR2(KTR_PMAP, - "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx", - (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME)); - - return; - } - CTR5(KTR_PMAP, "pmap_copy: dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x", - dst_pmap, src_pmap, dst_addr, len, src_addr); - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - rw_wlock(&pvh_global_lock); - if (dst_pmap < src_pmap) { - PMAP_LOCK(dst_pmap); - PMAP_LOCK(src_pmap); - } else { - PMAP_LOCK(src_pmap); - PMAP_LOCK(dst_pmap); - } - sched_pin(); - for (addr = src_addr; addr < end_addr; addr = pdnxt) { - pt_entry_t *src_pte, *dst_pte; - vm_page_t dstmpte, srcmpte; - pd_entry_t srcptepaddr; - u_int ptepindex; - - KASSERT(addr < UPT_MIN_ADDRESS, - ("pmap_copy: invalid to pmap_copy page tables")); - - pdnxt = (addr + NBPDR) & ~PDRMASK; - if (pdnxt < addr) - pdnxt = end_addr; - ptepindex = addr >> PDRSHIFT; - - srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]); - if (srcptepaddr == 0) - continue; - - if (srcptepaddr & PG_PS) { - if (dst_pmap->pm_pdir[ptepindex] == 0) { - PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE); - dst_pmap->pm_stats.resident_count += - NBPDR / PAGE_SIZE; - } - continue; - } - - srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - KASSERT(srcmpte->wire_count > 0, - ("pmap_copy: source page table page is unused")); - - if (pdnxt > end_addr) - pdnxt = end_addr; - - src_pte = vtopte(addr); - while (addr < pdnxt) { - pt_entry_t ptetemp; - ptetemp = *src_pte; - /* - * we only virtual copy managed pages - */ - if ((ptetemp & PG_MANAGED) != 0) { - dstmpte = pmap_allocpte(dst_pmap, addr, - PMAP_ENTER_NOSLEEP); - if (dstmpte == NULL) - goto out; - dst_pte = pmap_pte_quick(dst_pmap, addr); - if (*dst_pte == 0 && - pmap_try_insert_pv_entry(dst_pmap, addr, - PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) { - /* - * Clear the wired, modified, and - * accessed (referenced) bits - * during the copy. - */ - KASSERT(ptetemp != 0, ("src_pte not set")); - PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */); - KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)), - ("no pmap copy expected: 0x%jx saw: 0x%jx", - ptetemp & ~(PG_W | PG_M | PG_A), *dst_pte)); - dst_pmap->pm_stats.resident_count++; - } else { - free = NULL; - if (pmap_unwire_ptp(dst_pmap, dstmpte, - &free)) { - pmap_invalidate_page(dst_pmap, - addr); - pmap_free_zero_pages(free); - } - goto out; - } - if (dstmpte->wire_count >= srcmpte->wire_count) - break; - } - addr += PAGE_SIZE; - src_pte++; - } - } -out: - PT_UPDATES_FLUSH(); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(src_pmap); - PMAP_UNLOCK(dst_pmap); - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif -} - -static __inline void -pagezero(void *page) -{ -#if defined(I686_CPU) - if (cpu_class == CPUCLASS_686) { -#if defined(CPU_ENABLE_SSE) - if (cpu_feature & CPUID_SSE2) - sse2_pagezero(page); - else -#endif - i686_pagezero(page); - } else -#endif - bzero(page, PAGE_SIZE); -} - -/* - * pmap_zero_page zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. - */ -void -pmap_zero_page(vm_page_t m) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_zero_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - pagezero(sysmaps->CADDR2); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * pmap_zero_page_area zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. - * - * off and size may not cover an area beyond a single hardware page. - */ -void -pmap_zero_page_area(vm_page_t m, int off, int size) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_zero_page_area: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - - if (off == 0 && size == PAGE_SIZE) - pagezero(sysmaps->CADDR2); - else - bzero((char *)sysmaps->CADDR2 + off, size); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * pmap_zero_page_idle zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. This - * is intended to be called from the vm_pagezero process only and - * outside of Giant. - */ -void -pmap_zero_page_idle(vm_page_t m) -{ - - if (*CMAP3) - panic("pmap_zero_page_idle: CMAP3 busy"); - sched_pin(); - PT_SET_MA(CADDR3, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - pagezero(CADDR3); - PT_SET_MA(CADDR3, 0); - sched_unpin(); -} - -/* - * pmap_copy_page copies the specified (machine independent) - * page by mapping the page into virtual memory and using - * bcopy to copy the page, one machine dependent page at a - * time. - */ -void -pmap_copy_page(vm_page_t src, vm_page_t dst) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP1) - panic("pmap_copy_page: CMAP1 busy"); - if (*sysmaps->CMAP2) - panic("pmap_copy_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(src) | PG_A); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(dst) | PG_A | PG_M); - bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -int unmapped_buf_allowed = 1; - -void -pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], - vm_offset_t b_offset, int xfersize) -{ - struct sysmaps *sysmaps; - vm_page_t a_pg, b_pg; - char *a_cp, *b_cp; - vm_offset_t a_pg_offset, b_pg_offset; - int cnt; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP1 != 0) - panic("pmap_copy_pages: CMAP1 busy"); - if (*sysmaps->CMAP2 != 0) - panic("pmap_copy_pages: CMAP2 busy"); - sched_pin(); - while (xfersize > 0) { - a_pg = ma[a_offset >> PAGE_SHIFT]; - a_pg_offset = a_offset & PAGE_MASK; - cnt = min(xfersize, PAGE_SIZE - a_pg_offset); - b_pg = mb[b_offset >> PAGE_SHIFT]; - b_pg_offset = b_offset & PAGE_MASK; - cnt = min(cnt, PAGE_SIZE - b_pg_offset); - PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(a_pg) | PG_A); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | - VM_PAGE_TO_MACH(b_pg) | PG_A | PG_M); - a_cp = sysmaps->CADDR1 + a_pg_offset; - b_cp = sysmaps->CADDR2 + b_pg_offset; - bcopy(a_cp, b_cp, cnt); - a_offset += cnt; - b_offset += cnt; - xfersize -= cnt; - } - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * Returns true if the pmap's pv is one of the first - * 16 pvs linked to from this page. This count may - * be changed upwards or downwards in the future; it - * is only necessary that true be returned for a small - * subset of pmaps for proper page aging. - */ -boolean_t -pmap_page_exists_quick(pmap_t pmap, vm_page_t m) -{ - pv_entry_t pv; - int loops = 0; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_page_exists_quick: page %p is not managed", m)); - rv = FALSE; - rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - if (PV_PMAP(pv) == pmap) { - rv = TRUE; - break; - } - loops++; - if (loops >= 16) - break; - } - rw_wunlock(&pvh_global_lock); - return (rv); -} - -/* - * pmap_page_wired_mappings: - * - * Return the number of managed mappings to the given physical page - * that are wired. - */ -int -pmap_page_wired_mappings(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - int count; - - count = 0; - if ((m->oflags & VPO_UNMANAGED) != 0) - return (count); - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_W) != 0) - count++; - PMAP_UNLOCK(pmap); - } - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (count); -} - -/* - * Returns TRUE if the given page is mapped. Otherwise, returns FALSE. - */ -boolean_t -pmap_page_is_mapped(vm_page_t m) -{ - - if ((m->oflags & VPO_UNMANAGED) != 0) - return (FALSE); - return (!TAILQ_EMPTY(&m->md.pv_list)); -} - -/* - * Remove all pages from specified address space - * this aids process exit speeds. Also, this code - * is special cased for current process only, but - * can have the more generic (and slightly slower) - * mode enabled. This is much faster than pmap_remove - * in the case of running down an entire address space. - */ -void -pmap_remove_pages(pmap_t pmap) -{ - pt_entry_t *pte, tpte; - vm_page_t m, free = NULL; - pv_entry_t pv; - struct pv_chunk *pc, *npc; - int field, idx; - int32_t bit; - uint32_t inuse, bitmask; - int allfree; - - CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); - - if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { - printf("warning: pmap_remove_pages called with non-current pmap\n"); - return; - } - rw_wlock(&pvh_global_lock); - KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); - PMAP_LOCK(pmap); - sched_pin(); - TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { - KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, - pc->pc_pmap)); - allfree = 1; - for (field = 0; field < _NPCM; field++) { - inuse = ~pc->pc_map[field] & pc_freemask[field]; - while (inuse != 0) { - bit = bsfl(inuse); - bitmask = 1UL << bit; - idx = field * 32 + bit; - pv = &pc->pc_pventry[idx]; - inuse &= ~bitmask; - - pte = vtopte(pv->pv_va); - tpte = *pte ? xpmap_mtop(*pte) : 0; - - if (tpte == 0) { - printf( - "TPTE at %p IS ZERO @ VA %08x\n", - pte, pv->pv_va); - panic("bad pte"); - } - -/* - * We cannot remove wired pages from a process' mapping at this time - */ - if (tpte & PG_W) { - allfree = 0; - continue; - } - - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - KASSERT(m->phys_addr == (tpte & PG_FRAME), - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, - (uintmax_t)tpte)); - - KASSERT(m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %#jx", - (uintmax_t)tpte)); - - - PT_CLEAR_VA(pte, FALSE); - - /* - * Update the vm_page_t clean/reference bits. - */ - if (tpte & PG_M) - vm_page_dirty(m); - - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); - - pmap_unuse_pt(pmap, pv->pv_va, &free); - - /* Mark free */ - PV_STAT(pv_entry_frees++); - PV_STAT(pv_entry_spare++); - pv_entry_count--; - pc->pc_map[field] |= bitmask; - pmap->pm_stats.resident_count--; - } - } - PT_UPDATES_FLUSH(); - if (allfree) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - free_pv_chunk(pc); - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - - sched_unpin(); - pmap_invalidate_all(pmap); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - pmap_free_zero_pages(free); -} - -/* - * pmap_is_modified: - * - * Return whether or not the specified physical page was modified - * in any physical maps. - */ -boolean_t -pmap_is_modified(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_is_modified: page %p is not managed", m)); - rv = FALSE; - - /* - * If the page is not exclusive busied, then PGA_WRITEABLE cannot be - * concurrently set while the object is locked. Thus, if PGA_WRITEABLE - * is clear, no PTEs can have PG_M set. - */ - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) - return (rv); - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & PG_M) != 0; - PMAP_UNLOCK(pmap); - if (rv) - break; - } - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rv); -} - -/* - * pmap_is_prefaultable: - * - * Return whether or not the specified virtual address is elgible - * for prefault. - */ -static boolean_t -pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr) -{ - pt_entry_t *pte; - boolean_t rv = FALSE; - - return (rv); - - if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) { - pte = vtopte(addr); - rv = (*pte == 0); - } - return (rv); -} - -boolean_t -pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) -{ - boolean_t rv; - - PMAP_LOCK(pmap); - rv = pmap_is_prefaultable_locked(pmap, addr); - PMAP_UNLOCK(pmap); - return (rv); -} - -boolean_t -pmap_is_referenced(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_is_referenced: page %p is not managed", m)); - rv = FALSE; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); - PMAP_UNLOCK(pmap); - if (rv) - break; - } - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rv); -} - -void -pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) -{ - int i, npages = round_page(len) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - pt_entry_t *pte; - pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); - rw_wlock(&pvh_global_lock); - pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M))); - rw_wunlock(&pvh_global_lock); - PMAP_MARK_PRIV(xpmap_mtop(*pte)); - pmap_pte_release(pte); - } -} - -void -pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) -{ - int i, npages = round_page(len) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - pt_entry_t *pte; - pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); - PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); - rw_wlock(&pvh_global_lock); - pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M)); - rw_wunlock(&pvh_global_lock); - pmap_pte_release(pte); - } -} - -/* - * Clear the write and modified bits in each of the given page's mappings. - */ -void -pmap_remove_write(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t oldpte, *pte; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_remove_write: page %p is not managed", m)); - - /* - * If the page is not exclusive busied, then PGA_WRITEABLE cannot be - * set by another thread while the object is locked. Thus, - * if PGA_WRITEABLE is clear, no page table entries need updating. - */ - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) - return; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); -retry: - oldpte = *pte; - if ((oldpte & PG_RW) != 0) { - vm_paddr_t newpte = oldpte & ~(PG_RW | PG_M); - - /* - * Regardless of whether a pte is 32 or 64 bits - * in size, PG_RW and PG_M are among the least - * significant 32 bits. - */ - PT_SET_VA_MA(pte, newpte, TRUE); - if (*pte != newpte) - goto retry; - - if ((oldpte & PG_M) != 0) - vm_page_dirty(m); - pmap_invalidate_page(pmap, pv->pv_va); - } - PMAP_UNLOCK(pmap); - } - vm_page_aflag_clear(m, PGA_WRITEABLE); - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); -} - -/* - * pmap_ts_referenced: - * - * Return a count of reference bits for a page, clearing those bits. - * It is not necessary for every reference bit to be cleared, but it - * is necessary that 0 only be returned when there are truly no - * reference bits set. - * - * XXX: The exact number of bits to check and clear is a matter that - * should be tested and standardized at some point in the future for - * optimal aging of shared pages. - */ -int -pmap_ts_referenced(vm_page_t m) -{ - pv_entry_t pv, pvf, pvn; - pmap_t pmap; - pt_entry_t *pte; - int rtval = 0; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_ts_referenced: page %p is not managed", m)); - rw_wlock(&pvh_global_lock); - sched_pin(); - if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - pvf = pv; - do { - pvn = TAILQ_NEXT(pv, pv_next); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_A) != 0) { - PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); - pmap_invalidate_page(pmap, pv->pv_va); - rtval++; - if (rtval > 4) - pvn = NULL; - } - PMAP_UNLOCK(pmap); - } while ((pv = pvn) != NULL && pv != pvf); - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rtval); -} - -/* - * Apply the given advice to the specified range of addresses within the - * given pmap. Depending on the advice, clear the referenced and/or - * modified flags in each mapping and set the mapped page's dirty field. - */ -void -pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) -{ - pd_entry_t oldpde; - pt_entry_t *pte; - vm_offset_t pdnxt; - vm_page_t m; - boolean_t anychanged; - - if (advice != MADV_DONTNEED && advice != MADV_FREE) - return; - anychanged = FALSE; - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - oldpde = pmap->pm_pdir[sva >> PDRSHIFT]; - if ((oldpde & (PG_PS | PG_V)) != PG_V) - continue; - if (pdnxt > eva) - pdnxt = eva; - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | - PG_V)) - continue; - else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - if (advice == MADV_DONTNEED) { - /* - * Future calls to pmap_is_modified() - * can be avoided by making the page - * dirty now. - */ - m = PHYS_TO_VM_PAGE(xpmap_mtop(*pte) & - PG_FRAME); - vm_page_dirty(m); - } - PT_SET_VA_MA(pte, *pte & ~(PG_M | PG_A), TRUE); - } else if ((*pte & PG_A) != 0) - PT_SET_VA_MA(pte, *pte & ~PG_A, TRUE); - else - continue; - if ((*pte & PG_G) != 0) - pmap_invalidate_page(pmap, sva); - else - anychanged = TRUE; - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - if (anychanged) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * Clear the modify bits on the specified physical page. - */ -void -pmap_clear_modify(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t *pte; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_clear_modify: page %p is not managed", m)); - VM_OBJECT_ASSERT_WLOCKED(m->object); - KASSERT(!vm_page_xbusied(m), - ("pmap_clear_modify: page %p is exclusive busied", m)); - - /* - * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. - * If the object containing the page is locked and the page is not - * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. - */ - if ((m->aflags & PGA_WRITEABLE) == 0) - return; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - /* - * Regardless of whether a pte is 32 or 64 bits - * in size, PG_M is among the least significant - * 32 bits. - */ - PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE); - pmap_invalidate_page(pmap, pv->pv_va); - } - PMAP_UNLOCK(pmap); - } - sched_unpin(); - rw_wunlock(&pvh_global_lock); -} - -/* - * Miscellaneous support routines follow - */ - -/* - * Map a set of physical memory pages into the kernel virtual - * address space. Return a pointer to where it is mapped. This - * routine is intended to be used for mapping device memory, - * NOT real memory. - */ -void * -pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) -{ - vm_offset_t va, offset; - vm_size_t tmpsize; - - offset = pa & PAGE_MASK; - size = round_page(offset + size); - pa = pa & PG_FRAME; - - if (pa < KERNLOAD && pa + size <= KERNLOAD) - va = KERNBASE + pa; - else - va = kva_alloc(size); - if (!va) - panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); - - for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) - pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); - pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + size, FALSE); - return ((void *)(va + offset)); -} - -void * -pmap_mapdev(vm_paddr_t pa, vm_size_t size) -{ - - return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); -} - -void * -pmap_mapbios(vm_paddr_t pa, vm_size_t size) -{ - - return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); -} - -void -pmap_unmapdev(vm_offset_t va, vm_size_t size) -{ - vm_offset_t base, offset; - - if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) - return; - base = trunc_page(va); - offset = va & PAGE_MASK; - size = round_page(offset + size); - kva_free(base, size); -} - -/* - * Sets the memory attribute for the specified page. - */ -void -pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) -{ - - m->md.pat_mode = ma; - if ((m->flags & PG_FICTITIOUS) != 0) - return; - - /* - * If "m" is a normal page, flush it from the cache. - * See pmap_invalidate_cache_range(). - * - * First, try to find an existing mapping of the page by sf - * buffer. sf_buf_invalidate_cache() modifies mapping and - * flushes the cache. - */ - if (sf_buf_invalidate_cache(m)) - return; - - /* - * If page is not mapped by sf buffer, but CPU does not - * support self snoop, map the page transient and do - * invalidation. In the worst case, whole cache is flushed by - * pmap_invalidate_cache_range(). - */ - if ((cpu_feature & CPUID_SS) == 0) - pmap_flush_page(m); -} - -static void -pmap_flush_page(vm_page_t m) -{ - struct sysmaps *sysmaps; - vm_offset_t sva, eva; - - if ((cpu_feature & CPUID_CLFSH) != 0) { - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_flush_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | - VM_PAGE_TO_MACH(m) | PG_A | PG_M | - pmap_cache_bits(m->md.pat_mode, 0)); - invlcaddr(sysmaps->CADDR2); - sva = (vm_offset_t)sysmaps->CADDR2; - eva = sva + PAGE_SIZE; - - /* - * Use mfence despite the ordering implied by - * mtx_{un,}lock() because clflush is not guaranteed - * to be ordered by any other instruction. - */ - mfence(); - for (; sva < eva; sva += cpu_clflush_line_size) - clflush(sva); - mfence(); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); - } else - pmap_invalidate_cache(); -} - -/* - * Changes the specified virtual address range's memory type to that given by - * the parameter "mode". The specified virtual address range must be - * completely contained within either the kernel map. - * - * Returns zero if the change completed successfully, and either EINVAL or - * ENOMEM if the change failed. Specifically, EINVAL is returned if some part - * of the virtual address range was not mapped, and ENOMEM is returned if - * there was insufficient memory available to complete the change. - */ -int -pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) -{ - vm_offset_t base, offset, tmpva; - pt_entry_t *pte; - u_int opte, npte; - pd_entry_t *pde; - boolean_t changed; - - base = trunc_page(va); - offset = va & PAGE_MASK; - size = round_page(offset + size); - - /* Only supported on kernel virtual addresses. */ - if (base <= VM_MAXUSER_ADDRESS) - return (EINVAL); - - /* 4MB pages and pages that aren't mapped aren't supported. */ - for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { - pde = pmap_pde(kernel_pmap, tmpva); - if (*pde & PG_PS) - return (EINVAL); - if ((*pde & PG_V) == 0) - return (EINVAL); - pte = vtopte(va); - if ((*pte & PG_V) == 0) - return (EINVAL); - } - - changed = FALSE; - - /* - * Ok, all the pages exist and are 4k, so run through them updating - * their cache mode. - */ - for (tmpva = base; size > 0; ) { - pte = vtopte(tmpva); - - /* - * The cache mode bits are all in the low 32-bits of the - * PTE, so we can just spin on updating the low 32-bits. - */ - do { - opte = *(u_int *)pte; - npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); - npte |= pmap_cache_bits(mode, 0); - PT_SET_VA_MA(pte, npte, TRUE); - } while (npte != opte && (*pte != npte)); - if (npte != opte) - changed = TRUE; - tmpva += PAGE_SIZE; - size -= PAGE_SIZE; - } - - /* - * Flush CPU caches to make sure any data isn't cached that - * shouldn't be, etc. - */ - if (changed) { - pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); - } - return (0); -} - -/* - * perform the pmap work for mincore - */ -int -pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) -{ - pt_entry_t *ptep, pte; - vm_paddr_t pa; - int val; - - PMAP_LOCK(pmap); -retry: - ptep = pmap_pte(pmap, addr); - pte = (ptep != NULL) ? PT_GET(ptep) : 0; - pmap_pte_release(ptep); - val = 0; - if ((pte & PG_V) != 0) { - val |= MINCORE_INCORE; - if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; - if ((pte & PG_A) != 0) - val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; - } - if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != - (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && - (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { - pa = pte & PG_FRAME; - /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ - if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) - goto retry; - } else - PA_UNLOCK_COND(*locked_pa); - PMAP_UNLOCK(pmap); - return (val); -} - -void -pmap_activate(struct thread *td) -{ - pmap_t pmap, oldpmap; - u_int cpuid; - u_int32_t cr3; - - critical_enter(); - pmap = vmspace_pmap(td->td_proc->p_vmspace); - oldpmap = PCPU_GET(curpmap); - cpuid = PCPU_GET(cpuid); -#if defined(SMP) - CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); - CPU_SET_ATOMIC(cpuid, &pmap->pm_active); -#else - CPU_CLR(cpuid, &oldpmap->pm_active); - CPU_SET(cpuid, &pmap->pm_active); -#endif -#ifdef PAE - cr3 = vtophys(pmap->pm_pdpt); -#else - cr3 = vtophys(pmap->pm_pdir); -#endif - /* - * pmap_activate is for the current thread on the current cpu - */ - td->td_pcb->pcb_cr3 = cr3; - PT_UPDATES_FLUSH(); - load_cr3(cr3); - PCPU_SET(curpmap, pmap); - critical_exit(); -} - -void -pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) -{ -} - -/* - * Increase the starting virtual address of the given mapping if a - * different alignment might result in more superpage mappings. - */ -void -pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, - vm_offset_t *addr, vm_size_t size) -{ - vm_offset_t superpage_offset; - - if (size < NBPDR) - return; - if (object != NULL && (object->flags & OBJ_COLORED) != 0) - offset += ptoa(object->pg_color); - superpage_offset = offset & PDRMASK; - if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || - (*addr & PDRMASK) == superpage_offset) - return; - if ((*addr & PDRMASK) < superpage_offset) - *addr = (*addr & ~PDRMASK) + superpage_offset; - else - *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; -} - -void -pmap_suspend() -{ - pmap_t pmap; - int i, pdir, offset; - vm_paddr_t pdirma; - mmu_update_t mu[4]; - - /* - * We need to remove the recursive mapping structure from all - * our pmaps so that Xen doesn't get confused when it restores - * the page tables. The recursive map lives at page directory - * index PTDPTDI. We assume that the suspend code has stopped - * the other vcpus (if any). - */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - for (i = 0; i < 4; i++) { - /* - * Figure out which page directory (L2) page - * contains this bit of the recursive map and - * the offset within that page of the map - * entry - */ - pdir = (PTDPTDI + i) / NPDEPG; - offset = (PTDPTDI + i) % NPDEPG; - pdirma = pmap->pm_pdpt[pdir] & PG_FRAME; - mu[i].ptr = pdirma + offset * sizeof(pd_entry_t); - mu[i].val = 0; - } - HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF); - } -} - -void -pmap_resume() -{ - pmap_t pmap; - int i, pdir, offset; - vm_paddr_t pdirma; - mmu_update_t mu[4]; - - /* - * Restore the recursive map that we removed on suspend. - */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - for (i = 0; i < 4; i++) { - /* - * Figure out which page directory (L2) page - * contains this bit of the recursive map and - * the offset within that page of the map - * entry - */ - pdir = (PTDPTDI + i) / NPDEPG; - offset = (PTDPTDI + i) % NPDEPG; - pdirma = pmap->pm_pdpt[pdir] & PG_FRAME; - mu[i].ptr = pdirma + offset * sizeof(pd_entry_t); - mu[i].val = (pmap->pm_pdpt[i] & PG_FRAME) | PG_V; - } - HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF); - } -} - -#if defined(PMAP_DEBUG) -pmap_pid_dump(int pid) -{ - pmap_t pmap; - struct proc *p; - int npte = 0; - int index; - - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_pid != pid) - continue; - - if (p->p_vmspace) { - int i,j; - index = 0; - pmap = vmspace_pmap(p->p_vmspace); - for (i = 0; i < NPDEPTD; i++) { - pd_entry_t *pde; - pt_entry_t *pte; - vm_offset_t base = i << PDRSHIFT; - - pde = &pmap->pm_pdir[i]; - if (pde && pmap_pde_v(pde)) { - for (j = 0; j < NPTEPG; j++) { - vm_offset_t va = base + (j << PAGE_SHIFT); - if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { - if (index) { - index = 0; - printf("\n"); - } - sx_sunlock(&allproc_lock); - return (npte); - } - pte = pmap_pte(pmap, va); - if (pte && pmap_pte_v(pte)) { - pt_entry_t pa; - vm_page_t m; - pa = PT_GET(pte); - m = PHYS_TO_VM_PAGE(pa & PG_FRAME); - printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", - va, pa, m->hold_count, m->wire_count, m->flags); - npte++; - index++; - if (index >= 2) { - index = 0; - printf("\n"); - } else { - printf(" "); - } - } - } - } - } - } - } - sx_sunlock(&allproc_lock); - return (npte); -} -#endif - -#if defined(DEBUG) - -static void pads(pmap_t pm); -void pmap_pvdump(vm_paddr_t pa); - -/* print address space of pmap*/ -static void -pads(pmap_t pm) -{ - int i, j; - vm_paddr_t va; - pt_entry_t *ptep; - - if (pm == kernel_pmap) - return; - for (i = 0; i < NPDEPTD; i++) - if (pm->pm_pdir[i]) - for (j = 0; j < NPTEPG; j++) { - va = (i << PDRSHIFT) + (j << PAGE_SHIFT); - if (pm == kernel_pmap && va < KERNBASE) - continue; - if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) - continue; - ptep = pmap_pte(pm, va); - if (pmap_pte_v(ptep)) - printf("%x:%x ", va, *ptep); - }; - -} - -void -pmap_pvdump(vm_paddr_t pa) -{ - pv_entry_t pv; - pmap_t pmap; - vm_page_t m; - - printf("pa %x", pa); - m = PHYS_TO_VM_PAGE(pa); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); - pads(pmap); - } - printf(" "); -} -#endif diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c deleted file mode 100644 index dbaa7ad..0000000 --- a/sys/i386/xen/xen_machdep.c +++ /dev/null @@ -1,1236 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004-2006,2008 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/bus.h> -#include <sys/ktr.h> -#include <sys/lock.h> -#include <sys/mount.h> -#include <sys/malloc.h> -#include <sys/mutex.h> -#include <sys/kernel.h> -#include <sys/proc.h> -#include <sys/reboot.h> -#include <sys/rwlock.h> -#include <sys/sysproto.h> -#include <sys/boot.h> - -#include <xen/xen-os.h> - -#include <vm/vm.h> -#include <vm/pmap.h> -#include <machine/segments.h> -#include <machine/pcb.h> -#include <machine/stdarg.h> -#include <machine/vmparam.h> -#include <machine/cpu.h> -#include <machine/intr_machdep.h> -#include <machine/md_var.h> -#include <machine/asmacros.h> - - - -#include <xen/hypervisor.h> -#include <xen/xenstore/xenstorevar.h> -#include <machine/xen/xenvar.h> -#include <machine/xen/xenfunc.h> -#include <machine/xen/xenpmap.h> -#include <machine/xen/xenfunc.h> -#include <xen/interface/memory.h> -#include <machine/xen/features.h> -#ifdef SMP -#include <machine/privatespace.h> -#endif - - -#include <vm/vm_page.h> - - -#define IDTVEC(name) __CONCAT(X,name) - -extern inthand_t -IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), - IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), - IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), - IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), - IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); - - -int xendebug_flags; -start_info_t *xen_start_info; -start_info_t *HYPERVISOR_start_info; -shared_info_t *HYPERVISOR_shared_info; -xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; -xen_pfn_t *xen_phys_machine; -xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; -xen_pfn_t *xen_pfn_to_mfn_frame_list_list; -int preemptable, init_first; -extern unsigned int avail_space; -int xen_vector_callback_enabled = 0; -enum xen_domain_type xen_domain_type = XEN_PV_DOMAIN; - -void ni_cli(void); -void ni_sti(void); - - -void -ni_cli(void) -{ - CTR0(KTR_SPARE2, "ni_cli disabling interrupts"); - __asm__("pushl %edx;" - "pushl %eax;" - ); - __cli(); - __asm__("popl %eax;" - "popl %edx;" - ); -} - - -void -ni_sti(void) -{ - __asm__("pushl %edx;" - "pushl %esi;" - "pushl %eax;" - ); - __sti(); - __asm__("popl %eax;" - "popl %esi;" - "popl %edx;" - ); -} - -void -force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} - -/* - * Modify the cmd_line by converting ',' to NULLs so that it is in a format - * suitable for the static env vars. - */ -char * -xen_setbootenv(char *cmd_line) -{ - char *cmd_line_next; - - /* Skip leading spaces */ - for (; *cmd_line == ' '; cmd_line++); - - xc_printf("xen_setbootenv(): cmd_line='%s'\n", cmd_line); - - for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); - return cmd_line; -} - -int -xen_boothowto(char *envp) -{ - int i, howto = 0; - - /* get equivalents from the environment */ - for (i = 0; howto_names[i].ev != NULL; i++) - if (kern_getenv(howto_names[i].ev) != NULL) - howto |= howto_names[i].mask; - return howto; -} - - -#define XPQUEUE_SIZE 128 - -struct mmu_log { - char *file; - int line; -}; - -#ifdef SMP -/* per-cpu queues and indices */ -#ifdef INVARIANTS -static struct mmu_log xpq_queue_log[XEN_LEGACY_MAX_VCPUS][XPQUEUE_SIZE]; -#endif - -static int xpq_idx[XEN_LEGACY_MAX_VCPUS]; -static mmu_update_t xpq_queue[XEN_LEGACY_MAX_VCPUS][XPQUEUE_SIZE]; - -#define XPQ_QUEUE_LOG xpq_queue_log[vcpu] -#define XPQ_QUEUE xpq_queue[vcpu] -#define XPQ_IDX xpq_idx[vcpu] -#define SET_VCPU() int vcpu = smp_processor_id() -#else - -static mmu_update_t xpq_queue[XPQUEUE_SIZE]; -#ifdef INVARIANTS -static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; -#endif -static int xpq_idx = 0; - -#define XPQ_QUEUE_LOG xpq_queue_log -#define XPQ_QUEUE xpq_queue -#define XPQ_IDX xpq_idx -#define SET_VCPU() -#endif /* !SMP */ - -#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); - -#if 0 -static void -xen_dump_queue(void) -{ - int _xpq_idx = XPQ_IDX; - int i; - - if (_xpq_idx <= 1) - return; - - xc_printf("xen_dump_queue(): %u entries\n", _xpq_idx); - for (i = 0; i < _xpq_idx; i++) { - xc_printf(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, - XPQ_QUEUE[i].ptr); - } -} -#endif - - -static __inline void -_xen_flush_queue(void) -{ - SET_VCPU(); - int _xpq_idx = XPQ_IDX; - int error, i; - -#ifdef INVARIANTS - if (__predict_true(gdtset)) - CRITICAL_ASSERT(curthread); -#endif - - XPQ_IDX = 0; - /* Make sure index is cleared first to avoid double updates. */ - error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE, - _xpq_idx, NULL, DOMID_SELF); - -#if 0 - if (__predict_true(gdtset)) - for (i = _xpq_idx; i > 0;) { - if (i >= 3) { - CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx " - "ptr: %lx val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff), - (XPQ_QUEUE[i-2].val & 0xffffffff), - (XPQ_QUEUE[i-2].ptr & 0xffffffff), - (XPQ_QUEUE[i-3].val & 0xffffffff), - (XPQ_QUEUE[i-3].ptr & 0xffffffff)); - i -= 3; - } else if (i == 2) { - CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff), - (XPQ_QUEUE[i-2].val & 0xffffffff), - (XPQ_QUEUE[i-2].ptr & 0xffffffff)); - i = 0; - } else { - CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff)); - i = 0; - } - } -#endif - if (__predict_false(error < 0)) { - for (i = 0; i < _xpq_idx; i++) - printf("val: %llx ptr: %llx\n", - XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); - panic("Failed to execute MMU updates: %d", error); - } - -} - -void -xen_flush_queue(void) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - critical_enter(); - if (XPQ_IDX != 0) _xen_flush_queue(); - if (__predict_true(gdtset)) - critical_exit(); -} - -static __inline void -xen_increment_idx(void) -{ - SET_VCPU(); - - XPQ_IDX++; - if (__predict_false(XPQ_IDX == XPQUEUE_SIZE)) - xen_flush_queue(); -} - -void -xen_check_queue(void) -{ -#ifdef INVARIANTS - SET_VCPU(); - - KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); -#endif -} - -void -xen_invlpg(vm_offset_t va) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_ALL; - op.arg1.linear_addr = va & ~PAGE_MASK; - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_load_cr3(u_int val) -{ - struct mmuext_op op; -#ifdef INVARIANTS - SET_VCPU(); - - KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); -#endif - op.cmd = MMUEXT_NEW_BASEPTR; - op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#ifdef KTR -static __inline u_int -rebp(void) -{ - u_int data; - - __asm __volatile("movl 4(%%ebp),%0" : "=r" (data)); - return (data); -} -#endif - -u_int -read_eflags(void) -{ - vcpu_info_t *_vcpu; - u_int eflags; - - eflags = _read_eflags(); - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; - if (_vcpu->evtchn_upcall_mask) - eflags &= ~PSL_I; - - return (eflags); -} - -void -write_eflags(u_int eflags) -{ - u_int intr; - - CTR2(KTR_SPARE2, "%x xen_restore_flags eflags %x", rebp(), eflags); - intr = ((eflags & PSL_I) == 0); - __restore_flags(intr); - _write_eflags(eflags); -} - -void -xen_cli(void) -{ - CTR1(KTR_SPARE2, "%x xen_cli disabling interrupts", rebp()); - __cli(); -} - -void -xen_sti(void) -{ - CTR1(KTR_SPARE2, "%x xen_sti enabling interrupts", rebp()); - __sti(); -} - -u_int -xen_rcr2(void) -{ - - return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2); -} - -void -_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - critical_enter(); - XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - XPQ_QUEUE[XPQ_IDX].val = pfn; -#ifdef INVARIANTS - XPQ_QUEUE_LOG[XPQ_IDX].file = file; - XPQ_QUEUE_LOG[XPQ_IDX].line = line; -#endif - xen_increment_idx(); - if (__predict_true(gdtset)) - critical_exit(); -} - -extern struct rwlock pvh_global_lock; - -void -_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - rw_assert(&pvh_global_lock, RA_WLOCKED); - - KASSERT((ptr & 7) == 0, ("misaligned update")); - - if (__predict_true(gdtset)) - critical_enter(); - - XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; - XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; -#ifdef INVARIANTS - XPQ_QUEUE_LOG[XPQ_IDX].file = file; - XPQ_QUEUE_LOG[XPQ_IDX].line = line; -#endif - xen_increment_idx(); - if (__predict_true(gdtset)) - critical_exit(); -} - -void -xen_pgdpt_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L3_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pgd_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L2_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pgd_unpin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pt_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L1_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pt_unpin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_set_ldt(vm_paddr_t ptr, unsigned long len) -{ - struct mmuext_op op; - op.cmd = MMUEXT_SET_LDT; - op.arg1.linear_addr = ptr; - op.arg2.nr_ents = len; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_update_descriptor(union descriptor *table, union descriptor *entry) -{ - vm_paddr_t pa; - pt_entry_t *ptp; - - ptp = vtopte((vm_offset_t)table); - pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); - if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) - panic("HYPERVISOR_update_descriptor failed\n"); -} - - -#if 0 -/* - * Bitmap is indexed by page number. If bit is set, the page is part of a - * xen_create_contiguous_region() area of memory. - */ -unsigned long *contiguous_bitmap; - -static void -contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) -{ - unsigned long start_off, end_off, curr_idx, end_idx; - - curr_idx = first_page / BITS_PER_LONG; - start_off = first_page & (BITS_PER_LONG-1); - end_idx = (first_page + nr_pages) / BITS_PER_LONG; - end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); - - if (curr_idx == end_idx) { - contiguous_bitmap[curr_idx] |= - ((1UL<<end_off)-1) & -(1UL<<start_off); - } else { - contiguous_bitmap[curr_idx] |= -(1UL<<start_off); - while ( ++curr_idx < end_idx ) - contiguous_bitmap[curr_idx] = ~0UL; - contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; - } -} - -static void -contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages) -{ - unsigned long start_off, end_off, curr_idx, end_idx; - - curr_idx = first_page / BITS_PER_LONG; - start_off = first_page & (BITS_PER_LONG-1); - end_idx = (first_page + nr_pages) / BITS_PER_LONG; - end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); - - if (curr_idx == end_idx) { - contiguous_bitmap[curr_idx] &= - -(1UL<<end_off) | ((1UL<<start_off)-1); - } else { - contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; - while ( ++curr_idx != end_idx ) - contiguous_bitmap[curr_idx] = 0; - contiguous_bitmap[curr_idx] &= -(1UL<<end_off); - } -} -#endif - -/* Ensure multi-page extents are contiguous in machine memory. */ -int -xen_create_contiguous_region(vm_page_t pages, int npages) -{ - unsigned long mfn, i, flags; - int order; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &mfn); - - balloon_lock(flags); - - /* can currently only handle power of two allocation */ - PANIC_IF(ffs(npages) != fls(npages)); - - /* 0. determine order */ - order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); - - /* 1. give away machine pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; - mfn = PFNTOMFN(pfn); - PFNTOMFN(pfn) = INVALID_P2M_ENTRY; - PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1); - } - - - /* 2. Get a new contiguous memory extent. */ - reservation.extent_order = order; - /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not - * running with a broxen driver XXXEN - */ - reservation.address_bits = 31; - if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) - goto fail; - - /* 3. Map the new extent in place of old pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; - xen_machphys_update(mfn+i, pfn); - PFNTOMFN(pfn) = mfn+i; - } - - xen_tlb_flush(); - -#if 0 - contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); -#endif - - balloon_unlock(flags); - - return 0; - - fail: - reservation.extent_order = 0; - reservation.address_bits = 0; - - for (i = 0; i < (1 << order); i++) { - int pfn; - pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; - PANIC_IF(HYPERVISOR_memory_op( - XENMEM_increase_reservation, &reservation) != 1); - xen_machphys_update(mfn, pfn); - PFNTOMFN(pfn) = mfn; - } - - xen_tlb_flush(); - - balloon_unlock(flags); - - return ENOMEM; -} - -void -xen_destroy_contiguous_region(void *addr, int npages) -{ - unsigned long mfn, i, flags, order, pfn0; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &mfn); - - pfn0 = vtophys(addr) >> PAGE_SHIFT; -#if 0 - scrub_pages(vstart, 1 << order); -#endif - /* can currently only handle power of two allocation */ - PANIC_IF(ffs(npages) != fls(npages)); - - /* 0. determine order */ - order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); - - balloon_lock(flags); - -#if 0 - contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); -#endif - - /* 1. Zap current PTEs, giving away the underlying pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - uint64_t new_val = 0; - pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; - - PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0)); - PFNTOMFN(pfn) = INVALID_P2M_ENTRY; - PANIC_IF(HYPERVISOR_memory_op( - XENMEM_decrease_reservation, &reservation) != 1); - } - - /* 2. Map new pages in place of old pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - uint64_t new_val; - pfn = pfn0 + i; - PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1); - - new_val = mfn << PAGE_SHIFT; - PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE), - new_val, PG_KERNEL)); - xen_machphys_update(mfn, pfn); - PFNTOMFN(pfn) = mfn; - } - - xen_tlb_flush(); - - balloon_unlock(flags); -} - -extern vm_offset_t proc0kstack; -extern int vm86paddr, vm86phystk; -char *bootmem_start, *bootmem_current, *bootmem_end; - -pteinfo_t *pteinfo_list; -void initvalues(start_info_t *startinfo); - -void * -bootmem_alloc(unsigned int size) -{ - char *retptr; - - retptr = bootmem_current; - PANIC_IF(retptr + size > bootmem_end); - bootmem_current += size; - - return retptr; -} - -void -bootmem_free(void *ptr, unsigned int size) -{ - char *tptr; - - tptr = ptr; - PANIC_IF(tptr != bootmem_current - size || - bootmem_current - size < bootmem_start); - - bootmem_current -= size; -} - -#if 0 -static vm_paddr_t -xpmap_mtop2(vm_paddr_t mpa) -{ - return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) - ) | (mpa & ~PG_FRAME); -} - -static pd_entry_t -xpmap_get_bootpde(vm_paddr_t va) -{ - - return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; -} - -static pd_entry_t -xpmap_get_vbootpde(vm_paddr_t va) -{ - pd_entry_t pde; - - pde = xpmap_get_bootpde(va); - if ((pde & PG_V) == 0) - return (pde & ~PG_FRAME); - return (pde & ~PG_FRAME) | - (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); -} - -static pt_entry_t 8* -xpmap_get_bootptep(vm_paddr_t va) -{ - pd_entry_t pde; - - pde = xpmap_get_vbootpde(va); - if ((pde & PG_V) == 0) - return (void *)-1; -#define PT_MASK 0x003ff000 /* page table address bits */ - return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); -} - -static pt_entry_t -xpmap_get_bootpte(vm_paddr_t va) -{ - - return xpmap_get_bootptep(va)[0]; -} -#endif - - -#ifdef ADD_ISA_HOLE -static void -shift_phys_machine(unsigned long *phys_machine, int nr_pages) -{ - - unsigned long *tmp_page, *current_page, *next_page; - int i; - - tmp_page = bootmem_alloc(PAGE_SIZE); - current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); - next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); - bcopy(phys_machine, tmp_page, PAGE_SIZE); - - while (current_page > phys_machine) { - /* save next page */ - bcopy(next_page, tmp_page, PAGE_SIZE); - /* shift down page */ - bcopy(current_page, next_page, PAGE_SIZE); - /* finish swap */ - bcopy(tmp_page, current_page, PAGE_SIZE); - - current_page -= (PAGE_SIZE/sizeof(unsigned long)); - next_page -= (PAGE_SIZE/sizeof(unsigned long)); - } - bootmem_free(tmp_page, PAGE_SIZE); - - for (i = 0; i < nr_pages; i++) { - xen_machphys_update(phys_machine[i], i); - } - memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); - -} -#endif /* ADD_ISA_HOLE */ - -/* - * Build a directory of the pages that make up our Physical to Machine - * mapping table. The Xen suspend/restore code uses this to find our - * mapping table. - */ -static void -init_frame_list_list(void *arg) -{ - unsigned long nr_pages = xen_start_info->nr_pages; -#define FPP (PAGE_SIZE/sizeof(xen_pfn_t)) - int i, j, k; - - xen_pfn_to_mfn_frame_list_list = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); - for (i = 0, j = 0, k = -1; i < nr_pages; - i += FPP, j++) { - if ((j & (FPP - 1)) == 0) { - k++; - xen_pfn_to_mfn_frame_list[k] = - malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); - xen_pfn_to_mfn_frame_list_list[k] = - VTOMFN(xen_pfn_to_mfn_frame_list[k]); - j = 0; - } - xen_pfn_to_mfn_frame_list[k][j] = - VTOMFN(&xen_phys_machine[i]); - } - - HYPERVISOR_shared_info->arch.max_pfn = nr_pages; - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list - = VTOMFN(xen_pfn_to_mfn_frame_list_list); -} -SYSINIT(init_fll, SI_SUB_DEVFS, SI_ORDER_ANY, init_frame_list_list, NULL); - -extern unsigned long physfree; - -int pdir, curoffset; -extern int nkpt; - -extern uint32_t kernbase; - -void -initvalues(start_info_t *startinfo) -{ - vm_offset_t cur_space, cur_space_pt; - struct physdev_set_iopl set_iopl; - - int l3_pages, l2_pages, l1_pages, offset; - vm_paddr_t console_page_ma, xen_store_ma; - vm_offset_t tmpva; - vm_paddr_t shinfo; -#ifdef PAE - vm_paddr_t IdlePDPTma, IdlePDPTnewma; - vm_paddr_t IdlePTDnewma[4]; - pd_entry_t *IdlePDPTnew, *IdlePTDnew; - vm_paddr_t IdlePTDma[4]; -#else - vm_paddr_t IdlePTDma[1]; -#endif - unsigned long i; - int ncpus = MAXCPU; - - nkpt = min( - min( - max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt), - NPGPTD*NPDEPG - KPTDI), - (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT); - - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); -#ifdef notyet - /* - * need to install handler - */ - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); -#endif - xen_start_info = startinfo; - HYPERVISOR_start_info = startinfo; - xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; - - IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); - l1_pages = 0; - -#ifdef PAE - l3_pages = 1; - l2_pages = 0; - IdlePDPT = (pd_entry_t *)startinfo->pt_base; - IdlePDPTma = VTOM(startinfo->pt_base); - for (i = (KERNBASE >> 30); - (i < 4) && (IdlePDPT[i] != 0); i++) - l2_pages++; - /* - * Note that only one page directory has been allocated at this point. - * Thus, if KERNBASE - */ - for (i = 0; i < l2_pages; i++) - IdlePTDma[i] = VTOM(IdlePTD + i*PAGE_SIZE); - - l2_pages = (l2_pages == 0) ? 1 : l2_pages; -#else - l3_pages = 0; - l2_pages = 1; -#endif - for (i = (((KERNBASE>>18) & PAGE_MASK)>>PAGE_SHIFT); - (i<l2_pages*NPDEPG) && (i<(VM_MAX_KERNEL_ADDRESS>>PDRSHIFT)); i++) { - - if (IdlePTD[i] == 0) - break; - l1_pages++; - } - - /* number of pages allocated after the pts + 1*/; - cur_space = xen_start_info->pt_base + - (l3_pages + l2_pages + l1_pages + 1)*PAGE_SIZE; - - xc_printf("initvalues(): wooh - availmem=%x,%x\n", avail_space, - cur_space); - - xc_printf("KERNBASE=%x,pt_base=%lx, VTOPFN(base)=%x, nr_pt_frames=%lx\n", - KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), - xen_start_info->nr_pt_frames); - xendebug_flags = 0; /* 0xffffffff; */ - -#ifdef ADD_ISA_HOLE - shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); -#endif - XENPRINTF("IdlePTD %p\n", IdlePTD); - XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%x pt_base: 0x%lx " - "mod_start: 0x%lx mod_len: 0x%lx\n", - xen_start_info->nr_pages, xen_start_info->shared_info, - xen_start_info->flags, xen_start_info->pt_base, - xen_start_info->mod_start, xen_start_info->mod_len); - -#ifdef PAE - IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; - bzero(IdlePDPTnew, PAGE_SIZE); - - IdlePDPTnewma = VTOM(IdlePDPTnew); - IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; - bzero(IdlePTDnew, 4*PAGE_SIZE); - - for (i = 0; i < 4; i++) - IdlePTDnewma[i] = VTOM((uint8_t *)IdlePTDnew + i*PAGE_SIZE); - /* - * L3 - * - * Copy the 4 machine addresses of the new PTDs in to the PDPT - * - */ - for (i = 0; i < 4; i++) - IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; - - __asm__("nop;"); - /* - * - * re-map the new PDPT read-only - */ - PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); - /* - * - * Unpin the current PDPT - */ - xen_pt_unpin(IdlePDPTma); - -#endif /* PAE */ - - /* Map proc0's KSTACK */ - proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); - xc_printf("proc0kstack=%u\n", proc0kstack); - - /* vm86/bios stack */ - cur_space += PAGE_SIZE; - - /* Map space for the vm86 region */ - vm86paddr = (vm_offset_t)cur_space; - cur_space += (PAGE_SIZE * 3); - - /* allocate 4 pages for bootmem allocator */ - bootmem_start = bootmem_current = (char *)cur_space; - cur_space += (4 * PAGE_SIZE); - bootmem_end = (char *)cur_space; - - /* allocate pages for gdt */ - gdt = (union descriptor *)cur_space; - cur_space += PAGE_SIZE*ncpus; - - /* allocate page for ldt */ - ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; - cur_space += PAGE_SIZE; - - /* unmap remaining pages from initial chunk - * - */ - for (tmpva = cur_space; tmpva < (((uint32_t)&kernbase) + (l1_pages<<PDRSHIFT)); - tmpva += PAGE_SIZE) { - bzero((char *)tmpva, PAGE_SIZE); - PT_SET_MA(tmpva, (vm_paddr_t)0); - } - - PT_UPDATES_FLUSH(); - - memcpy(((uint8_t *)IdlePTDnew) + ((unsigned int)(KERNBASE >> 18)), - ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK), - l1_pages*sizeof(pt_entry_t)); - - for (i = 0; i < 4; i++) { - PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE, - IdlePTDnewma[i] | PG_V); - } - xen_load_cr3(VTOP(IdlePDPTnew)); - xen_pgdpt_pin(VTOM(IdlePDPTnew)); - - /* allocate remainder of nkpt pages */ - cur_space_pt = cur_space; - for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt; - i++, cur_space += PAGE_SIZE) { - pdir = (offset + i) / NPDEPG; - curoffset = ((offset + i) % NPDEPG); - if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS) - break; - - /* - * make sure that all the initial page table pages - * have been zeroed - */ - PT_SET_MA(cur_space, VTOM(cur_space) | PG_V | PG_RW); - bzero((char *)cur_space, PAGE_SIZE); - PT_SET_MA(cur_space, (vm_paddr_t)0); - xen_pt_pin(VTOM(cur_space)); - xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + - curoffset*sizeof(vm_paddr_t)), - VTOM(cur_space) | PG_KERNEL); - PT_UPDATES_FLUSH(); - } - - for (i = 0; i < 4; i++) { - pdir = (PTDPTDI + i) / NPDEPG; - curoffset = (PTDPTDI + i) % NPDEPG; - - xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + - curoffset*sizeof(vm_paddr_t)), - IdlePTDnewma[i] | PG_V); - } - - PT_UPDATES_FLUSH(); - - IdlePTD = IdlePTDnew; - IdlePDPT = IdlePDPTnew; - IdlePDPTma = IdlePDPTnewma; - - HYPERVISOR_shared_info = (shared_info_t *)cur_space; - cur_space += PAGE_SIZE; - - xen_store = (struct xenstore_domain_interface *)cur_space; - cur_space += PAGE_SIZE; - - console_page = (char *)cur_space; - cur_space += PAGE_SIZE; - - /* - * shared_info is an unsigned long so this will randomly break if - * it is allocated above 4GB - I guess people are used to that - * sort of thing with Xen ... sigh - */ - shinfo = xen_start_info->shared_info; - PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL); - - xc_printf("#4\n"); - - xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT); - PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL); - console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT); - PT_SET_MA(console_page, console_page_ma | PG_KERNEL); - - xc_printf("#5\n"); - - set_iopl.iopl = 1; - PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl)); - xc_printf("#6\n"); -#if 0 - /* add page table for KERNBASE */ - xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_KERNEL); - xen_flush_queue(); -#ifdef PAE - xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_V | PG_A); -#else - xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_V | PG_A); -#endif - xen_flush_queue(); - cur_space += PAGE_SIZE; - xc_printf("#6\n"); -#endif /* 0 */ -#ifdef notyet - if (xen_start_info->flags & SIF_INITDOMAIN) { - /* Map first megabyte */ - for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE) - PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD); - xen_flush_queue(); - } -#endif - /* - * re-map kernel text read-only - * - */ - for (i = (((vm_offset_t)&btext) & ~PAGE_MASK); - i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE) - PT_SET_MA(i, VTOM(i) | PG_V | PG_A); - - xc_printf("#7\n"); - physfree = VTOP(cur_space); - init_first = physfree >> PAGE_SHIFT; - IdlePTD = (pd_entry_t *)VTOP(IdlePTD); - IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT); - setup_xen_features(); - xc_printf("#8, proc0kstack=%u\n", proc0kstack); -} - - -trap_info_t trap_table[] = { - { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, - { 1, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, - { 3, 3|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, - { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, - /* This is UPL on Linux and KPL on BSD */ - { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, - { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, - { 7, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, - /* - * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)}, - * no handler for double fault - */ - { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, - {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, - {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, - {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, - {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, - {14, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, - {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, - {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, - {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, - {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)}, - {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, - {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, - { 0, 0, 0, 0 } -}; - -/* Perform a multicall and check that individual calls succeeded. */ -int -HYPERVISOR_multicall(struct multicall_entry * call_list, int nr_calls) -{ - int ret = 0; - int i; - - /* Perform the multicall. */ - PANIC_IF(_HYPERVISOR_multicall(call_list, nr_calls)); - - /* Check the results of individual hypercalls. */ - for (i = 0; i < nr_calls; i++) - if (__predict_false(call_list[i].result < 0)) - ret++; - if (__predict_false(ret > 0)) - panic("%d multicall(s) failed: cpu %d\n", - ret, smp_processor_id()); - - /* If we didn't panic already, everything succeeded. */ - return (0); -} - -/********** CODE WORTH KEEPING ABOVE HERE *****************/ - -void xen_failsafe_handler(void); - -void -xen_failsafe_handler(void) -{ - - panic("xen_failsafe_handler called!\n"); -} - -void xen_handle_thread_switch(struct pcb *pcb); - -/* This is called by cpu_switch() when switching threads. */ -/* The pcb arg refers to the process control block of the */ -/* next thread which is to run */ -void -xen_handle_thread_switch(struct pcb *pcb) -{ - uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0]; - uint32_t *b = (uint32_t *)&pcb->pcb_fsd; - multicall_entry_t mcl[3]; - int i = 0; - - /* Notify Xen of task switch */ - mcl[i].op = __HYPERVISOR_stack_switch; - mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL); - mcl[i++].args[1] = (unsigned long)pcb; - - /* Check for update of fsd */ - if (*a != *b || *(a+1) != *(b+1)) { - mcl[i].op = __HYPERVISOR_update_descriptor; - *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); - *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; - } - - a += 2; - b += 2; - - /* Check for update of gsd */ - if (*a != *b || *(a+1) != *(b+1)) { - mcl[i].op = __HYPERVISOR_update_descriptor; - *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); - *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; - } - - (void)HYPERVISOR_multicall(mcl, i); -} diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 39e4df3..3ff3440 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -1238,12 +1238,14 @@ __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) coresize = round_page(hdrsize + notesz) + seginfo.size; #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_CORE, coresize); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = EFAULT; - goto done; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_CORE, coresize); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = EFAULT; + goto done; + } } #endif if (coresize >= limit) { diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 5290957..329f418 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -857,13 +857,15 @@ do_dup(struct thread *td, int flags, int old, int new) * the limit on the size of the file descriptor table. */ #ifdef RACCT - PROC_LOCK(p); - error = racct_set(p, RACCT_NOFILE, new + 1); - PROC_UNLOCK(p); - if (error != 0) { - FILEDESC_XUNLOCK(fdp); - fdrop(fp, td); - return (EMFILE); + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, new + 1); + PROC_UNLOCK(p); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (EMFILE); + } } #endif fdgrowtable_exp(fdp, new + 1); @@ -1609,7 +1611,7 @@ fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; - int fd = -1, maxfd, allocfd; + int fd, maxfd, allocfd; #ifdef RACCT int error; #endif @@ -1631,11 +1633,13 @@ fdalloc(struct thread *td, int minfd, int *result) if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT - PROC_LOCK(p); - error = racct_set(p, RACCT_NOFILE, allocfd); - PROC_UNLOCK(p); - if (error != 0) - return (EMFILE); + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, allocfd); + PROC_UNLOCK(p); + if (error != 0) + return (EMFILE); + } #endif /* * fd is already equal to first free descriptor >= minfd, so @@ -2042,9 +2046,11 @@ fdescfree(struct thread *td) MPASS(fdp != NULL); #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set(td->td_proc, RACCT_NOFILE, 0); - PROC_UNLOCK(td->td_proc); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set(td->td_proc, RACCT_NOFILE, 0); + PROC_UNLOCK(td->td_proc); + } #endif if (td->td_proc->p_fdtol != NULL) diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index ecc2651..9d893f8 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1060,7 +1060,7 @@ exec_new_vmspace(imgp, sv) /* Allocate a new stack */ if (imgp->stack_sz != 0) { - ssiz = imgp->stack_sz; + ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 4d23fc0..0a601a1 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -907,9 +907,11 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options) * Destroy resource accounting information associated with the process. */ #ifdef RACCT - PROC_LOCK(p); - racct_sub(p, RACCT_NPROC, 1); - PROC_UNLOCK(p); + if (racct_enable) { + PROC_LOCK(p); + racct_sub(p, RACCT_NPROC, 1); + PROC_UNLOCK(p); + } #endif racct_proc_exit(p); diff --git a/sys/kern/kern_gzio.c b/sys/kern/kern_gzio.c index a4974a7..cee21f0 100644 --- a/sys/kern/kern_gzio.c +++ b/sys/kern/kern_gzio.c @@ -32,8 +32,7 @@ __FBSDID("$FreeBSD$"); #include <sys/gzio.h> #include <sys/kernel.h> #include <sys/malloc.h> - -#include <net/zutil.h> +#include <sys/zutil.h> #define KERN_GZ_HDRLEN 10 /* gzip header length */ #define KERN_GZ_TRAILERLEN 8 /* gzip trailer length */ diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 7a5d936..a84019a 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -1455,12 +1455,7 @@ intr_event_handle(struct intr_event *ie, struct trapframe *frame) /* Schedule the ithread if needed. */ if (thread) { error = intr_event_schedule_thread(ie); -#ifndef XEN KASSERT(error == 0, ("bad stray interrupt")); -#else - if (error != 0) - log(LOG_WARNING, "bad stray interrupt"); -#endif } critical_exit(); td->td_intr_nesting_level--; diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 5b23129..c87c4e2 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1778,7 +1778,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) mtx_unlock(&pr->pr_mtx); #ifdef RACCT - if (created) + if (racct_enable && created) prison_racct_attach(pr); #endif @@ -1862,7 +1862,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } #ifdef RACCT - if (!created) { + if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); @@ -2652,7 +2652,8 @@ prison_deref(struct prison *pr, int flags) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT - prison_racct_detach(pr); + if (racct_enable) + prison_racct_detach(pr); #endif free(pr, M_PRISON); @@ -4460,12 +4461,15 @@ SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); +#ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); + sx_slock(&allprison_lock); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); @@ -4477,6 +4481,7 @@ prison_racct_find_locked(const char *name) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) @@ -4507,6 +4512,8 @@ prison_racct_find(const char *name) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); + sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); @@ -4517,6 +4524,8 @@ void prison_racct_hold(struct prison_racct *prr) { + ASSERT_RACCT_ENABLED(); + refcount_acquire(&prr->prr_refcount); } @@ -4524,6 +4533,7 @@ static void prison_racct_free_locked(struct prison_racct *prr) { + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { @@ -4538,6 +4548,7 @@ prison_racct_free(struct prison_racct *prr) { int old; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; @@ -4549,12 +4560,12 @@ prison_racct_free(struct prison_racct *prr) sx_xunlock(&allprison_lock); } -#ifdef RACCT static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); @@ -4574,6 +4585,8 @@ prison_racct_modify(struct prison *pr) struct ucred *cred; struct prison_racct *oldprr; + ASSERT_RACCT_ENABLED(); + sx_slock(&allproc_lock); sx_xlock(&allprison_lock); @@ -4613,6 +4626,7 @@ static void prison_racct_detach(struct prison *pr) { + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 505521d..6618c08 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -2822,7 +2822,7 @@ static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, - CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, + CTLFLAG_RW | CTLFLAG_CAPWR | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE, diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 05becea..5cee140 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -70,8 +70,15 @@ FEATURE(racct, "Resource Accounting"); * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; +#ifdef RACCT_DISABLED +int racct_enable = 0; +#else +int racct_enable = 1; +#endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); +SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, + 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); @@ -313,6 +320,8 @@ racct_getpcpu(struct proc *p, u_int pcpu) fixpt_t p_pctcpu; struct thread *td; + ASSERT_RACCT_ENABLED(); + /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. @@ -377,6 +386,7 @@ racct_add_racct(struct racct *dest, const struct racct *src) { int i; + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* @@ -398,6 +408,7 @@ racct_sub_racct(struct racct *dest, const struct racct *src) { int i; + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* @@ -431,6 +442,9 @@ void racct_create(struct racct **racctp) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); KASSERT(*racctp == NULL, ("racct already allocated")); @@ -444,6 +458,8 @@ racct_destroy_locked(struct racct **racctp) int i; struct racct *racct; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); mtx_assert(&racct_lock, MA_OWNED); @@ -470,6 +486,9 @@ void racct_destroy(struct racct **racct) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_destroy_locked(racct); mtx_unlock(&racct_lock); @@ -485,6 +504,7 @@ racct_alloc_resource(struct racct *racct, int resource, uint64_t amount) { + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racct != NULL, ("NULL racct")); @@ -516,6 +536,8 @@ racct_add_locked(struct proc *p, int resource, uint64_t amount) int error; #endif + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); /* @@ -546,6 +568,9 @@ racct_add(struct proc *p, int resource, uint64_t amount) { int error; + if (!racct_enable) + return (0); + mtx_lock(&racct_lock); error = racct_add_locked(p, resource, amount); mtx_unlock(&racct_lock); @@ -557,6 +582,8 @@ racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount, 0, 0); @@ -577,6 +604,9 @@ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_add_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); @@ -590,6 +620,9 @@ void racct_add_force(struct proc *p, int resource, uint64_t amount) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0); /* @@ -612,6 +645,8 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount) int error; #endif + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); /* @@ -671,6 +706,9 @@ racct_set(struct proc *p, int resource, uint64_t amount) { int error; + if (!racct_enable) + return (0); + mtx_lock(&racct_lock); error = racct_set_locked(p, resource, amount); mtx_unlock(&racct_lock); @@ -683,6 +721,8 @@ racct_set_force_locked(struct proc *p, int resource, uint64_t amount) int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); /* @@ -717,6 +757,10 @@ racct_set_force_locked(struct proc *p, int resource, uint64_t amount) void racct_set_force(struct proc *p, int resource, uint64_t amount) { + + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_set_force_locked(p, resource, amount); mtx_unlock(&racct_lock); @@ -732,6 +776,9 @@ uint64_t racct_get_limit(struct proc *p, int resource) { + if (!racct_enable) + return (UINT64_MAX); + #ifdef RCTL return (rctl_get_limit(p, resource)); #else @@ -749,6 +796,9 @@ uint64_t racct_get_available(struct proc *p, int resource) { + if (!racct_enable) + return (UINT64_MAX); + #ifdef RCTL return (rctl_get_available(p, resource)); #else @@ -765,6 +815,8 @@ static int64_t racct_pcpu_available(struct proc *p) { + ASSERT_RACCT_ENABLED(); + #ifdef RCTL return (rctl_pcpu_available(p)); #else @@ -779,6 +831,9 @@ void racct_sub(struct proc *p, int resource, uint64_t amount) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); /* @@ -804,6 +859,8 @@ racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount, 0, 0); @@ -827,6 +884,9 @@ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_sub_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); @@ -840,6 +900,9 @@ racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; + if (!racct_enable) + return (0); + /* * Create racct for the child process. */ @@ -896,6 +959,9 @@ racct_proc_fork_done(struct proc *child) { #ifdef RCTL + if (!racct_enable) + return; + PROC_LOCK(child); mtx_lock(&racct_lock); rctl_enforce(child, RACCT_NPROC, 0); @@ -913,6 +979,9 @@ racct_proc_exit(struct proc *p) struct timeval wallclock; uint64_t pct_estimate, pct; + if (!racct_enable) + return; + PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. @@ -967,6 +1036,9 @@ racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; + if (!racct_enable) + return; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; @@ -1004,6 +1076,8 @@ void racct_move(struct racct *dest, struct racct *src) { + ASSERT_RACCT_ENABLED(); + mtx_lock(&racct_lock); racct_add_racct(dest, src); @@ -1020,6 +1094,7 @@ racct_proc_throttle(struct proc *p) int cpuid; #endif + ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* @@ -1065,6 +1140,9 @@ racct_proc_throttle(struct proc *p) static void racct_proc_wakeup(struct proc *p) { + + ASSERT_RACCT_ENABLED(); + PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled) { @@ -1079,6 +1157,8 @@ racct_decay_resource(struct racct *racct, void * res, void* dummy) int resource; int64_t r_old, r_new; + ASSERT_RACCT_ENABLED(); + resource = *(int *)res; r_old = racct->r_resources[resource]; @@ -1095,6 +1175,9 @@ racct_decay_resource(struct racct *racct, void * res, void* dummy) static void racct_decay(int resource) { + + ASSERT_RACCT_ENABLED(); + ui_racct_foreach(racct_decay_resource, &resource, NULL); loginclass_racct_foreach(racct_decay_resource, &resource, NULL); prison_racct_foreach(racct_decay_resource, &resource, NULL); @@ -1109,6 +1192,8 @@ racctd(void) uint64_t runtime; uint64_t pct, pct_estimate; + ASSERT_RACCT_ENABLED(); + for (;;) { racct_decay(RACCT_PCTCPU); @@ -1188,11 +1273,22 @@ static struct kproc_desc racctd_kp = { racctd, NULL }; -SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); + +static void +racctd_init(void) +{ + if (!racct_enable) + return; + + kproc_start(&racctd_kp); +} +SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { + if (!racct_enable) + return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 934327a..c43b83d 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -225,6 +225,7 @@ rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) int64_t available = INT64_MAX; struct ucred *cred = p->p_ucred; + ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); resource = rule->rr_resource; @@ -264,6 +265,8 @@ rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule, { int64_t available; + ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); available = rctl_available_resource(p, rule); @@ -283,6 +286,8 @@ rctl_pcpu_available(const struct proc *p) { struct rctl_rule_link *link; int64_t available, minavailable, limit; + ASSERT_RACCT_ENABLED(); + minavailable = INT64_MAX; limit = 0; @@ -334,6 +339,8 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount) static int curtime = 0; static struct timeval lasttime; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* @@ -457,6 +464,8 @@ rctl_get_limit(struct proc *p, int resource) struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* @@ -487,6 +496,8 @@ rctl_get_available(struct proc *p, int resource) minavailable = INT64_MAX; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* @@ -521,6 +532,8 @@ static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { + ASSERT_RACCT_ENABLED(); + if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); @@ -635,6 +648,7 @@ rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); @@ -652,6 +666,7 @@ rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rw_assert(&rctl_lock, RA_WLOCKED); @@ -678,6 +693,7 @@ rctl_racct_remove_rules(struct racct *racct, int removed = 0; struct rctl_rule_link *link, *linktmp; + ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_WLOCKED); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { @@ -696,6 +712,8 @@ static void rctl_rule_acquire_subject(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: @@ -722,6 +740,8 @@ static void rctl_rule_release_subject(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: @@ -749,6 +769,8 @@ rctl_rule_alloc(int flags) { struct rctl_rule *rule; + ASSERT_RACCT_ENABLED(); + rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); @@ -771,6 +793,8 @@ rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; + ASSERT_RACCT_ENABLED(); + copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); @@ -793,6 +817,7 @@ void rctl_rule_acquire(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); @@ -805,6 +830,7 @@ rctl_rule_free(void *context, int pending) rule = (struct rctl_rule *)context; + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* @@ -819,6 +845,7 @@ void rctl_rule_release(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { @@ -838,6 +865,8 @@ static int rctl_rule_fully_specified(const struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); @@ -882,6 +911,8 @@ rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) struct rctl_rule *rule; id_t id; + ASSERT_RACCT_ENABLED(); + rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); @@ -1008,6 +1039,7 @@ rctl_rule_add(struct rctl_rule *rule) struct rctl_rule *rule2; int match; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* @@ -1118,6 +1150,8 @@ rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; + ASSERT_RACCT_ENABLED(); + rw_wlock(&rctl_lock); found += rctl_racct_remove_rules(racct, filter); rw_wunlock(&rctl_lock); @@ -1134,6 +1168,8 @@ rctl_rule_remove(struct rctl_rule *filter) int found = 0; struct proc *p; + ASSERT_RACCT_ENABLED(); + if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; @@ -1172,6 +1208,8 @@ rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; + ASSERT_RACCT_ENABLED(); + sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { @@ -1231,6 +1269,8 @@ rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) int error; char *str; + ASSERT_RACCT_ENABLED(); + if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFLEN) @@ -1256,6 +1296,8 @@ rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; + ASSERT_RACCT_ENABLED(); + if (outputsbuf == NULL) return (0); @@ -1277,6 +1319,8 @@ rctl_racct_to_sbuf(struct racct *racct, int sloppy) int64_t amount; struct sbuf *sb; + ASSERT_RACCT_ENABLED(); + sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) @@ -1302,6 +1346,9 @@ sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) struct loginclass *lc; struct prison_racct *prr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); @@ -1372,6 +1419,8 @@ rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) @@ -1393,6 +1442,9 @@ sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) struct rctl_rule_link *link; struct proc *p; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); @@ -1467,6 +1519,9 @@ sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) struct rctl_rule *filter; struct rctl_rule_link *link; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); @@ -1538,6 +1593,9 @@ sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) struct rctl_rule *rule; char *inputstr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); @@ -1580,6 +1638,9 @@ sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) struct rctl_rule *filter; char *inputstr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); @@ -1616,6 +1677,8 @@ rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) struct prison_racct *newprr; LIST_HEAD(, rctl_rule_link) newrules; + ASSERT_RACCT_ENABLED(); + newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; @@ -1756,6 +1819,7 @@ rctl_proc_fork(struct proc *parent, struct proc *child) LIST_INIT(&child->p_racct->r_rule_links); + ASSERT_RACCT_ENABLED(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); rw_wlock(&rctl_lock); @@ -1809,6 +1873,8 @@ rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); + rw_wlock(&rctl_lock); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); @@ -1823,6 +1889,9 @@ static void rctl_init(void) { + if (!racct_enable) + return; + rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index e547c5f..998ee15 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -154,7 +154,6 @@ static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); -static void vpanic(const char *fmt, va_list ap) __dead2; /* register various local shutdown events */ static void @@ -676,7 +675,7 @@ panic(const char *fmt, ...) vpanic(fmt, ap); } -static void +void vpanic(const char *fmt, va_list ap) { #ifdef SMP diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index ad10dc7..a238a09 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -66,12 +66,6 @@ __FBSDID("$FreeBSD$"); #include <machine/cpu.h> -#ifdef XEN -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/pmap.h> -#endif - #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ @@ -475,9 +469,6 @@ mi_switch(int flags, struct thread *newtd) "lockname:\"%s\"", td->td_lockname); #endif SDT_PROBE0(sched, , , preempt); -#ifdef XEN - PT_UPDATES_FLUSH(); -#endif sched_switch(td, newtd, flags); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 280bc0b..6911bb97 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -187,11 +187,13 @@ create_thread(struct thread *td, mcontext_t *ctx, } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(p, RACCT_NTHR, 1); - PROC_UNLOCK(td->td_proc); - if (error != 0) - return (EPROCLIM); + if (racct_enable) { + PROC_LOCK(p); + error = racct_add(p, RACCT_NTHR, 1); + PROC_UNLOCK(p); + if (error != 0) + return (EPROCLIM); + } #endif /* Initialize our td */ @@ -250,9 +252,9 @@ create_thread(struct thread *td, mcontext_t *ctx, } } - PROC_LOCK(td->td_proc); - td->td_proc->p_flag |= P_HADTHREADS; - thread_link(newtd, p); + PROC_LOCK(p); + p->p_flag |= P_HADTHREADS; + thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ @@ -280,9 +282,11 @@ create_thread(struct thread *td, mcontext_t *ctx, fail: #ifdef RACCT - PROC_LOCK(p); - racct_sub(p, RACCT_NTHR, 1); - PROC_UNLOCK(p); + if (racct_enable) { + PROC_LOCK(p); + racct_sub(p, RACCT_NTHR, 1); + PROC_UNLOCK(p); + } #endif return (error); } diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 65c8276..26be035 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -66,7 +66,7 @@ __FBSDID("$FreeBSD$"); #include <sys/link_elf.h> #ifdef DDB_CTF -#include <net/zlib.h> +#include <sys/zlib.h> #endif #include "linker_if.h" diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c index 5c107fe..021381d 100644 --- a/sys/kern/link_elf_obj.c +++ b/sys/kern/link_elf_obj.c @@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$"); #include <sys/link_elf.h> #ifdef DDB_CTF -#include <net/zlib.h> +#include <sys/zlib.h> #endif #include "linker_if.h" diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 3e39d55..59bd387 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -1585,7 +1585,7 @@ sched_pctcpu(struct thread *td) return (ts->ts_pctcpu); } -#ifdef RACCT +#ifdef RACCT /* * Calculates the contribution to the thread cpu usage for the latest * (unfinished) second. diff --git a/sys/kern/subr_dnvlist.c b/sys/kern/subr_dnvlist.c index fe17684..9058520 100644 --- a/sys/kern/subr_dnvlist.c +++ b/sys/kern/subr_dnvlist.c @@ -89,89 +89,6 @@ dnvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep, return (value); } -#ifndef _KERNEL -#define DNVLIST_GETF(ftype, type) \ -ftype \ -dnvlist_getf_##type(const nvlist_t *nvl, ftype defval, \ - const char *namefmt, ...) \ -{ \ - va_list nameap; \ - ftype value; \ - \ - va_start(nameap, namefmt); \ - value = dnvlist_getv_##type(nvl, defval, namefmt, nameap); \ - va_end(nameap); \ - \ - return (value); \ -} - -DNVLIST_GETF(bool, bool) -DNVLIST_GETF(uint64_t, number) -DNVLIST_GETF(const char *, string) -DNVLIST_GETF(const nvlist_t *, nvlist) -DNVLIST_GETF(int, descriptor) - -#undef DNVLIST_GETF - -const void * -dnvlist_getf_binary(const nvlist_t *nvl, size_t *sizep, const void *defval, - size_t defsize, const char *namefmt, ...) -{ - va_list nameap; - const void *value; - - va_start(nameap, namefmt); - value = dnvlist_getv_binary(nvl, sizep, defval, defsize, namefmt, - nameap); - va_end(nameap); - - return (value); -} - -#define DNVLIST_GETV(ftype, type) \ -ftype \ -dnvlist_getv_##type(const nvlist_t *nvl, ftype defval, \ - const char *namefmt, va_list nameap) \ -{ \ - char *name; \ - ftype value; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - return (defval); \ - value = dnvlist_get_##type(nvl, name, defval); \ - free(name); \ - return (value); \ -} - -DNVLIST_GETV(bool, bool) -DNVLIST_GETV(uint64_t, number) -DNVLIST_GETV(const char *, string) -DNVLIST_GETV(const nvlist_t *, nvlist) -DNVLIST_GETV(int, descriptor) - -#undef DNVLIST_GETV - -const void * -dnvlist_getv_binary(const nvlist_t *nvl, size_t *sizep, const void *defval, - size_t defsize, const char *namefmt, va_list nameap) -{ - char *name; - const void *value; - - nv_vasprintf(&name, namefmt, nameap); - if (name != NULL) { - value = dnvlist_get_binary(nvl, name, sizep, defval, defsize); - nv_free(name); - } else { - if (sizep != NULL) - *sizep = defsize; - value = defval; - } - return (value); -} -#endif - #define DNVLIST_TAKE(ftype, type) \ ftype \ dnvlist_take_##type(nvlist_t *nvl, const char *name, ftype defval) \ @@ -209,86 +126,3 @@ dnvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep, return (value); } -#ifndef _KERNEL -#define DNVLIST_TAKEF(ftype, type) \ -ftype \ -dnvlist_takef_##type(nvlist_t *nvl, ftype defval, \ - const char *namefmt, ...) \ -{ \ - va_list nameap; \ - ftype value; \ - \ - va_start(nameap, namefmt); \ - value = dnvlist_takev_##type(nvl, defval, namefmt, nameap); \ - va_end(nameap); \ - \ - return (value); \ -} - -DNVLIST_TAKEF(bool, bool) -DNVLIST_TAKEF(uint64_t, number) -DNVLIST_TAKEF(char *, string) -DNVLIST_TAKEF(nvlist_t *, nvlist) -DNVLIST_TAKEF(int, descriptor) - -#undef DNVLIST_TAKEF - -void * -dnvlist_takef_binary(nvlist_t *nvl, size_t *sizep, void *defval, - size_t defsize, const char *namefmt, ...) -{ - va_list nameap; - void *value; - - va_start(nameap, namefmt); - value = dnvlist_takev_binary(nvl, sizep, defval, defsize, namefmt, - nameap); - va_end(nameap); - - return (value); -} - -#define DNVLIST_TAKEV(ftype, type) \ -ftype \ -dnvlist_takev_##type(nvlist_t *nvl, ftype defval, const char *namefmt, \ - va_list nameap) \ -{ \ - char *name; \ - ftype value; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - return (defval); \ - value = dnvlist_take_##type(nvl, name, defval); \ - free(name); \ - return (value); \ -} - -DNVLIST_TAKEV(bool, bool) -DNVLIST_TAKEV(uint64_t, number) -DNVLIST_TAKEV(char *, string) -DNVLIST_TAKEV(nvlist_t *, nvlist) -DNVLIST_TAKEV(int, descriptor) - -#undef DNVLIST_TAKEV - -void * -dnvlist_takev_binary(nvlist_t *nvl, size_t *sizep, void *defval, - size_t defsize, const char *namefmt, va_list nameap) -{ - char *name; - void *value; - - nv_vasprintf(&name, namefmt, nameap); - if (name != NULL) { - value = dnvlist_take_binary(nvl, name, sizep, defval, defsize); - nv_free(name); - } else { - if (sizep != NULL) - *sizep = defsize; - value = defval; - } - - return (value); -} -#endif diff --git a/sys/kern/subr_nvlist.c b/sys/kern/subr_nvlist.c index f352c62..f96b890 100644 --- a/sys/kern/subr_nvlist.c +++ b/sys/kern/subr_nvlist.c @@ -142,12 +142,11 @@ void nvlist_destroy(nvlist_t *nvl) { nvpair_t *nvp; - int serrno; if (nvl == NULL) return; - SAVE_ERRNO(serrno); + ERRNO_SAVE(); NVLIST_ASSERT(nvl); @@ -158,7 +157,7 @@ nvlist_destroy(nvlist_t *nvl) nvl->nvl_magic = 0; nv_free(nvl); - RESTORE_ERRNO(serrno); + ERRNO_RESTORE(); } void @@ -231,6 +230,17 @@ nvlist_empty(const nvlist_t *nvl) return (nvlist_first_nvpair(nvl) == NULL); } +int +nvlist_flags(const nvlist_t *nvl) +{ + + NVLIST_ASSERT(nvl); + PJDLOG_ASSERT(nvl->nvl_error == 0); + PJDLOG_ASSERT((nvl->nvl_flags & ~(NV_FLAG_PUBLIC_MASK)) == 0); + + return (nvl->nvl_flags); +} + static void nvlist_report_missing(int type, const char *name) { @@ -264,7 +274,7 @@ nvlist_find(const nvlist_t *nvl, int type, const char *name) } if (nvp == NULL) - RESTORE_ERRNO(ENOENT); + ERRNO_SET(ENOENT); return (nvp); } @@ -281,37 +291,6 @@ nvlist_exists_type(const nvlist_t *nvl, const char *name, int type) return (nvlist_find(nvl, type, name) != NULL); } -#ifndef _KERNEL -bool -nvlist_existsf_type(const nvlist_t *nvl, int type, const char *namefmt, ...) -{ - va_list nameap; - bool ret; - - va_start(nameap, namefmt); - ret = nvlist_existsv_type(nvl, type, namefmt, nameap); - va_end(nameap); - - return (ret); -} - -bool -nvlist_existsv_type(const nvlist_t *nvl, int type, const char *namefmt, - va_list nameap) -{ - char *name; - bool exists; - - nv_vasprintf(&name, namefmt, nameap); - if (name == NULL) - return (false); - - exists = nvlist_exists_type(nvl, name, type); - nv_free(name); - return (exists); -} -#endif - void nvlist_free_type(nvlist_t *nvl, const char *name, int type) { @@ -329,30 +308,6 @@ nvlist_free_type(nvlist_t *nvl, const char *name, int type) nvlist_report_missing(type, name); } -#ifndef _KERNEL -void -nvlist_freef_type(nvlist_t *nvl, int type, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_freev_type(nvl, type, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_freev_type(nvlist_t *nvl, int type, const char *namefmt, va_list nameap) -{ - char *name; - - nv_vasprintf(&name, namefmt, nameap); - if (name == NULL) - nvlist_report_missing(type, "<unknown>"); - nvlist_free_type(nvl, name, type); - nv_free(name); -} -#endif - nvlist_t * nvlist_clone(const nvlist_t *nvl) { @@ -362,7 +317,7 @@ nvlist_clone(const nvlist_t *nvl) NVLIST_ASSERT(nvl); if (nvl->nvl_error != 0) { - RESTORE_ERRNO(nvl->nvl_error); + ERRNO_SET(nvl->nvl_error); return (NULL); } @@ -533,27 +488,30 @@ out: #ifndef _KERNEL static int * -nvlist_xdescriptors(const nvlist_t *nvl, int *descs, int level) +nvlist_xdescriptors(const nvlist_t *nvl, int *descs) { - const nvpair_t *nvp; + nvpair_t *nvp; + const char *name; + int type; NVLIST_ASSERT(nvl); PJDLOG_ASSERT(nvl->nvl_error == 0); - PJDLOG_ASSERT(level < 3); - for (nvp = nvlist_first_nvpair(nvl); nvp != NULL; - nvp = nvlist_next_nvpair(nvl, nvp)) { - switch (nvpair_type(nvp)) { - case NV_TYPE_DESCRIPTOR: - *descs = nvpair_get_descriptor(nvp); - descs++; - break; - case NV_TYPE_NVLIST: - descs = nvlist_xdescriptors(nvpair_get_nvlist(nvp), - descs, level + 1); - break; + nvp = NULL; + do { + while ((name = nvlist_next(nvl, &type, (void**)&nvp)) != NULL) { + switch (type) { + case NV_TYPE_DESCRIPTOR: + *descs = nvpair_get_descriptor(nvp); + descs++; + break; + case NV_TYPE_NVLIST: + nvl = nvpair_get_nvlist(nvp); + nvp = NULL; + break; + } } - } + } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL); return (descs); } @@ -571,7 +529,7 @@ nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp) if (fds == NULL) return (NULL); if (nitems > 0) - nvlist_xdescriptors(nvl, fds, 0); + nvlist_xdescriptors(nvl, fds); fds[nitems] = -1; if (nitemsp != NULL) *nitemsp = nitems; @@ -579,30 +537,33 @@ nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp) } #endif -static size_t -nvlist_xndescriptors(const nvlist_t *nvl, int level) +size_t +nvlist_ndescriptors(const nvlist_t *nvl) { #ifndef _KERNEL - const nvpair_t *nvp; + nvpair_t *nvp; + const char *name; size_t ndescs; + int type; NVLIST_ASSERT(nvl); PJDLOG_ASSERT(nvl->nvl_error == 0); - PJDLOG_ASSERT(level < 3); ndescs = 0; - for (nvp = nvlist_first_nvpair(nvl); nvp != NULL; - nvp = nvlist_next_nvpair(nvl, nvp)) { - switch (nvpair_type(nvp)) { - case NV_TYPE_DESCRIPTOR: - ndescs++; - break; - case NV_TYPE_NVLIST: - ndescs += nvlist_xndescriptors(nvpair_get_nvlist(nvp), - level + 1); - break; + nvp = NULL; + do { + while ((name = nvlist_next(nvl, &type, (void**)&nvp)) != NULL) { + switch (type) { + case NV_TYPE_DESCRIPTOR: + ndescs++; + break; + case NV_TYPE_NVLIST: + nvl = nvpair_get_nvlist(nvp); + nvp = NULL; + break; + } } - } + } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL); return (ndescs); #else @@ -610,13 +571,6 @@ nvlist_xndescriptors(const nvlist_t *nvl, int level) #endif } -size_t -nvlist_ndescriptors(const nvlist_t *nvl) -{ - - return (nvlist_xndescriptors(nvl, 0)); -} - static unsigned char * nvlist_pack_header(const nvlist_t *nvl, unsigned char *ptr, size_t *leftp) { @@ -640,7 +594,7 @@ nvlist_pack_header(const nvlist_t *nvl, unsigned char *ptr, size_t *leftp) return (ptr); } -void * +static void * nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) { unsigned char *buf, *ptr; @@ -652,7 +606,7 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) NVLIST_ASSERT(nvl); if (nvl->nvl_error != 0) { - RESTORE_ERRNO(nvl->nvl_error); + ERRNO_SET(nvl->nvl_error); return (NULL); } @@ -742,12 +696,12 @@ nvlist_pack(const nvlist_t *nvl, size_t *sizep) NVLIST_ASSERT(nvl); if (nvl->nvl_error != 0) { - RESTORE_ERRNO(nvl->nvl_error); + ERRNO_SET(nvl->nvl_error); return (NULL); } if (nvlist_ndescriptors(nvl) > 0) { - RESTORE_ERRNO(EOPNOTSUPP); + ERRNO_SET(EOPNOTSUPP); return (NULL); } @@ -759,11 +713,11 @@ nvlist_check_header(struct nvlist_header *nvlhdrp) { if (nvlhdrp->nvlh_magic != NVLIST_HEADER_MAGIC) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (false); } if ((nvlhdrp->nvlh_flags & ~NV_FLAG_ALL_MASK) != 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (false); } #if BYTE_ORDER == BIG_ENDIAN @@ -815,11 +769,11 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds, return (ptr); failed: - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } -nvlist_t * +static nvlist_t * nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds) { const unsigned char *ptr; @@ -909,10 +863,10 @@ nvlist_send(int sock, const nvlist_t *nvl) int *fds; void *data; int64_t fdidx; - int serrno, ret; + int ret; if (nvlist_error(nvl) != 0) { - errno = nvlist_error(nvl); + ERRNO_SET(nvlist_error(nvl)); return (-1); } @@ -938,10 +892,10 @@ nvlist_send(int sock, const nvlist_t *nvl) ret = 0; out: - serrno = errno; + ERRNO_SAVE(); free(fds); free(data); - errno = serrno; + ERRNO_RESTORE(); return (ret); } @@ -952,7 +906,7 @@ nvlist_recv(int sock) nvlist_t *nvl, *ret; unsigned char *buf; size_t nfds, size, i; - int serrno, *fds; + int *fds; if (buf_recv(sock, &nvlhdr, sizeof(nvlhdr)) == -1) return (NULL); @@ -963,7 +917,7 @@ nvlist_recv(int sock) nfds = (size_t)nvlhdr.nvlh_descriptors; size = sizeof(nvlhdr) + (size_t)nvlhdr.nvlh_size; - buf = malloc(size); + buf = nv_malloc(size); if (buf == NULL) return (NULL); @@ -976,7 +930,7 @@ nvlist_recv(int sock) goto out; if (nfds > 0) { - fds = malloc(nfds * sizeof(fds[0])); + fds = nv_malloc(nfds * sizeof(fds[0])); if (fds == NULL) goto out; if (fd_recv(sock, fds, nfds) == -1) @@ -985,17 +939,19 @@ nvlist_recv(int sock) nvl = nvlist_xunpack(buf, size, fds, nfds); if (nvl == NULL) { + ERRNO_SAVE(); for (i = 0; i < nfds; i++) close(fds[i]); + ERRNO_RESTORE(); goto out; } ret = nvl; out: - serrno = errno; + ERRNO_SAVE(); free(buf); free(fds); - errno = serrno; + ERRNO_RESTORE(); return (ret); } @@ -1100,86 +1056,6 @@ NVLIST_EXISTS(binary, BINARY) #undef NVLIST_EXISTS -#ifndef _KERNEL -bool -nvlist_existsf(const nvlist_t *nvl, const char *namefmt, ...) -{ - va_list nameap; - bool ret; - - va_start(nameap, namefmt); - ret = nvlist_existsv(nvl, namefmt, nameap); - va_end(nameap); - return (ret); -} - -#define NVLIST_EXISTSF(type) \ -bool \ -nvlist_existsf_##type(const nvlist_t *nvl, const char *namefmt, ...) \ -{ \ - va_list nameap; \ - bool ret; \ - \ - va_start(nameap, namefmt); \ - ret = nvlist_existsv_##type(nvl, namefmt, nameap); \ - va_end(nameap); \ - return (ret); \ -} - -NVLIST_EXISTSF(null) -NVLIST_EXISTSF(bool) -NVLIST_EXISTSF(number) -NVLIST_EXISTSF(string) -NVLIST_EXISTSF(nvlist) -#ifndef _KERNEL -NVLIST_EXISTSF(descriptor) -#endif -NVLIST_EXISTSF(binary) - -#undef NVLIST_EXISTSF - -bool -nvlist_existsv(const nvlist_t *nvl, const char *namefmt, va_list nameap) -{ - char *name; - bool exists; - - nv_vasprintf(&name, namefmt, nameap); - if (name == NULL) - return (false); - - exists = nvlist_exists(nvl, name); - nv_free(name); - return (exists); -} - -#define NVLIST_EXISTSV(type) \ -bool \ -nvlist_existsv_##type(const nvlist_t *nvl, const char *namefmt, \ - va_list nameap) \ -{ \ - char *name; \ - bool exists; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - return (false); \ - exists = nvlist_exists_##type(nvl, name); \ - free(name); \ - return (exists); \ -} - -NVLIST_EXISTSV(null) -NVLIST_EXISTSV(bool) -NVLIST_EXISTSV(number) -NVLIST_EXISTSV(string) -NVLIST_EXISTSV(nvlist) -NVLIST_EXISTSV(descriptor) -NVLIST_EXISTSV(binary) - -#undef NVLIST_EXISTSV -#endif - void nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp) { @@ -1188,19 +1064,19 @@ nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp) NVPAIR_ASSERT(nvp); if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } if (nvlist_exists(nvl, nvpair_name(nvp))) { nvl->nvl_error = EEXIST; - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } newnvp = nvpair_clone(nvp); if (newnvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } @@ -1208,34 +1084,6 @@ nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp) } void -nvlist_add_null(nvlist_t *nvl, const char *name) -{ - - nvlist_addf_null(nvl, "%s", name); -} - -void -nvlist_add_bool(nvlist_t *nvl, const char *name, bool value) -{ - - nvlist_addf_bool(nvl, value, "%s", name); -} - -void -nvlist_add_number(nvlist_t *nvl, const char *name, uint64_t value) -{ - - nvlist_addf_number(nvl, value, "%s", name); -} - -void -nvlist_add_string(nvlist_t *nvl, const char *name, const char *value) -{ - - nvlist_addf_string(nvl, value, "%s", name); -} - -void nvlist_add_stringf(nvlist_t *nvl, const char *name, const char *valuefmt, ...) { va_list valueap; @@ -1252,213 +1100,117 @@ nvlist_add_stringv(nvlist_t *nvl, const char *name, const char *valuefmt, nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } nvp = nvpair_create_stringv(name, valuefmt, valueap); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value) -{ - - nvlist_addf_nvlist(nvl, value, "%s", name); -} - -#ifndef _KERNEL -void -nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value) -{ - - nvlist_addf_descriptor(nvl, value, "%s", name); -} -#endif - -void -nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, - size_t size) -{ - - nvlist_addf_binary(nvl, value, size, "%s", name); -} - -void -nvlist_addf_null(nvlist_t *nvl, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_null(nvl, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_addf_bool(nvlist_t *nvl, bool value, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_bool(nvl, value, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_addf_number(nvlist_t *nvl, uint64_t value, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_number(nvl, value, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_addf_string(nvlist_t *nvl, const char *value, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_string(nvl, value, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_addf_nvlist(nvlist_t *nvl, const nvlist_t *value, const char *namefmt, - ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_nvlist(nvl, value, namefmt, nameap); - va_end(nameap); -} - -#ifndef _KERNEL -void -nvlist_addf_descriptor(nvlist_t *nvl, int value, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_descriptor(nvl, value, namefmt, nameap); - va_end(nameap); -} -#endif - -void -nvlist_addf_binary(nvlist_t *nvl, const void *value, size_t size, - const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_addv_binary(nvl, value, size, namefmt, nameap); - va_end(nameap); -} - -void -nvlist_addv_null(nvlist_t *nvl, const char *namefmt, va_list nameap) +nvlist_add_null(nvlist_t *nvl, const char *name) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_null(namefmt, nameap); + nvp = nvpair_create_null(name); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_addv_bool(nvlist_t *nvl, bool value, const char *namefmt, va_list nameap) +nvlist_add_bool(nvlist_t *nvl, const char *name, bool value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_bool(value, namefmt, nameap); + nvp = nvpair_create_bool(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_addv_number(nvlist_t *nvl, uint64_t value, const char *namefmt, - va_list nameap) +nvlist_add_number(nvlist_t *nvl, const char *name, uint64_t value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_number(value, namefmt, nameap); + nvp = nvpair_create_number(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_addv_string(nvlist_t *nvl, const char *value, const char *namefmt, - va_list nameap) +nvlist_add_string(nvlist_t *nvl, const char *name, const char *value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_string(value, namefmt, nameap); + nvp = nvpair_create_string(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_addv_nvlist(nvlist_t *nvl, const nvlist_t *value, const char *namefmt, - va_list nameap) +nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_nvlist(value, namefmt, nameap); + nvp = nvpair_create_nvlist(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } #ifndef _KERNEL void -nvlist_addv_descriptor(nvlist_t *nvl, int value, const char *namefmt, - va_list nameap) +nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value) { nvpair_t *nvp; @@ -1467,7 +1219,7 @@ nvlist_addv_descriptor(nvlist_t *nvl, int value, const char *namefmt, return; } - nvp = nvpair_createv_descriptor(value, namefmt, nameap); + nvp = nvpair_create_descriptor(name, value); if (nvp == NULL) nvl->nvl_error = errno = (errno != 0 ? errno : ENOMEM); else @@ -1476,22 +1228,23 @@ nvlist_addv_descriptor(nvlist_t *nvl, int value, const char *namefmt, #endif void -nvlist_addv_binary(nvlist_t *nvl, const void *value, size_t size, - const char *namefmt, va_list nameap) +nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, + size_t size) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_createv_binary(value, size, namefmt, nameap); + nvp = nvpair_create_binary(name, value, size); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void @@ -1503,153 +1256,100 @@ nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp) if (nvlist_error(nvl) != 0) { nvpair_free(nvp); - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } if (nvlist_exists(nvl, nvpair_name(nvp))) { nvpair_free(nvp); nvl->nvl_error = EEXIST; - RESTORE_ERRNO(nvl->nvl_error); + ERRNO_SET(nvl->nvl_error); return; } nvpair_insert(&nvl->nvl_head, nvp, nvl); } -#define NVLIST_MOVE(vtype, type) \ -void \ -nvlist_move_##type(nvlist_t *nvl, const char *name, vtype value) \ -{ \ - \ - nvlist_movef_##type(nvl, value, "%s", name); \ -} - -NVLIST_MOVE(char *, string) -NVLIST_MOVE(nvlist_t *, nvlist) -#ifndef _KERNEL -NVLIST_MOVE(int, descriptor) -#endif - -#undef NVLIST_MOVE - -void -nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size) -{ - - nvlist_movef_binary(nvl, value, size, "%s", name); -} - -#define NVLIST_MOVEF(vtype, type) \ -void \ -nvlist_movef_##type(nvlist_t *nvl, vtype value, const char *namefmt, \ - ...) \ -{ \ - va_list nameap; \ - \ - va_start(nameap, namefmt); \ - nvlist_movev_##type(nvl, value, namefmt, nameap); \ - va_end(nameap); \ -} - -NVLIST_MOVEF(char *, string) -NVLIST_MOVEF(nvlist_t *, nvlist) -#ifndef _KERNEL -NVLIST_MOVEF(int, descriptor) -#endif - -#undef NVLIST_MOVEF - -void -nvlist_movef_binary(nvlist_t *nvl, void *value, size_t size, - const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_movev_binary(nvl, value, size, namefmt, nameap); - va_end(nameap); -} - void -nvlist_movev_string(nvlist_t *nvl, char *value, const char *namefmt, - va_list nameap) +nvlist_move_string(nvlist_t *nvl, const char *name, char *value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { nv_free(value); - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_movev_string(value, namefmt, nameap); + nvp = nvpair_move_string(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } void -nvlist_movev_nvlist(nvlist_t *nvl, nvlist_t *value, const char *namefmt, - va_list nameap) +nvlist_move_nvlist(nvlist_t *nvl, const char *name, nvlist_t *value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { if (value != NULL && nvlist_get_nvpair_parent(value) != NULL) nvlist_destroy(value); - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_movev_nvlist(value, namefmt, nameap); + nvp = nvpair_move_nvlist(name, value); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } #ifndef _KERNEL void -nvlist_movev_descriptor(nvlist_t *nvl, int value, const char *namefmt, - va_list nameap) +nvlist_move_descriptor(nvlist_t *nvl, const char *name, int value) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { close(value); - errno = nvlist_error(nvl); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_movev_descriptor(value, namefmt, nameap); - if (nvp == NULL) - nvl->nvl_error = errno = (errno != 0 ? errno : ENOMEM); - else + nvp = nvpair_move_descriptor(name, value); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } #endif void -nvlist_movev_binary(nvlist_t *nvl, void *value, size_t size, - const char *namefmt, va_list nameap) +nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size) { nvpair_t *nvp; if (nvlist_error(nvl) != 0) { nv_free(value); - RESTORE_ERRNO(nvlist_error(nvl)); + ERRNO_SET(nvlist_error(nvl)); return; } - nvp = nvpair_movev_binary(value, size, namefmt, nameap); + nvp = nvpair_move_binary(name, value, size); if (nvp == NULL) { nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); - RESTORE_ERRNO(nvl->nvl_error); - } else + ERRNO_SET(nvl->nvl_error); + } else { nvlist_move_nvpair(nvl, nvp); + } } const nvpair_t * @@ -1693,84 +1393,6 @@ nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep) return (nvpair_get_binary(nvp, sizep)); } -#define NVLIST_GETF(ftype, type) \ -ftype \ -nvlist_getf_##type(const nvlist_t *nvl, const char *namefmt, ...) \ -{ \ - va_list nameap; \ - ftype value; \ - \ - va_start(nameap, namefmt); \ - value = nvlist_getv_##type(nvl, namefmt, nameap); \ - va_end(nameap); \ - \ - return (value); \ -} - -#ifndef _KERNEL -NVLIST_GETF(bool, bool) -NVLIST_GETF(uint64_t, number) -NVLIST_GETF(const char *, string) -NVLIST_GETF(const nvlist_t *, nvlist) -NVLIST_GETF(int, descriptor) - -#undef NVLIST_GETF - -const void * -nvlist_getf_binary(const nvlist_t *nvl, size_t *sizep, const char *namefmt, ...) -{ - va_list nameap; - const void *value; - - va_start(nameap, namefmt); - value = nvlist_getv_binary(nvl, sizep, namefmt, nameap); - va_end(nameap); - - return (value); -} - -#define NVLIST_GETV(ftype, type, TYPE) \ -ftype \ -nvlist_getv_##type(const nvlist_t *nvl, const char *namefmt, \ - va_list nameap) \ -{ \ - char *name; \ - ftype value; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - nvlist_report_missing(NV_TYPE_##TYPE, "<unknown>"); \ - value = nvlist_get_##type(nvl, name); \ - free(name); \ - \ - return (value); \ -} - -NVLIST_GETV(bool, bool, BOOL) -NVLIST_GETV(uint64_t, number, NUMBER) -NVLIST_GETV(const char *, string, STRING) -NVLIST_GETV(const nvlist_t *, nvlist, NVLIST) -NVLIST_GETV(int, descriptor, DESCRIPTOR) - -#undef NVLIST_GETV - -const void * -nvlist_getv_binary(const nvlist_t *nvl, size_t *sizep, const char *namefmt, - va_list nameap) -{ - char *name; - const void *binary; - - nv_vasprintf(&name, namefmt, nameap); - if (name == NULL) - nvlist_report_missing(NV_TYPE_BINARY, "<unknown>"); - - binary = nvlist_get_binary(nvl, name, sizep); - nv_free(name); - return (binary); -} -#endif - #define NVLIST_TAKE(ftype, type, TYPE) \ ftype \ nvlist_take_##type(nvlist_t *nvl, const char *name) \ @@ -1813,82 +1435,6 @@ nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep) return (value); } -#define NVLIST_TAKEF(ftype, type) \ -ftype \ -nvlist_takef_##type(nvlist_t *nvl, const char *namefmt, ...) \ -{ \ - va_list nameap; \ - ftype value; \ - \ - va_start(nameap, namefmt); \ - value = nvlist_takev_##type(nvl, namefmt, nameap); \ - va_end(nameap); \ - \ - return (value); \ -} - -#ifndef _KERNEL -NVLIST_TAKEF(bool, bool) -NVLIST_TAKEF(uint64_t, number) -NVLIST_TAKEF(char *, string) -NVLIST_TAKEF(nvlist_t *, nvlist) -NVLIST_TAKEF(int, descriptor) - -#undef NVLIST_TAKEF - -void * -nvlist_takef_binary(nvlist_t *nvl, size_t *sizep, const char *namefmt, ...) -{ - va_list nameap; - void *value; - - va_start(nameap, namefmt); - value = nvlist_takev_binary(nvl, sizep, namefmt, nameap); - va_end(nameap); - - return (value); -} - -#define NVLIST_TAKEV(ftype, type, TYPE) \ -ftype \ -nvlist_takev_##type(nvlist_t *nvl, const char *namefmt, va_list nameap) \ -{ \ - char *name; \ - ftype value; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - nvlist_report_missing(NV_TYPE_##TYPE, "<unknown>"); \ - value = nvlist_take_##type(nvl, name); \ - free(name); \ - return (value); \ -} - -NVLIST_TAKEV(bool, bool, BOOL) -NVLIST_TAKEV(uint64_t, number, NUMBER) -NVLIST_TAKEV(char *, string, STRING) -NVLIST_TAKEV(nvlist_t *, nvlist, NVLIST) -NVLIST_TAKEV(int, descriptor, DESCRIPTOR) - -#undef NVLIST_TAKEV - -void * -nvlist_takev_binary(nvlist_t *nvl, size_t *sizep, const char *namefmt, - va_list nameap) -{ - char *name; - void *binary; - - nv_vasprintf(&name, namefmt, nameap); - if (name == NULL) - nvlist_report_missing(NV_TYPE_BINARY, "<unknown>"); - - binary = nvlist_take_binary(nvl, name, sizep); - nv_free(name); - return (binary); -} -#endif - void nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) { @@ -1927,68 +1473,6 @@ NVLIST_FREE(binary, BINARY) #undef NVLIST_FREE -#ifndef _KERNEL -void -nvlist_freef(nvlist_t *nvl, const char *namefmt, ...) -{ - va_list nameap; - - va_start(nameap, namefmt); - nvlist_freev(nvl, namefmt, nameap); - va_end(nameap); -} - -#define NVLIST_FREEF(type) \ -void \ -nvlist_freef_##type(nvlist_t *nvl, const char *namefmt, ...) \ -{ \ - va_list nameap; \ - \ - va_start(nameap, namefmt); \ - nvlist_freev_##type(nvl, namefmt, nameap); \ - va_end(nameap); \ -} - -NVLIST_FREEF(null) -NVLIST_FREEF(bool) -NVLIST_FREEF(number) -NVLIST_FREEF(string) -NVLIST_FREEF(nvlist) -NVLIST_FREEF(descriptor) -NVLIST_FREEF(binary) - -#undef NVLIST_FREEF - -void -nvlist_freev(nvlist_t *nvl, const char *namefmt, va_list nameap) -{ - - nvlist_freev_type(nvl, NV_TYPE_NONE, namefmt, nameap); -} - -#define NVLIST_FREEV(type, TYPE) \ -void \ -nvlist_freev_##type(nvlist_t *nvl, const char *namefmt, va_list nameap) \ -{ \ - char *name; \ - \ - vasprintf(&name, namefmt, nameap); \ - if (name == NULL) \ - nvlist_report_missing(NV_TYPE_##TYPE, "<unknown>"); \ - nvlist_free_##type(nvl, name); \ - free(name); \ -} - -NVLIST_FREEV(null, NULL) -NVLIST_FREEV(bool, BOOL) -NVLIST_FREEV(number, NUMBER) -NVLIST_FREEV(string, STRING) -NVLIST_FREEV(nvlist, NVLIST) -NVLIST_FREEV(descriptor, DESCRIPTOR) -NVLIST_FREEV(binary, BINARY) -#undef NVLIST_FREEV -#endif - void nvlist_free_nvpair(nvlist_t *nvl, nvpair_t *nvp) { @@ -2000,3 +1484,4 @@ nvlist_free_nvpair(nvlist_t *nvl, nvpair_t *nvp) nvlist_remove_nvpair(nvl, nvp); nvpair_free(nvp); } + diff --git a/sys/kern/subr_nvpair.c b/sys/kern/subr_nvpair.c index 7b88e42..b0fc322 100644 --- a/sys/kern/subr_nvpair.c +++ b/sys/kern/subr_nvpair.c @@ -468,7 +468,7 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr, return (ptr); failed: - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -480,7 +480,7 @@ nvpair_unpack_null(bool isbe __unused, nvpair_t *nvp, const unsigned char *ptr, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NULL); if (nvp->nvp_datasize != 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -496,11 +496,11 @@ nvpair_unpack_bool(bool isbe __unused, nvpair_t *nvp, const unsigned char *ptr, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL); if (nvp->nvp_datasize != sizeof(value)) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } if (*leftp < sizeof(value)) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -509,7 +509,7 @@ nvpair_unpack_bool(bool isbe __unused, nvpair_t *nvp, const unsigned char *ptr, *leftp -= sizeof(value); if (value != 0 && value != 1) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -526,11 +526,11 @@ nvpair_unpack_number(bool isbe, nvpair_t *nvp, const unsigned char *ptr, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER); if (nvp->nvp_datasize != sizeof(uint64_t)) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } if (*leftp < sizeof(uint64_t)) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -552,13 +552,13 @@ nvpair_unpack_string(bool isbe __unused, nvpair_t *nvp, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING); if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } if (strnlen((const char *)ptr, nvp->nvp_datasize) != nvp->nvp_datasize - 1) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -581,7 +581,7 @@ nvpair_unpack_nvlist(bool isbe __unused, nvpair_t *nvp, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST); if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -609,11 +609,11 @@ nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp, const unsigned char *ptr, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR); if (nvp->nvp_datasize != sizeof(idx)) { - errno = EINVAL; + ERRNO_SET(EINVAL); return (NULL); } if (*leftp < sizeof(idx)) { - errno = EINVAL; + ERRNO_SET(EINVAL); return (NULL); } @@ -623,12 +623,12 @@ nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp, const unsigned char *ptr, idx = le64dec(ptr); if (idx < 0) { - errno = EINVAL; + ERRNO_SET(EINVAL); return (NULL); } if ((size_t)idx >= nfds) { - errno = EINVAL; + ERRNO_SET(EINVAL); return (NULL); } @@ -650,7 +650,7 @@ nvpair_unpack_binary(bool isbe __unused, nvpair_t *nvp, PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BINARY); if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -716,69 +716,34 @@ nvpair_name(const nvpair_t *nvp) } static nvpair_t * -nvpair_allocv(int type, uint64_t data, size_t datasize, const char *namefmt, - va_list nameap) +nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize) { nvpair_t *nvp; - char *name; - int namelen; + size_t namelen; PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST); - namelen = nv_vasprintf(&name, namefmt, nameap); - if (namelen < 0) - return (NULL); - - PJDLOG_ASSERT(namelen > 0); + namelen = strlen(name); if (namelen >= NV_NAME_MAX) { - nv_free(name); - RESTORE_ERRNO(ENAMETOOLONG); + ERRNO_SET(ENAMETOOLONG); return (NULL); } nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1); if (nvp != NULL) { nvp->nvp_name = (char *)(nvp + 1); - memcpy(nvp->nvp_name, name, namelen + 1); + memcpy(nvp->nvp_name, name, namelen); + nvp->nvp_name[namelen + 1] = '\0'; nvp->nvp_type = type; nvp->nvp_data = data; nvp->nvp_datasize = datasize; nvp->nvp_magic = NVPAIR_MAGIC; } - nv_free(name); return (nvp); }; nvpair_t * -nvpair_create_null(const char *name) -{ - - return (nvpair_createf_null("%s", name)); -} - -nvpair_t * -nvpair_create_bool(const char *name, bool value) -{ - - return (nvpair_createf_bool(value, "%s", name)); -} - -nvpair_t * -nvpair_create_number(const char *name, uint64_t value) -{ - - return (nvpair_createf_number(value, "%s", name)); -} - -nvpair_t * -nvpair_create_string(const char *name, const char *value) -{ - - return (nvpair_createf_string(value, "%s", name)); -} - -nvpair_t * nvpair_create_stringf(const char *name, const char *valuefmt, ...) { va_list valueap; @@ -808,153 +773,36 @@ nvpair_create_stringv(const char *name, const char *valuefmt, va_list valueap) } nvpair_t * -nvpair_create_nvlist(const char *name, const nvlist_t *value) -{ - - return (nvpair_createf_nvlist(value, "%s", name)); -} - -#ifndef _KERNEL -nvpair_t * -nvpair_create_descriptor(const char *name, int value) -{ - - return (nvpair_createf_descriptor(value, "%s", name)); -} -#endif - -nvpair_t * -nvpair_create_binary(const char *name, const void *value, size_t size) -{ - - return (nvpair_createf_binary(value, size, "%s", name)); -} - -nvpair_t * -nvpair_createf_null(const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_null(namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_createf_bool(bool value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_bool(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_createf_number(uint64_t value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_number(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_createf_string(const char *value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_string(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_createf_nvlist(const nvlist_t *value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_nvlist(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -#ifndef _KERNEL -nvpair_t * -nvpair_createf_descriptor(int value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_descriptor(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} -#endif - -nvpair_t * -nvpair_createf_binary(const void *value, size_t size, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_createv_binary(value, size, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_createv_null(const char *namefmt, va_list nameap) +nvpair_create_null(const char *name) { - return (nvpair_allocv(NV_TYPE_NULL, 0, 0, namefmt, nameap)); + return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0)); } nvpair_t * -nvpair_createv_bool(bool value, const char *namefmt, va_list nameap) +nvpair_create_bool(const char *name, bool value) { - return (nvpair_allocv(NV_TYPE_BOOL, value ? 1 : 0, sizeof(uint8_t), - namefmt, nameap)); + return (nvpair_allocv(name, NV_TYPE_BOOL, value ? 1 : 0, + sizeof(uint8_t))); } nvpair_t * -nvpair_createv_number(uint64_t value, const char *namefmt, va_list nameap) +nvpair_create_number(const char *name, uint64_t value) { - return (nvpair_allocv(NV_TYPE_NUMBER, value, sizeof(value), namefmt, - nameap)); + return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value))); } nvpair_t * -nvpair_createv_string(const char *value, const char *namefmt, va_list nameap) +nvpair_create_string(const char *name, const char *value) { nvpair_t *nvp; size_t size; char *data; if (value == NULL) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -963,8 +811,8 @@ nvpair_createv_string(const char *value, const char *namefmt, va_list nameap) return (NULL); size = strlen(value) + 1; - nvp = nvpair_allocv(NV_TYPE_STRING, (uint64_t)(uintptr_t)data, size, - namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)data, + size); if (nvp == NULL) nv_free(data); @@ -972,14 +820,13 @@ nvpair_createv_string(const char *value, const char *namefmt, va_list nameap) } nvpair_t * -nvpair_createv_nvlist(const nvlist_t *value, const char *namefmt, - va_list nameap) +nvpair_create_nvlist(const char *name, const nvlist_t *value) { nvlist_t *nvl; nvpair_t *nvp; if (value == NULL) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -987,8 +834,7 @@ nvpair_createv_nvlist(const nvlist_t *value, const char *namefmt, if (nvl == NULL) return (NULL); - nvp = nvpair_allocv(NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0, - namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0); if (nvp == NULL) nvlist_destroy(nvl); else @@ -999,12 +845,12 @@ nvpair_createv_nvlist(const nvlist_t *value, const char *namefmt, #ifndef _KERNEL nvpair_t * -nvpair_createv_descriptor(int value, const char *namefmt, va_list nameap) +nvpair_create_descriptor(const char *name, int value) { nvpair_t *nvp; if (value < 0 || !fd_is_valid(value)) { - errno = EBADF; + ERRNO_SET(EBADF); return (NULL); } @@ -1012,24 +858,26 @@ nvpair_createv_descriptor(int value, const char *namefmt, va_list nameap) if (value < 0) return (NULL); - nvp = nvpair_allocv(NV_TYPE_DESCRIPTOR, (uint64_t)value, - sizeof(int64_t), namefmt, nameap); - if (nvp == NULL) + nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value, + sizeof(int64_t)); + if (nvp == NULL) { + ERRNO_SAVE(); close(value); + ERRNO_RESTORE(); + } return (nvp); } #endif nvpair_t * -nvpair_createv_binary(const void *value, size_t size, const char *namefmt, - va_list nameap) +nvpair_create_binary(const char *name, const void *value, size_t size) { nvpair_t *nvp; void *data; if (value == NULL || size == 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } @@ -1038,8 +886,8 @@ nvpair_createv_binary(const void *value, size_t size, const char *namefmt, return (NULL); memcpy(data, value, size); - nvp = nvpair_allocv(NV_TYPE_BINARY, (uint64_t)(uintptr_t)data, size, - namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)data, + size); if (nvp == NULL) nv_free(data); @@ -1049,127 +897,42 @@ nvpair_createv_binary(const void *value, size_t size, const char *namefmt, nvpair_t * nvpair_move_string(const char *name, char *value) { - - return (nvpair_movef_string(value, "%s", name)); -} - -nvpair_t * -nvpair_move_nvlist(const char *name, nvlist_t *value) -{ - - return (nvpair_movef_nvlist(value, "%s", name)); -} - -#ifndef _KERNEL -nvpair_t * -nvpair_move_descriptor(const char *name, int value) -{ - - return (nvpair_movef_descriptor(value, "%s", name)); -} -#endif - -nvpair_t * -nvpair_move_binary(const char *name, void *value, size_t size) -{ - - return (nvpair_movef_binary(value, size, "%s", name)); -} - -nvpair_t * -nvpair_movef_string(char *value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_movev_string(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_movef_nvlist(nvlist_t *value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_movev_nvlist(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -#ifndef _KERNEL -nvpair_t * -nvpair_movef_descriptor(int value, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_movev_descriptor(value, namefmt, nameap); - va_end(nameap); - - return (nvp); -} -#endif - -nvpair_t * -nvpair_movef_binary(void *value, size_t size, const char *namefmt, ...) -{ - va_list nameap; - nvpair_t *nvp; - - va_start(nameap, namefmt); - nvp = nvpair_movev_binary(value, size, namefmt, nameap); - va_end(nameap); - - return (nvp); -} - -nvpair_t * -nvpair_movev_string(char *value, const char *namefmt, va_list nameap) -{ nvpair_t *nvp; - int serrno; if (value == NULL) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } - nvp = nvpair_allocv(NV_TYPE_STRING, (uint64_t)(uintptr_t)value, - strlen(value) + 1, namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)value, + strlen(value) + 1); if (nvp == NULL) { - SAVE_ERRNO(serrno); + ERRNO_SAVE(); nv_free(value); - RESTORE_ERRNO(serrno); + ERRNO_RESTORE(); } return (nvp); } nvpair_t * -nvpair_movev_nvlist(nvlist_t *value, const char *namefmt, va_list nameap) +nvpair_move_nvlist(const char *name, nvlist_t *value) { nvpair_t *nvp; if (value == NULL || nvlist_get_nvpair_parent(value) != NULL) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } if (nvlist_error(value) != 0) { - RESTORE_ERRNO(nvlist_error(value)); + ERRNO_SET(nvlist_error(value)); nvlist_destroy(value); return (NULL); } - nvp = nvpair_allocv(NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value, 0, - namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value, + 0); if (nvp == NULL) nvlist_destroy(value); else @@ -1180,22 +943,21 @@ nvpair_movev_nvlist(nvlist_t *value, const char *namefmt, va_list nameap) #ifndef _KERNEL nvpair_t * -nvpair_movev_descriptor(int value, const char *namefmt, va_list nameap) +nvpair_move_descriptor(const char *name, int value) { nvpair_t *nvp; - int serrno; if (value < 0 || !fd_is_valid(value)) { - errno = EBADF; + ERRNO_SET(EBADF); return (NULL); } - nvp = nvpair_allocv(NV_TYPE_DESCRIPTOR, (uint64_t)value, - sizeof(int64_t), namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value, + sizeof(int64_t)); if (nvp == NULL) { - serrno = errno; + ERRNO_SAVE(); close(value); - errno = serrno; + ERRNO_RESTORE(); } return (nvp); @@ -1203,23 +965,21 @@ nvpair_movev_descriptor(int value, const char *namefmt, va_list nameap) #endif nvpair_t * -nvpair_movev_binary(void *value, size_t size, const char *namefmt, - va_list nameap) +nvpair_move_binary(const char *name, void *value, size_t size) { nvpair_t *nvp; - int serrno; if (value == NULL || size == 0) { - RESTORE_ERRNO(EINVAL); + ERRNO_SET(EINVAL); return (NULL); } - nvp = nvpair_allocv(NV_TYPE_BINARY, (uint64_t)(uintptr_t)value, size, - namefmt, nameap); + nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)value, + size); if (nvp == NULL) { - SAVE_ERRNO(serrno); + ERRNO_SAVE(); nv_free(value); - RESTORE_ERRNO(serrno); + ERRNO_RESTORE(); } return (nvp); @@ -1348,3 +1108,4 @@ nvpair_type_string(int type) return ("<UNKNOWN>"); } } + diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index f662ec2..cba656a 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -99,11 +99,7 @@ pid_t pid_max = PID_MAX; long maxswzone; /* max swmeta KVA storage */ long maxbcache; /* max buffer cache KVA storage */ long maxpipekva; /* Limit on pipe KVA */ -#ifdef XEN -int vm_guest = VM_GUEST_XEN; -#else int vm_guest = VM_GUEST_NO; /* Running as virtual machine guest? */ -#endif u_long maxtsiz; /* max text size */ u_long dfldsiz; /* initial data size limit */ u_long maxdsiz; /* max data size */ diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index cfc3ed7..93f7557 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ktr.h> #include <sys/pioctl.h> #include <sys/ptrace.h> +#include <sys/racct.h> #include <sys/resourcevar.h> #include <sys/sched.h> #include <sys/signalvar.h> @@ -79,12 +80,6 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #endif -#ifdef XEN -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/pmap.h> -#endif - #ifdef HWPMC_HOOKS #include <sys/pmckern.h> #endif @@ -135,9 +130,6 @@ userret(struct thread *td, struct trapframe *frame) * Let the scheduler adjust our priority etc. */ sched_userret(td); -#ifdef XEN - PT_UPDATES_FLUSH(); -#endif /* * Check for misbehavior. @@ -172,11 +164,13 @@ userret(struct thread *td, struct trapframe *frame) __func__, td, p->p_pid, td->td_name, curvnet, (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A")); #endif -#ifdef RACCT - PROC_LOCK(p); - while (p->p_throttled == 1) - msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); - PROC_UNLOCK(p); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + while (p->p_throttled == 1) + msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); + PROC_UNLOCK(p); + } #endif } diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index acc44710..59357c4 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -617,12 +617,14 @@ sys_msgget(td, uap) goto done2; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_NMSGQ, 1); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = ENOSPC; - goto done2; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_NMSGQ, 1); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } } #endif DPRINTF(("msqid %d is available\n", msqid)); @@ -724,20 +726,22 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) #endif #ifdef RACCT - PROC_LOCK(td->td_proc); - if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { - PROC_UNLOCK(td->td_proc); - error = EAGAIN; - goto done2; - } - saved_msgsz = msgsz; - if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { - racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } + saved_msgsz = msgsz; + if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { + racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } PROC_UNLOCK(td->td_proc); - error = EAGAIN; - goto done2; } - PROC_UNLOCK(td->td_proc); #endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; @@ -994,7 +998,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) td->td_retval[0] = 0; done3: #ifdef RACCT - if (error != 0) { + if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index 6ff5789..d9324f4 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -915,12 +915,14 @@ sys_semget(struct thread *td, struct semget_args *uap) goto done2; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_NSEM, nsems); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = ENOSPC; - goto done2; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_NSEM, nsems); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } } #endif DPRINTF(("semid %d is available\n", semid)); @@ -1009,12 +1011,15 @@ sys_semop(struct thread *td, struct semop_args *uap) return (E2BIG); } else { #ifdef RACCT - PROC_LOCK(td->td_proc); - if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (nsops > + racct_get_available(td->td_proc, RACCT_NSEMOP)) { + PROC_UNLOCK(td->td_proc); + return (E2BIG); + } PROC_UNLOCK(td->td_proc); - return (E2BIG); } - PROC_UNLOCK(td->td_proc); #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 274deda..3240a5f 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -651,17 +651,19 @@ shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT - PROC_LOCK(td->td_proc); - if (racct_add(td->td_proc, RACCT_NSHM, 1)) { - PROC_UNLOCK(td->td_proc); - return (ENOSPC); - } - if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { - racct_sub(td->td_proc, RACCT_NSHM, 1); + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (racct_add(td->td_proc, RACCT_NSHM, 1)) { + PROC_UNLOCK(td->td_proc); + return (ENOSPC); + } + if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { + racct_sub(td->td_proc, RACCT_NSHM, 1); + PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } PROC_UNLOCK(td->td_proc); - return (ENOMEM); } - PROC_UNLOCK(td->td_proc); #endif /* @@ -672,10 +674,12 @@ shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_sub(td->td_proc, RACCT_NSHM, 1); - racct_sub(td->td_proc, RACCT_SHMSIZE, size); - PROC_UNLOCK(td->td_proc); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_sub(td->td_proc, RACCT_NSHM, 1); + racct_sub(td->td_proc, RACCT_SHMSIZE, size); + PROC_UNLOCK(td->td_proc); + } #endif return (ENOMEM); } @@ -961,39 +965,39 @@ oshmctl(struct thread *td, struct oshmctl_args *uap) if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); + if (uap->cmd != IPC_STAT) { + return (freebsd7_shmctl(td, + (struct freebsd7_shmctl_args *)uap)); + } SYSVSHM_LOCK(); shmseg = shm_find_segment(uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } - switch (uap->cmd) { - case IPC_STAT: - error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); - if (error != 0) - break; + error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); + if (error != 0) { + SYSVSHM_UNLOCK(); + return (error); + } #ifdef MAC - error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, - uap->cmd); - if (error != 0) - break; -#endif - ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); - outbuf.shm_segsz = shmseg->u.shm_segsz; - outbuf.shm_cpid = shmseg->u.shm_cpid; - outbuf.shm_lpid = shmseg->u.shm_lpid; - outbuf.shm_nattch = shmseg->u.shm_nattch; - outbuf.shm_atime = shmseg->u.shm_atime; - outbuf.shm_dtime = shmseg->u.shm_dtime; - outbuf.shm_ctime = shmseg->u.shm_ctime; - outbuf.shm_handle = shmseg->object; - error = copyout(&outbuf, uap->ubuf, sizeof(outbuf)); - break; - default: - error = freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap); - break; + error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); + if (error != 0) { + SYSVSHM_UNLOCK(); + return (error); } +#endif + ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); + outbuf.shm_segsz = shmseg->u.shm_segsz; + outbuf.shm_cpid = shmseg->u.shm_cpid; + outbuf.shm_lpid = shmseg->u.shm_lpid; + outbuf.shm_nattch = shmseg->u.shm_nattch; + outbuf.shm_atime = shmseg->u.shm_atime; + outbuf.shm_dtime = shmseg->u.shm_dtime; + outbuf.shm_ctime = shmseg->u.shm_ctime; + outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); + error = copyout(&outbuf, uap->ubuf, sizeof(outbuf)); return (error); #else return (EINVAL); @@ -1025,9 +1029,7 @@ sys_shmsys(struct thread *td, struct shmsys_args *uap) return (ENOSYS); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); - SYSVSHM_LOCK(); error = (*shmcalls[uap->which])(td, &uap->a2); - SYSVSHM_UNLOCK(); return (error); } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 8410ed9..93c7ed1 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -170,7 +170,7 @@ uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); - return (uiomove(__DECONST(void *, zero_region), len, uio)); + return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index c7e602e..0bfcf2d 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -59,10 +59,12 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <sys/event.h> #include <sys/mount.h> +#include <geom/geom.h> #include <machine/atomic.h> #include <vm/vm.h> +#include <vm/vm_page.h> #include <vm/vm_extern.h> #include <vm/pmap.h> #include <vm/vm_map.h> @@ -232,9 +234,10 @@ struct aiocblist { int jobstate; /* (b) job state */ int inputcharge; /* (*) input blockes */ int outputcharge; /* (*) output blockes */ - struct buf *bp; /* (*) private to BIO backend, - * buffer pointer - */ + struct bio *bp; /* (*) BIO backend BIO pointer */ + struct buf *pbuf; /* (*) BIO backend buffer pointer */ + struct vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */ + int npages; /* BIO backend number of pages */ struct proc *userproc; /* (*) user process */ struct ucred *cred; /* (*) active credential when created */ struct file *fd_file; /* (*) pointer to file structure */ @@ -243,7 +246,6 @@ struct aiocblist { struct knlist klist; /* (a) list of knotes */ struct aiocb uaiocb; /* (*) kernel I/O control block */ ksiginfo_t ksi; /* (a) realtime signal info */ - struct task biotask; /* (*) private to BIO backend */ uint64_t seqno; /* (*) job number */ int pending; /* (a) number of pending I/O, aio_fsync only */ }; @@ -344,11 +346,10 @@ static void aio_process_mlock(struct aiocblist *aiocbe); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lio, int type, struct aiocb_ops *ops); -static void aio_physwakeup(struct buf *bp); +static void aio_physwakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); -static void biohelper(void *, int); static void aio_daemon(void *param); static void aio_swake_cb(struct socket *, struct sockbuf *); static int aio_unload(void); @@ -1294,13 +1295,15 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe) { struct aiocb *cb; struct file *fp; - struct buf *bp; + struct bio *bp; + struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; struct aioliojob *lj; - int error, ref; + int error, ref, unmap, poff; + vm_prot_t prot; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; @@ -1309,107 +1312,121 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe) return (-1); vp = fp->f_vnode; - - /* - * If its not a disk, we don't want to return a positive error. - * It causes the aio code to not fall through to try the thread - * way when you're talking to a regular file. - */ - if (!vn_isdisk(vp, &error)) { - if (error == ENOTBLK) - return (-1); - else - return (error); - } - - if (vp->v_bufobj.bo_bsize == 0) - return (-1); - - if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) + if (vp->v_type != VCHR) return (-1); - - if (cb->aio_nbytes > - MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) + if (vp->v_bufobj.bo_bsize == 0) return (-1); - - ki = p->p_aioinfo; - if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) + if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); + + if ((csw->d_flags & D_DISK) == 0) { + error = -1; + goto unref; + } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } - /* Create and build a buffer header for a transfer. */ - bp = (struct buf *)getpbuf(NULL); - BUF_KERNPROC(bp); + ki = p->p_aioinfo; + poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; + unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed); + if (unmap) { + if (cb->aio_nbytes > MAXPHYS) { + error = -1; + goto unref; + } + } else { + if (cb->aio_nbytes > MAXPHYS - poff) { + error = -1; + goto unref; + } + if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { + error = -1; + goto unref; + } + } + aiocbe->bp = bp = g_alloc_bio(); + if (!unmap) { + aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL); + BUF_KERNPROC(pbuf); + } AIO_LOCK(ki); ki->kaio_count++; - ki->kaio_buffer_count++; + if (!unmap) + ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) lj->lioj_count++; - AIO_UNLOCK(ki); - - /* - * Get a copy of the kva from the physical buffer. - */ - error = 0; - - bp->b_bcount = cb->aio_nbytes; - bp->b_bufsize = cb->aio_nbytes; - bp->b_iodone = aio_physwakeup; - bp->b_saveaddr = bp->b_data; - bp->b_data = (void *)(uintptr_t)cb->aio_buf; - bp->b_offset = cb->aio_offset; - bp->b_iooffset = cb->aio_offset; - bp->b_blkno = btodb(cb->aio_offset); - bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; - - /* - * Bring buffer into kernel space. - */ - if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) { - error = EFAULT; - goto doerror; - } - - AIO_LOCK(ki); - aiocbe->bp = bp; - bp->b_caller1 = (void *)aiocbe; TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; AIO_UNLOCK(ki); - atomic_add_int(&num_queue_count, 1); - atomic_add_int(&num_buf_aio, 1); - - bp->b_error = 0; + bp->bio_length = cb->aio_nbytes; + bp->bio_bcount = cb->aio_nbytes; + bp->bio_done = aio_physwakeup; + bp->bio_data = (void *)(uintptr_t)cb->aio_buf; + bp->bio_offset = cb->aio_offset; + bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; + bp->bio_dev = dev; + bp->bio_caller1 = (void *)aiocbe; + + prot = VM_PROT_READ; + if (cb->aio_lio_opcode == LIO_READ) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + if ((aiocbe->npages = vm_fault_quick_hold_pages( + &curproc->p_vmspace->vm_map, + (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages, + sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) { + error = EFAULT; + goto doerror; + } + if (!unmap) { + pmap_qenter((vm_offset_t)pbuf->b_data, + aiocbe->pages, aiocbe->npages); + bp->bio_data = pbuf->b_data + poff; + } else { + bp->bio_ma = aiocbe->pages; + bp->bio_ma_n = aiocbe->npages; + bp->bio_ma_offset = poff; + bp->bio_data = unmapped_buf; + bp->bio_flags |= BIO_UNMAPPED; + } - TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe); + atomic_add_int(&num_queue_count, 1); + if (!unmap) + atomic_add_int(&num_buf_aio, 1); /* Perform transfer. */ - dev_strategy_csw(dev, csw, bp); + csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: AIO_LOCK(ki); + aiocbe->jobstate = JOBST_NULL; + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); ki->kaio_count--; - ki->kaio_buffer_count--; + if (!unmap) + ki->kaio_buffer_count--; if (lj) lj->lioj_count--; - aiocbe->bp = NULL; AIO_UNLOCK(ki); - relpbuf(bp, NULL); + if (pbuf) { + relpbuf(pbuf, NULL); + aiocbe->pbuf = NULL; + } + g_destroy_bio(bp); + aiocbe->bp = NULL; unref: dev_relthread(dev, ref); return (error); @@ -1787,8 +1804,6 @@ no_kqueue: } #endif queueit: - /* No buffer for daemon I/O. */ - aiocbe->bp = NULL; atomic_add_int(&num_queue_count, 1); AIO_LOCK(ki); @@ -2425,54 +2440,43 @@ sys_lio_listio(struct thread *td, struct lio_listio_args *uap) return (error); } -/* - * Called from interrupt thread for physio, we should return as fast - * as possible, so we schedule a biohelper task. - */ static void -aio_physwakeup(struct buf *bp) +aio_physwakeup(struct bio *bp) { - struct aiocblist *aiocbe; - - aiocbe = (struct aiocblist *)bp->b_caller1; - taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask); -} - -/* - * Task routine to perform heavy tasks, process wakeup, and signals. - */ -static void -biohelper(void *context, int pending) -{ - struct aiocblist *aiocbe = context; - struct buf *bp; + struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; int nblks; + /* Release mapping into kernel space. */ + if (aiocbe->pbuf) { + pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages); + relpbuf(aiocbe->pbuf, NULL); + aiocbe->pbuf = NULL; + atomic_subtract_int(&num_buf_aio, 1); + } + vm_page_unhold_pages(aiocbe->pages, aiocbe->npages); + bp = aiocbe->bp; + aiocbe->bp = NULL; userp = aiocbe->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); - aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; + aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid; aiocbe->uaiocb._aiocb_private.error = 0; - if (bp->b_ioflags & BIO_ERROR) - aiocbe->uaiocb._aiocb_private.error = bp->b_error; + if (bp->bio_flags & BIO_ERROR) + aiocbe->uaiocb._aiocb_private.error = bp->bio_error; nblks = btodb(aiocbe->uaiocb.aio_nbytes); if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) aiocbe->outputcharge += nblks; else aiocbe->inputcharge += nblks; - aiocbe->bp = NULL; TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); ki->kaio_buffer_count--; aio_bio_done_notify(userp, aiocbe, DONE_BUF); AIO_UNLOCK(ki); - /* Release mapping into kernel space. */ - vunmapbuf(bp); - relpbuf(bp, NULL); - atomic_subtract_int(&num_buf_aio, 1); + g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index dfe2997..5ac04ac 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -113,8 +113,8 @@ static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); -static int buf_flush(int); -static int flushbufqueues(int, int); +static int buf_flush(struct vnode *vp, int); +static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); @@ -805,6 +805,7 @@ bufinit(void) struct buf *bp; int i; + CTASSERT(MAXBCACHEBUF >= MAXBSIZE); mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); @@ -846,8 +847,8 @@ bufinit(void) * by the system. */ maxbufspace = (long)nbuf * BKVASIZE; - hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); - lobufspace = hibufspace - MAXBSIZE; + hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); + lobufspace = hibufspace - MAXBCACHEBUF; /* * Note: The 16 MiB upper limit for hirunningspace was chosen @@ -857,9 +858,9 @@ bufinit(void) * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ - hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE), + hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), 16 * 1024 * 1024), 1024 * 1024); - lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE); + lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); /* * Limit the amount of malloc memory since it is wired permanently into @@ -2096,7 +2097,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, { struct thread *td; char *waitmsg; - int cnt, error, flags, norunbuf, wait; + int error, fl, flags, norunbuf; mtx_assert(&bqclean, MA_OWNED); @@ -2118,8 +2119,6 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, return; td = curthread; - cnt = 0; - wait = MNT_NOWAIT; rw_wlock(&nblock); while ((needsbuffer & flags) != 0) { if (vp != NULL && vp->v_type != VCHR && @@ -2133,20 +2132,23 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ - if (cnt++ > 2) - wait = MNT_WAIT; - ASSERT_VOP_LOCKED(vp, "bufd_helper"); - error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : - vn_lock(vp, LK_TRYUPGRADE); - if (error == 0) { - /* play bufdaemon */ - norunbuf = curthread_pflags_set(TDP_BUFNEED | - TDP_NORUNNINGBUF); - VOP_FSYNC(vp, wait, td); - atomic_add_long(¬bufdflushes, 1); - curthread_pflags_restore(norunbuf); - } + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + + /* + * Play bufdaemon. The getnewbuf() function + * may be called while the thread owns lock + * for another dirty buffer for the same + * vnode, which makes it impossible to use + * VOP_FSYNC() there, due to the buffer lock + * recursion. + */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_flush(vp, flushbufqtarget); + td->td_pflags &= norunbuf; rw_wlock(&nblock); + if (fl != 0) + continue; if ((needsbuffer & flags) == 0) break; } @@ -2565,18 +2567,20 @@ static struct kproc_desc buf_kp = { SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int -buf_flush(int target) +buf_flush(struct vnode *vp, int target) { int flushed; - flushed = flushbufqueues(target, 0); + flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ - flushed = flushbufqueues(target, 1); + if (vp != NULL && target > 2) + target /= 2; + flushbufqueues(vp, target, 1); } return (flushed); } @@ -2613,7 +2617,7 @@ buf_daemon() * the I/O system. */ while (numdirtybuffers > lodirty) { - if (buf_flush(numdirtybuffers - lodirty) == 0) + if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } @@ -2668,7 +2672,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int -flushbufqueues(int target, int flushdeps) +flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; @@ -2678,6 +2682,7 @@ flushbufqueues(int target, int flushdeps) int flushed; int queue; int error; + bool unlock; flushed = 0; queue = QUEUE_DIRTY; @@ -2699,8 +2704,18 @@ flushbufqueues(int target, int flushdeps) mtx_unlock(&bqdirty); break; } - KASSERT(bp->b_qindex != QUEUE_SENTINEL, - ("parallel calls to flushbufqueues() bp %p", bp)); + /* + * Skip sentinels inserted by other invocations of the + * flushbufqueues(), taking care to not reorder them. + * + * Only flush the buffers that belong to the + * vnode locked by the curthread. + */ + if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && + bp->b_vp != lvp)) { + mtx_unlock(&bqdirty); + continue; + } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqdirty); if (error != 0) @@ -2748,16 +2763,37 @@ flushbufqueues(int target, int flushdeps) BUF_UNLOCK(bp); continue; } - error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (lvp == NULL) { + unlock = true; + error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + } else { + ASSERT_VOP_LOCKED(vp, "getbuf"); + unlock = false; + error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : + vn_lock(vp, LK_TRYUPGRADE); + } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); - vfs_bio_awrite(bp); + if (curproc == bufdaemonproc) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bwrite(bp); + notbufdflushes++; + } vn_finished_write(mp); - VOP_UNLOCK(vp, 0); + if (unlock) + VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; - if (runningbufspace > hirunningspace) + + /* + * Sleeping on runningbufspace while holding + * vnode lock leads to deadlock. + */ + if (curproc == bufdaemonproc && + runningbufspace > hirunningspace) waitrunningbufspace(); continue; } @@ -3073,8 +3109,9 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); - if (size > MAXBSIZE) - panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + if (size > MAXBCACHEBUF) + panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, + MAXBCACHEBUF); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); diff --git a/sys/net/zlib.c b/sys/libkern/zlib.c index b348248..f5c3854 100644 --- a/sys/net/zlib.c +++ b/sys/libkern/zlib.c @@ -54,7 +54,7 @@ #define _Z_UTIL_H #ifdef _KERNEL -#include <net/zlib.h> +#include <sys/zlib.h> #else #include "zlib.h" #endif diff --git a/sys/mips/mips/busdma_machdep.c b/sys/mips/mips/busdma_machdep.c index 083e8f5..90ec399 100644 --- a/sys/mips/mips/busdma_machdep.c +++ b/sys/mips/mips/busdma_machdep.c @@ -1359,8 +1359,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. */ - bpage->vaddr |= vaddr & PAGE_MASK; - bpage->busaddr |= vaddr & PAGE_MASK; + bpage->vaddr |= addr & PAGE_MASK; + bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; bpage->dataaddr = addr; diff --git a/sys/modules/ix/Makefile b/sys/modules/ix/Makefile index 5a5485d..1f30cb0 100644 --- a/sys/modules/ix/Makefile +++ b/sys/modules/ix/Makefile @@ -9,7 +9,7 @@ SRCS += if_ix.c ix_txrx.c # Shared source SRCS += ixgbe_common.c ixgbe_api.c ixgbe_phy.c ixgbe_mbx.c ixgbe_vf.c SRCS += ixgbe_dcb.c ixgbe_dcb_82598.c ixgbe_dcb_82599.c -SRCS += ixgbe_82599.c ixgbe_82598.c ixgbe_x540.c +SRCS += ixgbe_82598.c ixgbe_82599.c ixgbe_x540.c ixgbe_x550.c CFLAGS+= -I${.CURDIR}/../../dev/ixgbe -DSMP .include <bsd.kmod.mk> diff --git a/sys/modules/ixv/Makefile b/sys/modules/ixv/Makefile index 20ecaf1..f8ce347 100644 --- a/sys/modules/ixv/Makefile +++ b/sys/modules/ixv/Makefile @@ -9,7 +9,7 @@ SRCS += if_ixv.c ix_txrx.c # Shared source SRCS += ixgbe_common.c ixgbe_api.c ixgbe_phy.c ixgbe_mbx.c ixgbe_vf.c SRCS += ixgbe_dcb.c ixgbe_dcb_82598.c ixgbe_dcb_82599.c -SRCS += ixgbe_82599.c ixgbe_82598.c ixgbe_x540.c +SRCS += ixgbe_82598.c ixgbe_82599.c ixgbe_x540.c ixgbe_x550.c CFLAGS+= -I${.CURDIR}/../../dev/ixgbe -DSMP .include <bsd.kmod.mk> diff --git a/sys/modules/oce/Makefile b/sys/modules/oce/Makefile index 4821533..95e828d 100644 --- a/sys/modules/oce/Makefile +++ b/sys/modules/oce/Makefile @@ -3,7 +3,7 @@ # .PATH: ${.CURDIR}/../../dev/oce -KMOD = oce +KMOD = if_oce SRCS = oce_if.c oce_hw.c oce_mbox.c oce_util.c oce_queue.c oce_sysctl.c SRCS += bus_if.h device_if.h pci_if.h opt_inet.h opt_inet6.h diff --git a/sys/modules/zlib/Makefile b/sys/modules/zlib/Makefile index 0a475b5..235d1c7 100644 --- a/sys/modules/zlib/Makefile +++ b/sys/modules/zlib/Makefile @@ -1,6 +1,6 @@ # $FreeBSD$ -.PATH: ${.CURDIR}/../../net +.PATH: ${.CURDIR}/../../libkern KMOD= zlib SRCS= zlib.c diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 0753e0a..462c907 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1705,27 +1705,6 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } break; - case SIOCSIFCAP: - VLAN_LOCK(); - if (TRUNK(ifv) != NULL) { - p = PARENT(ifv); - VLAN_UNLOCK(); - if ((p->if_type != IFT_ETHER) && - (ifr->ifr_reqcap & IFCAP_VLAN_HWTAGGING) == 0) { - error = EINVAL; - break; - } - error = (*p->if_ioctl)(p, cmd, data); - if (error) - break; - /* Propogate vlan interface capabilities */ - vlan_trunk_capabilities(p); - } else { - VLAN_UNLOCK(); - error = EINVAL; - } - break; - default: error = EINVAL; break; diff --git a/sys/net/netisr.c b/sys/net/netisr.c index 178c3cb..4b3576d 100644 --- a/sys/net/netisr.c +++ b/sys/net/netisr.c @@ -156,10 +156,13 @@ SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, /* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for - * CPU 0, so in practice we ignore values <= 1. This must be set at boot. - * We will create at most one thread per CPU. + * CPU 0. This must be set at boot. We will create at most one thread per CPU. + * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and + * therefore only 1 workstream. If set to -1, netisr would use all cpus + * (mp_ncpus) and therefore would have those many workstreams. One workstream + * per thread (CPU). */ -static int netisr_maxthreads = -1; /* Max number of threads. */ +static int netisr_maxthreads = 1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); @@ -1120,8 +1123,10 @@ netisr_init(void *arg) KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); NETISR_LOCK_INIT(); - if (netisr_maxthreads < 1) - netisr_maxthreads = 1; + if (netisr_maxthreads == 0 || netisr_maxthreads < -1 ) + netisr_maxthreads = 1; /* default behavior */ + else if (netisr_maxthreads == -1) + netisr_maxthreads = mp_ncpus; /* use max cpus */ if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); diff --git a/sys/netgraph/ng_deflate.c b/sys/netgraph/ng_deflate.c index da68e49..be52942 100644 --- a/sys/netgraph/ng_deflate.c +++ b/sys/netgraph/ng_deflate.c @@ -39,8 +39,7 @@ #include <sys/endian.h> #include <sys/errno.h> #include <sys/syslog.h> - -#include <net/zlib.h> +#include <sys/zlib.h> #include <netgraph/ng_message.h> #include <netgraph/netgraph.h> diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c index d37e3c0..edcc853 100644 --- a/sys/netinet/in_kdtrace.c +++ b/sys/netinet/in_kdtrace.c @@ -102,6 +102,9 @@ SDT_PROBE_DEFINE5_XLATE(tcp, , , send, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); +SDT_PROBE_DEFINE1_XLATE(tcp, , , siftr, + "struct pkt_node *", "siftrinfo_t *"); + SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change, "void *", "void *", "struct tcpcb *", "csinfo_t *", diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h index 07b3e3a..c0511d0 100644 --- a/sys/netinet/in_kdtrace.h +++ b/sys/netinet/in_kdtrace.h @@ -32,6 +32,8 @@ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) #define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) +#define TCP_PROBE1(probe, arg0) \ + SDT_PROBE1(tcp, , , probe, arg0) #define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ @@ -51,6 +53,7 @@ SDT_PROBE_DECLARE(tcp, , , connect__refused); SDT_PROBE_DECLARE(tcp, , , connect__request); SDT_PROBE_DECLARE(tcp, , , receive); SDT_PROBE_DECLARE(tcp, , , send); +SDT_PROBE_DECLARE(tcp, , , siftr); SDT_PROBE_DECLARE(tcp, , , state__change); SDT_PROBE_DECLARE(udp, , , receive); diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 9c53793..d1764bb 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -40,10 +40,12 @@ #define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/ /* - * Default number of ipfw tables. + * Compat values for old clients */ +#ifndef _KERNEL #define IPFW_TABLES_MAX 65535 #define IPFW_TABLES_DEFAULT 128 +#endif /* * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit @@ -963,7 +965,6 @@ typedef struct _ipfw_ta_info { uint64_t spare1; } ipfw_ta_info; -#define IPFW_OBJTYPE_TABLE 1 typedef struct _ipfw_obj_header { ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t spare; diff --git a/sys/netinet/ip_ipsec.c b/sys/netinet/ip_ipsec.c index 1a643fb..47f683d 100644 --- a/sys/netinet/ip_ipsec.c +++ b/sys/netinet/ip_ipsec.c @@ -199,6 +199,9 @@ ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *error) /* NB: callee frees mbuf */ *error = ipsec4_process_packet(*m, sp->req); + /* Release SP if an error occured */ + if (*error != 0) + KEY_FREESP(&sp); if (*error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We @@ -234,13 +237,9 @@ ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *error) /* No IPsec processing for this packet. */ } done: - if (sp != NULL) - KEY_FREESP(&sp); - return 0; + return (0); reinjected: - if (sp != NULL) - KEY_FREESP(&sp); - return -1; + return (-1); bad: if (sp != NULL) KEY_FREESP(&sp); diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3 index 45e7fa4..543420b 100644 --- a/sys/netinet/libalias/libalias.3 +++ b/sys/netinet/libalias/libalias.3 @@ -173,10 +173,12 @@ Mainly useful for debugging when the log file is viewed continuously with .It Dv PKT_ALIAS_DENY_INCOMING If this mode bit is set, all incoming packets associated with new TCP connections or new UDP transactions will be marked for being ignored -.Fn ( LibAliasIn +.Po +.Fn LibAliasIn returns .Dv PKT_ALIAS_IGNORED -code) +code +.Pc by the calling program. Response packets to connections or transactions initiated from the packet aliasing host or local network will be unaffected. @@ -1001,7 +1003,7 @@ If this results in a conflict, then port numbers are randomly chosen until a unique aliasing link can be established. In an alternate operating mode, the first choice of an aliasing port is also random and unrelated to the local port number. -.Sh MODULAR ARCHITECTURE (AND Xr ipfw 4 Sh SUPPORT) +.Sh MODULAR ARCHITECTURE Po AND Xr ipfw 4 SUPPORT Pc One of the latest improvements to .Nm was to make its support diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index ea3d3e7..d3b4855 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -3614,24 +3614,17 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, send_s = asoc->sending_seq; } if (SCTP_TSN_GE(cumack, send_s)) { -#ifndef INVARIANTS struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; -#endif -#ifdef INVARIANTS - panic("Impossible sack 1"); -#else - *abort_now = 1; /* XXX */ - snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x", + snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", cumack, send_s); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); return; -#endif } } asoc->this_sack_highest_gap = cumack; @@ -4195,7 +4188,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, hopeless_peer: *abort_now = 1; /* XXX */ - snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x", + snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", cum_ack, send_s); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 2cd58ad..6b31d7b 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -75,6 +75,7 @@ __FBSDID("$FreeBSD$"); #include <sys/pcpu.h> #include <sys/proc.h> #include <sys/sbuf.h> +#include <sys/sdt.h> #include <sys/smp.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$"); #include <net/pfil.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -811,6 +813,8 @@ siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, * maximum pps throughput processing when SIFTR is loaded and enabled. */ microtime(&pn->tval); + TCP_PROBE1(siftr, &pn); + } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index d7da6c6..d75221c 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2159,6 +2159,7 @@ tcp_signature_do_compute(struct mbuf *m, int len, int optlen, break; #endif default: + KEY_FREESAV(&sav); return (-1); /* NOTREACHED */ break; diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index da48615..1ba74d8 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -251,6 +251,13 @@ tcp_twstart(struct tcpcb *tp) } } + + /* + * For use only by DTrace. We do not reference the state + * after this point so modifying it in place is not a problem. + */ + tcp_state_change(tp, TCPS_TIME_WAIT); + tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { /* diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c index 08234d5..83e58c1 100644 --- a/sys/netinet6/ip6_forward.c +++ b/sys/netinet6/ip6_forward.c @@ -248,8 +248,7 @@ ip6_forward(struct mbuf *m, int srcrt) /* * when the kernel forwards a packet, it is not proper to apply - * IPsec transport mode to the packet is not proper. this check - * avoid from this. + * IPsec transport mode to the packet. This check avoid from this. * at present, if there is even a transport mode SA request in the * security policy, the kernel does not apply IPsec to the packet. * this check is not enough because the following case is valid. @@ -283,9 +282,9 @@ ip6_forward(struct mbuf *m, int srcrt) * ipsec6_proces_packet will send the packet using ip6_output */ error = ipsec6_process_packet(m, sp->req); - - KEY_FREESP(&sp); - + /* Release SP if an error occured */ + if (error != 0) + KEY_FREESP(&sp); if (error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We diff --git a/sys/netinet6/ip6_ipsec.c b/sys/netinet6/ip6_ipsec.c index 0a416cd..e6e16ed 100644 --- a/sys/netinet6/ip6_ipsec.c +++ b/sys/netinet6/ip6_ipsec.c @@ -213,7 +213,9 @@ ip6_ipsec_output(struct mbuf **m, struct inpcb *inp, int *error) /* NB: callee frees mbuf */ *error = ipsec6_process_packet(*m, sp->req); - + /* Release SP if an error occured */ + if (*error != 0) + KEY_FREESP(&sp); if (*error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We @@ -249,13 +251,9 @@ ip6_ipsec_output(struct mbuf **m, struct inpcb *inp, int *error) /* No IPsec processing for this packet. */ } done: - if (sp != NULL) - KEY_FREESP(&sp); - return 0; + return (0); reinjected: - if (sp != NULL) - KEY_FREESP(&sp); - return -1; + return (-1); bad: if (sp != NULL) KEY_FREESP(&sp); diff --git a/sys/netipsec/ipsec.c b/sys/netipsec/ipsec.c index abad3d2..266f6d0 100644 --- a/sys/netipsec/ipsec.c +++ b/sys/netipsec/ipsec.c @@ -238,6 +238,7 @@ SYSCTL_VNET_PCPUSTAT(_net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats, struct ipsecstat, ipsec6stat, "IPsec IPv6 statistics."); #endif /* INET6 */ +static int ipsec_in_reject(struct secpolicy *, struct mbuf *); static int ipsec_setspidx_inpcb(struct mbuf *, struct inpcb *); static int ipsec_setspidx(struct mbuf *, struct secpolicyindex *, int); static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *, int); @@ -1191,7 +1192,7 @@ ipsec_get_reqlevel(struct ipsecrequest *isr) * 0: valid * 1: invalid */ -int +static int ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { struct ipsecrequest *isr; diff --git a/sys/netipsec/ipsec.h b/sys/netipsec/ipsec.h index 7f4c25e..442387a 100644 --- a/sys/netipsec/ipsec.h +++ b/sys/netipsec/ipsec.h @@ -309,7 +309,6 @@ struct inpcb; extern int ipsec_init_policy(struct socket *so, struct inpcbpolicy **); extern int ipsec_copy_policy(struct inpcbpolicy *, struct inpcbpolicy *); extern u_int ipsec_get_reqlevel(struct ipsecrequest *); -extern int ipsec_in_reject(struct secpolicy *, struct mbuf *); extern int ipsec_set_policy(struct inpcb *inp, int optname, caddr_t request, size_t len, struct ucred *cred); diff --git a/sys/netipsec/ipsec_input.c b/sys/netipsec/ipsec_input.c index 6c52781..5837699 100644 --- a/sys/netipsec/ipsec_input.c +++ b/sys/netipsec/ipsec_input.c @@ -391,6 +391,7 @@ ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_BEFORE); if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0) return (error); + ip = mtod(m, struct ip *); #endif /* DEV_ENC */ /* IP-in-IP encapsulation */ diff --git a/sys/netipsec/ipsec_output.c b/sys/netipsec/ipsec_output.c index 583f4b0..ae36070 100644 --- a/sys/netipsec/ipsec_output.c +++ b/sys/netipsec/ipsec_output.c @@ -104,6 +104,7 @@ ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(isr != NULL, ("null ISR")); + IPSEC_ASSERT(isr->sp != NULL, ("NULL isr->sp")); sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->sah != NULL, ("null SAH")); @@ -163,6 +164,10 @@ ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) * If this is a problem we'll need to introduce a queue * to set the packet on so we can unwind the stack before * doing further processing. + * + * If ipsec[46]_process_packet() will successfully queue + * the request, we need to take additional reference to SP, + * because xform callback will release reference. */ if (isr->next) { /* XXX-BZ currently only support same AF bundles. */ @@ -170,7 +175,11 @@ ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) #ifdef INET case AF_INET: IPSECSTAT_INC(ips_out_bundlesa); - return ipsec4_process_packet(m, isr->next); + key_addref(isr->sp); + error = ipsec4_process_packet(m, isr->next); + if (error != 0) + KEY_FREESP(&isr->sp); + return (error); /* NOTREACHED */ #endif #ifdef notyet @@ -178,7 +187,11 @@ ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) case AF_INET6: /* XXX */ IPSEC6STAT_INC(ips_out_bundlesa); - return ipsec6_process_packet(m, isr->next); + key_addref(isr->sp); + error = ipsec6_process_packet(m, isr->next); + if (error != 0) + KEY_FREESP(&isr->sp); + return (error); /* NOTREACHED */ #endif /* INET6 */ #endif @@ -193,8 +206,7 @@ ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) /* * We're done with IPsec processing, transmit the packet using the - * appropriate network protocol (IP or IPv6). SPD lookup will be - * performed again there. + * appropriate network protocol (IP or IPv6). */ switch (saidx->dst.sa.sa_family) { #ifdef INET @@ -565,6 +577,7 @@ ipsec4_process_packet(struct mbuf *m, struct ipsecrequest *isr) /* pass the mbuf to enc0 for packet filtering */ if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_BEFORE)) != 0) goto bad; + ip = mtod(m, struct ip *); #endif /* Do the appropriate encapsulation, if necessary */ if (isr->saidx.mode == IPSEC_MODE_TUNNEL || /* Tunnel requ'd */ @@ -686,6 +699,7 @@ ipsec6_process_packet(struct mbuf *m, struct ipsecrequest *isr) /* pass the mbuf to enc0 for packet filtering */ if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_BEFORE)) != 0) goto bad; + ip6 = mtod(m, struct ip6_hdr *); #endif /* DEV_ENC */ /* Do the appropriate encapsulation, if necessary */ diff --git a/sys/netipsec/xform_ah.c b/sys/netipsec/xform_ah.c index cb257a5..8f791db 100644 --- a/sys/netipsec/xform_ah.c +++ b/sys/netipsec/xform_ah.c @@ -1091,6 +1091,7 @@ ah_output_cb(struct cryptop *crp) m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; + IPSEC_ASSERT(isr->sp != NULL, ("NULL isr->sp")); IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ @@ -1154,16 +1155,18 @@ ah_output_cb(struct cryptop *crp) error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); - return error; + KEY_FREESP(&isr->sp); + return (error); bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); + KEY_FREESP(&isr->sp); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); - return error; + return (error); } static struct xformsw ah_xformsw = { diff --git a/sys/netipsec/xform_esp.c b/sys/netipsec/xform_esp.c index 2fbdb81..003c514 100644 --- a/sys/netipsec/xform_esp.c +++ b/sys/netipsec/xform_esp.c @@ -888,6 +888,7 @@ esp_output_cb(struct cryptop *crp) m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; + IPSEC_ASSERT(isr->sp != NULL, ("NULL isr->sp")); IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ @@ -966,16 +967,18 @@ esp_output_cb(struct cryptop *crp) error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); - return error; + KEY_FREESP(&isr->sp); + return (error); bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); + KEY_FREESP(&isr->sp); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); - return error; + return (error); } static struct xformsw esp_xformsw = { diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index 5f3afd9..ef460cd 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -492,6 +492,7 @@ ipcomp_output_cb(struct cryptop *crp) skip = tc->tc_skip; isr = tc->tc_isr; + IPSEC_ASSERT(isr->sp != NULL, ("NULL isr->sp")); IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ @@ -606,16 +607,18 @@ ipcomp_output_cb(struct cryptop *crp) error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); - return error; + KEY_FREESP(&isr->sp); + return (error); bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); + KEY_FREESP(&isr->sp); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); - return error; + return (error); } static struct xformsw ipcomp_xformsw = { diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index a30f6bf..4da0ee0 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -2762,6 +2762,10 @@ vnet_ipfw_init(const void *unused) LIST_INIT(&chain->nat); #endif + /* Init shared services hash table */ + ipfw_init_srv(chain); + + ipfw_init_obj_rewriter(); ipfw_init_counters(); /* insert the default rule and create the initial map */ chain->n_rules = 1; @@ -2860,9 +2864,11 @@ vnet_ipfw_uninit(const void *unused) if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); + ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); + ipfw_destroy_obj_rewriter(); return (0); } diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c index 201be2f..df40398 100644 --- a/sys/netpfil/ipfw/ip_fw_nat.c +++ b/sys/netpfil/ipfw/ip_fw_nat.c @@ -242,6 +242,8 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); + free(r->alink, M_IPFW); + free(r, M_IPFW); return (EINVAL); } /* LSNAT handling. */ @@ -263,6 +265,16 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) return (0); } +static void +free_nat_instance(struct cfg_nat *ptr) +{ + + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); +} + + /* * ipfw_nat - perform mbuf header translation. * @@ -536,7 +548,7 @@ nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) IPFW_UH_WUNLOCK(chain); if (tcfg != NULL) - free(tcfg, M_IPFW); + free_nat_instance(ptr); } /* @@ -626,9 +638,7 @@ nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); + free_nat_instance(ptr); return (0); } @@ -994,9 +1004,7 @@ ipfw_nat_del(struct sockopt *sopt) flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); + free_nat_instance(ptr); return (0); } @@ -1139,9 +1147,7 @@ vnet_ipfw_nat_uninit(const void *arg __unused) IPFW_WLOCK(chain); LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); + free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); V_ipfw_nat_ready = 0; diff --git a/sys/netpfil/ipfw/ip_fw_private.h b/sys/netpfil/ipfw/ip_fw_private.h index bb5b3ba..504093b 100644 --- a/sys/netpfil/ipfw/ip_fw_private.h +++ b/sys/netpfil/ipfw/ip_fw_private.h @@ -264,10 +264,10 @@ struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ uint32_t id; /* ruleset id */ int n_rules; /* number of static rules */ - LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ void *tablestate; /* runtime table info */ void *valuestate; /* runtime table value info */ int *idxmap; /* skipto array of rules */ + void **srvstate; /* runtime service mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; #else @@ -275,10 +275,12 @@ struct ip_fw_chain { #endif int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct ip_fw *default_rule; struct tables_config *tblcfg; /* tables module data */ void *ifcfg; /* interface module data */ int *idxmap_back; /* standby skipto array of rules */ + struct namedobj_instance *srvmap; /* cfg name->number mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_lock; #else @@ -306,16 +308,15 @@ struct table_value { uint64_t refcnt; /* Number of references */ }; -struct namedobj_instance; struct named_object { TAILQ_ENTRY(named_object) nn_next; /* namehash */ TAILQ_ENTRY(named_object) nv_next; /* valuehash */ char *name; /* object name */ - uint8_t type; /* object type */ - uint8_t compat; /* Object name is number */ + uint8_t subtype; /* object subtype within class */ + uint8_t etlv; /* Export TLV id */ + uint16_t spare[2]; uint16_t kidx; /* object kernel index */ - uint16_t uidx; /* userland idx for compat records */ uint32_t set; /* set object belongs to */ uint32_t refcnt; /* number of references */ }; @@ -450,7 +451,7 @@ struct obj_idx { struct rule_check_info { uint16_t flags; /* rule-specific check flags */ - uint16_t table_opcodes; /* count of opcodes referencing table */ + uint16_t object_opcodes; /* num of opcodes referencing objects */ uint16_t urule_numoff; /* offset of rulenum in bytes */ uint8_t version; /* rule version */ uint8_t spare; @@ -507,6 +508,84 @@ struct ip_fw_bcounter0 { (r)->cmd_len * 4 - 4, 8)) #define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) +/* + * Tables/Objects index rewriting code + */ + +/* Default and maximum number of ipfw tables/objects. */ +#define IPFW_TABLES_MAX 65536 +#define IPFW_TABLES_DEFAULT 128 +#define IPFW_OBJECTS_MAX 65536 +#define IPFW_OBJECTS_DEFAULT 128 + +#define CHAIN_TO_SRV(ch) ((ch)->srvmap) + +struct tid_info { + uint32_t set; /* table set */ + uint16_t uidx; /* table index */ + uint8_t type; /* table type */ + uint8_t atype; + uint8_t spare; + int tlen; /* Total TLV size block */ + void *tlvs; /* Pointer to first TLV */ +}; + +/* + * Classifier callback. Checks if @cmd opcode contains kernel object reference. + * If true, returns its index and type. + * Returns 0 if match is found, 1 overwise. + */ +typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); +/* + * Updater callback. Sets kernel object reference index to @puidx + */ +typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx); +/* + * Finder callback. Tries to find named object by name (specified via @ti). + * Stores found named object pointer in @pno. + * If object was not found, NULL is stored. + * + * Return 0 if input data was valid. + */ +typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch, + struct tid_info *ti, struct named_object **pno); +/* + * Another finder callback. Tries to findex named object by kernel index. + * + * Returns pointer to named object or NULL. + */ +typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch, + uint16_t kidx); +/* + * Object creator callback. Tries to create object specified by @ti. + * Stores newly-allocated object index in @pkidx. + * + * Returns 0 on success. + */ +typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti, + uint16_t *pkidx); + + +struct opcode_obj_rewrite { + uint32_t opcode; /* Opcode to act upon */ + uint32_t etlv; /* Relevant export TLV id */ + ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */ + ipfw_obj_rw_upd *update; /* update cmd with new value */ + ipfw_obj_fname_cb *find_byname; /* Find named object by name */ + ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */ + ipfw_obj_create_cb *create_object; /* Create named object */ +}; + +#define IPFW_ADD_OBJ_REWRITER(f, c) do { \ + if ((f) != 0) \ + ipfw_add_obj_rewriter(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) +#define IPFW_DEL_OBJ_REWRITER(l, c) do { \ + if ((l) != 0) \ + ipfw_del_obj_rewriter(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) /* In ip_fw_iface.c */ int ipfw_iface_init(void); @@ -562,6 +641,7 @@ caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); sizeof(c) / sizeof(c[0])); \ } while(0) +struct namedobj_instance; typedef void (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, void *arg); typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, void *key, @@ -578,6 +658,8 @@ void ipfw_objhash_bitmap_free(void *idx, int blocks); void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name); +struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, + uint32_t set, uint32_t type, char *name); struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, @@ -591,6 +673,25 @@ int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); +void ipfw_init_obj_rewriter(void); +void ipfw_destroy_obj_rewriter(void); +void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); +int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); + +int ipfw_rewrite_rule_uidx(struct ip_fw_chain *chain, + struct rule_check_info *ci); +int ipfw_mark_object_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, + uint32_t *bmask); +int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, + struct obj_idx *pidx, int *found, int *unresolved); +void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *end); +int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti); +void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx); +int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx); +void ipfw_init_srv(struct ip_fw_chain *ch); +void ipfw_destroy_srv(struct ip_fw_chain *ch); /* In ip_fw_table.c */ struct table_info; diff --git a/sys/netpfil/ipfw/ip_fw_sockopt.c b/sys/netpfil/ipfw/ip_fw_sockopt.c index ef1ff6c..d618a6c 100644 --- a/sys/netpfil/ipfw/ip_fw_sockopt.c +++ b/sys/netpfil/ipfw/ip_fw_sockopt.c @@ -148,6 +148,21 @@ static struct ipfw_sopt_handler scodes[] = { { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, }; +static int +set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule); +struct opcode_obj_rewrite *ipfw_find_op_rw(uint16_t opcode); +static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, + uint32_t *bmask); +static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule); +static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, + struct sockopt_data *sd); + +/* + * Opcode object rewriter variables + */ +struct opcode_obj_rewrite *ctl3_rewriters; +static size_t ctl3_rsize; + /* * static variables followed by global ones */ @@ -646,17 +661,18 @@ commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) struct ip_fw *krule; struct ip_fw **map; /* the new array of pointers */ - /* Check if we need to do table remap */ + /* Check if we need to do table/obj index remap */ tcount = 0; for (ci = rci, i = 0; i < count; ci++, i++) { - if (ci->table_opcodes == 0) + if (ci->object_opcodes == 0) continue; /* - * Rule has some table opcodes. - * Reference & allocate needed tables/ + * Rule has some object opcodes. + * We need to find (and create non-existing) + * kernel objects, and reference existing ones. */ - error = ipfw_rewrite_table_uidx(chain, ci); + error = ipfw_rewrite_rule_uidx(chain, ci); if (error != 0) { /* @@ -674,9 +690,9 @@ commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) IPFW_UH_WLOCK(chain); while (ci != rci) { ci--; - if (ci->table_opcodes == 0) + if (ci->object_opcodes == 0) continue; - ipfw_unref_rule_tables(chain,ci->krule); + unref_rule_objects(chain,ci->krule); } IPFW_UH_WUNLOCK(chain); @@ -696,10 +712,10 @@ commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) /* Unbind tables */ IPFW_UH_WLOCK(chain); for (ci = rci, i = 0; i < count; ci++, i++) { - if (ci->table_opcodes == 0) + if (ci->object_opcodes == 0) continue; - ipfw_unref_rule_tables(chain, ci->krule); + unref_rule_objects(chain, ci->krule); } IPFW_UH_WUNLOCK(chain); } @@ -759,7 +775,7 @@ ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, IPFW_UH_WLOCK_ASSERT(chain); /* Unlink rule from everywhere */ - ipfw_unref_rule_tables(chain, rule); + unref_rule_objects(chain, rule); *((struct ip_fw **)rule) = *head; *head = rule; @@ -1542,7 +1558,7 @@ check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; - ci->table_opcodes++; + ci->object_opcodes++; break; case O_IP_FLOW_LOOKUP: if (cmd->arg1 >= V_fw_tables_max) { @@ -1553,7 +1569,7 @@ check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; - ci->table_opcodes++; + ci->object_opcodes++; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) @@ -1587,7 +1603,7 @@ check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) case O_XMIT: case O_VIA: if (((ipfw_insn_if *)cmd)->name[0] == '\1') - ci->table_opcodes++; + ci->object_opcodes++; if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; @@ -1788,7 +1804,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { bcopy(rule, bp, l + sizeof(uint32_t)); - error = ipfw_rewrite_table_kidx(chain, + error = set_legacy_obj_kidx(chain, (struct ip_fw_rule0 *)bp); if (error != 0) return (0); @@ -1817,7 +1833,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) } dst = (struct ip_fw_rule0 *)bp; export_rule0(rule, dst, l); - error = ipfw_rewrite_table_kidx(chain, dst); + error = set_legacy_obj_kidx(chain, dst); /* * XXX HACK. Store the disable mask in the "next" @@ -1861,6 +1877,34 @@ struct dump_args { }; /* + * Export named object info in instance @ni, identified by @kidx + * to ipfw_obj_ntlv. TLV is allocated from @sd space. + * + * Returns 0 on success. + */ +static int +export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, + struct sockopt_data *sd) +{ + struct named_object *no; + ipfw_obj_ntlv *ntlv; + + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, ("invalid object kernel index passed")); + + ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); + if (ntlv == NULL) + return (ENOMEM); + + ntlv->head.type = no->etlv; + ntlv->head.length = sizeof(*ntlv); + ntlv->idx = no->kidx; + strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); + + return (0); +} + +/* * Dumps static rules with table TLVs in buffer @sd. * * Returns 0 on success. @@ -1874,6 +1918,7 @@ dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, uint32_t tcount; ipfw_obj_ctlv *ctlv; struct ip_fw *krule; + struct namedobj_instance *ni; caddr_t dst; /* Dump table names first (if any) */ @@ -1891,13 +1936,21 @@ dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, i = 0; tcount = da->tcount; + ni = ipfw_get_table_objhash(chain); while (tcount > 0) { if ((bmask[i / 32] & (1 << (i % 32))) == 0) { i++; continue; } - if ((error = ipfw_export_table_ntlv(chain, i, sd)) != 0) + /* Jump to shared named object bitmask */ + if (i >= IPFW_TABLES_MAX) { + ni = CHAIN_TO_SRV(chain); + i -= IPFW_TABLES_MAX; + bmask += IPFW_TABLES_MAX / 32; + } + + if ((error = export_objhash_ntlv(ni, i, sd)) != 0) return (error); i++; @@ -1929,6 +1982,52 @@ dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, } /* + * Marks every object index used in @rule with bit in @bmask. + * Used to generate bitmask of referenced tables/objects for given ruleset + * or its part. + * + * Returns number of newly-referenced objects. + */ +static int +mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule, + uint32_t *bmask) +{ + int cmdlen, l, count; + ipfw_insn *cmd; + uint16_t kidx; + struct opcode_obj_rewrite *rw; + int bidx; + uint8_t subtype; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + count = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + rw = ipfw_find_op_rw(cmd->opcode); + if (rw == NULL) + continue; + + if (rw->classifier(cmd, &kidx, &subtype) != 0) + continue; + + bidx = kidx / 32; + /* Maintain separate bitmasks for table and non-table objects */ + if (rw->etlv != IPFW_TLV_TBL_NAME) + bidx += IPFW_TABLES_MAX / 32; + + if ((bmask[bidx] & (1 << (kidx % 32))) == 0) + count++; + + bmask[bidx] |= 1 << (kidx % 32); + } + + return (count); +} + +/* * Dumps requested objects data * Data layout (version 0)(current): * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags @@ -1963,9 +2062,9 @@ dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, error = 0; bmask = NULL; - /* Allocate needed state */ + /* Allocate needed state. Note we allocate 2xspace mask, for table&srv */ if (hdr->flags & IPFW_CFG_GET_STATIC) - bmask = malloc(IPFW_TABLES_MAX / 8, M_TEMP, M_WAITOK | M_ZERO); + bmask = malloc(IPFW_TABLES_MAX / 4, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); @@ -1994,7 +2093,8 @@ dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, rule = chain->map[i]; da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); da.rcount++; - da.tcount += ipfw_mark_table_kidx(chain, rule, bmask); + /* Update bitmask of used objects for given range */ + da.tcount += mark_object_kidx(chain, rule, bmask); } /* Add counters if requested */ if (hdr->flags & IPFW_CFG_GET_COUNTERS) { @@ -2064,6 +2164,241 @@ check_object_name(ipfw_obj_ntlv *ntlv) } /* + * Creates non-existent objects referenced by rule. + * + * Return 0 on success. + */ +int +create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, + struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti) +{ + struct opcode_obj_rewrite *rw; + struct obj_idx *p; + uint16_t kidx; + int error; + + /* + * Compatibility stuff: do actual creation for non-existing, + * but referenced objects. + */ + for (p = oib; p < pidx; p++) { + if (p->kidx != 0) + continue; + + ti->uidx = p->uidx; + ti->type = p->type; + ti->atype = 0; + + rw = ipfw_find_op_rw((cmd + p->off)->opcode); + KASSERT(rw != NULL, ("Unable to find handler for op %d", + (cmd + p->off)->opcode)); + + error = rw->create_object(ch, ti, &kidx); + if (error == 0) { + p->kidx = kidx; + continue; + } + + /* + * Error happened. We have to rollback everything. + * Drop all already acquired references. + */ + IPFW_UH_WLOCK(ch); + unref_oib_objects(ch, cmd, oib, pidx); + IPFW_UH_WUNLOCK(ch); + + return (error); + } + + return (0); +} + +/* + * Compatibility function for old ipfw(8) binaries. + * Rewrites table/nat kernel indices with userland ones. + * Convert tables matching '/^\d+$/' to their atoi() value. + * Use number 65535 for other tables. + * + * Returns 0 on success. + */ +static int +set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule) +{ + int cmdlen, error, l; + ipfw_insn *cmd; + uint16_t kidx, uidx; + struct named_object *no; + struct opcode_obj_rewrite *rw; + uint8_t subtype; + char *end; + long val; + + error = 0; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + rw = ipfw_find_op_rw(cmd->opcode); + if (rw == NULL) + continue; + + /* Check if is index in given opcode */ + if (rw->classifier(cmd, &kidx, &subtype) != 0) + continue; + + /* Try to find referenced kernel object */ + no = rw->find_bykidx(ch, kidx); + if (no == NULL) + continue; + + val = strtol(no->name, &end, 10); + if (*end == '\0' && val < 65535) { + uidx = val; + } else { + + /* + * We are called via legacy opcode. + * Save error and show table as fake number + * not to make ipfw(8) hang. + */ + uidx = 65535; + error = 2; + } + + rw->update(cmd, uidx); + } + + return (error); +} + + +/* + * Unreferences all already-referenced objects in given @cmd rule, + * using information in @oib. + * + * Used to rollback partially converted rule on error. + */ +void +unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, + struct obj_idx *end) +{ + struct opcode_obj_rewrite *rw; + struct named_object *no; + struct obj_idx *p; + + IPFW_UH_WLOCK_ASSERT(ch); + + for (p = oib; p < end; p++) { + if (p->kidx == 0) + continue; + + rw = ipfw_find_op_rw((cmd + p->off)->opcode); + KASSERT(rw != NULL, ("Unable to find handler for op %d", + (cmd + p->off)->opcode)); + + /* Find & unref by existing idx */ + no = rw->find_bykidx(ch, p->kidx); + KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx)); + no->refcnt--; + } +} + +/* + * Remove references from every object used in @rule. + * Used at rule removal code. + */ +static void +unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule) +{ + int cmdlen, l; + ipfw_insn *cmd; + struct named_object *no; + uint16_t kidx; + struct opcode_obj_rewrite *rw; + uint8_t subtype; + + IPFW_UH_WLOCK_ASSERT(ch); + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + rw = ipfw_find_op_rw(cmd->opcode); + if (rw == NULL) + continue; + if (rw->classifier(cmd, &kidx, &subtype) != 0) + continue; + + no = rw->find_bykidx(ch, kidx); + + KASSERT(no != NULL, ("table id %d not found", kidx)); + KASSERT(no->subtype == subtype, + ("wrong type %d (%d) for table id %d", + no->subtype, subtype, kidx)); + KASSERT(no->refcnt > 0, ("refcount for table %d is %d", + kidx, no->refcnt)); + + no->refcnt--; + } +} + + +/* + * Find and reference object (if any) stored in instruction @cmd. + * + * Saves object info in @pidx, sets + * - @found to 1 if object was found and references + * - @unresolved to 1 if object should exists but not found + * + * Returns non-zero value in case of error. + */ +int +ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, + struct obj_idx *pidx, int *found, int *unresolved) +{ + struct named_object *no; + struct opcode_obj_rewrite *rw; + int error; + + *found = 0; + *unresolved = 0; + + /* Check if this opcode is candidate for rewrite */ + rw = ipfw_find_op_rw(cmd->opcode); + if (rw == NULL) + return (0); + + /* Check if we need to rewrite this opcode */ + if (rw->classifier(cmd, &ti->uidx, &ti->type) != 0) + return (0); + + /* Need to rewrite. Save necessary fields */ + pidx->uidx = ti->uidx; + pidx->type = ti->type; + + /* Try to find referenced kernel object */ + error = rw->find_byname(ch, ti, &no); + if (error != 0) + return (error); + if (no == NULL) { + *unresolved = 1; + return (0); + } + + /* Found. bump refcount */ + *found = 1; + no->refcnt++; + pidx->kidx = no->kidx; + + return (0); +} + +/* * Adds one or more rules to ipfw @chain. * Data layout (version 0)(current): * Request: @@ -2315,6 +2650,160 @@ dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, } /* + * Compares two opcodes. + * Used both in qsort() and bsearch(). + * + * Returns 0 if match is found. + */ +static int +compare_opcodes(const void *_a, const void *_b) +{ + const struct opcode_obj_rewrite *a, *b; + + a = (const struct opcode_obj_rewrite *)_a; + b = (const struct opcode_obj_rewrite *)_b; + + if (a->opcode < b->opcode) + return (-1); + else if (a->opcode > b->opcode) + return (1); + + return (0); +} + +/* + * Finds opcode object rewriter based on @code. + * + * Returns pointer to handler or NULL. + */ +struct opcode_obj_rewrite * +ipfw_find_op_rw(uint16_t opcode) +{ + struct opcode_obj_rewrite *rw, h; + + memset(&h, 0, sizeof(h)); + h.opcode = opcode; + + rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters, + ctl3_rsize, sizeof(h), compare_opcodes); + + return (rw); +} + +int +classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx) +{ + struct opcode_obj_rewrite *rw; + uint8_t subtype; + + rw = ipfw_find_op_rw(cmd->opcode); + if (rw == NULL) + return (1); + + return (rw->classifier(cmd, puidx, &subtype)); +} + +void +update_opcode_kidx(ipfw_insn *cmd, uint16_t idx) +{ + struct opcode_obj_rewrite *rw; + + rw = ipfw_find_op_rw(cmd->opcode); + KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode)); + rw->update(cmd, idx); +} + +void +ipfw_init_obj_rewriter() +{ + + ctl3_rewriters = NULL; + ctl3_rsize = 0; +} + +void +ipfw_destroy_obj_rewriter() +{ + + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = NULL; + ctl3_rsize = 0; +} + +/* + * Adds one or more opcode object rewrite handlers to the global array. + * Function may sleep. + */ +void +ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) +{ + size_t sz; + struct opcode_obj_rewrite *tmp; + + CTL3_LOCK(); + + for (;;) { + sz = ctl3_rsize + count; + CTL3_UNLOCK(); + tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO); + CTL3_LOCK(); + if (ctl3_rsize + count <= sz) + break; + + /* Retry */ + free(tmp, M_IPFW); + } + + /* Merge old & new arrays */ + sz = ctl3_rsize + count; + memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw)); + memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw)); + qsort(tmp, sz, sizeof(*rw), compare_opcodes); + /* Switch new and free old */ + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = tmp; + ctl3_rsize = sz; + + CTL3_UNLOCK(); +} + +/* + * Removes one or more object rewrite handlers from the global array. + */ +int +ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) +{ + size_t sz; + struct opcode_obj_rewrite *tmp, *h; + int i; + + CTL3_LOCK(); + + for (i = 0; i < count; i++) { + tmp = &rw[i]; + h = ipfw_find_op_rw(tmp->opcode); + if (h == NULL) + continue; + + sz = (ctl3_rewriters + ctl3_rsize - (h + 1)) * sizeof(*h); + memmove(h, h + 1, sz); + ctl3_rsize--; + } + + if (ctl3_rsize == 0) { + if (ctl3_rewriters != NULL) + free(ctl3_rewriters, M_IPFW); + ctl3_rewriters = NULL; + } + + CTL3_UNLOCK(); + + return (0); +} + +/* * Compares two sopt handlers (code, version and handler ptr). * Used both as qsort() and bsearch(). * Does not compare handler for latter case. @@ -3150,6 +3639,23 @@ convert_rule_to_8(struct ip_fw_rule0 *rule) * */ +void +ipfw_init_srv(struct ip_fw_chain *ch) +{ + + ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT); + ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT, + M_IPFW, M_WAITOK | M_ZERO); +} + +void +ipfw_destroy_srv(struct ip_fw_chain *ch) +{ + + free(ch->srvstate, M_IPFW); + ipfw_objhash_destroy(ch->srvmap); +} + /* * Allocate new bitmask which can be used to enlarge/shrink * named instance index. @@ -3323,6 +3829,26 @@ ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) return (NULL); } +/* + * Find named object by name, considering also its TLV type. + */ +struct named_object * +ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, + uint32_t type, char *name) +{ + struct named_object *no; + uint32_t hash; + + hash = ni->hash_f(ni, name, set) % ni->nn_size; + + TAILQ_FOREACH(no, &ni->names[hash], nn_next) { + if (ni->cmp_f(no, name, set) == 0 && no->etlv == type) + return (no); + } + + return (NULL); +} + struct named_object * ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) { diff --git a/sys/netpfil/ipfw/ip_fw_table.c b/sys/netpfil/ipfw/ip_fw_table.c index 4498ace..ca4e0e4 100644 --- a/sys/netpfil/ipfw/ip_fw_table.c +++ b/sys/netpfil/ipfw/ip_fw_table.c @@ -89,6 +89,8 @@ struct table_config { struct namedobj_instance *vi; }; +static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti, + struct table_config **tc); static struct table_config *find_table(struct namedobj_instance *ni, struct tid_info *ti); static struct table_config *alloc_table_config(struct ip_fw_chain *ch, @@ -122,7 +124,6 @@ static struct table_algo *find_table_algo(struct tables_config *tableconf, static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); -static int classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); #define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) #define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) @@ -297,7 +298,7 @@ find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, tc = NULL; if ((tc = find_table(ni, ti)) != NULL) { /* check table type */ - if (tc->no.type != ti->type) + if (tc->no.subtype != ti->type) return (EINVAL); if (tc->locked != 0) @@ -1116,7 +1117,7 @@ find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, } /* check table type */ - if (tc->no.type != ti.type) { + if (tc->no.subtype != ti.type) { IPFW_UH_RUNLOCK(ch); return (EINVAL); } @@ -1399,7 +1400,7 @@ swap_tables(struct ip_fw_chain *ch, struct tid_info *a, } /* Check type and value are the same */ - if (tc_a->no.type != tc_b->no.type || tc_a->tflags != tc_b->tflags) { + if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } @@ -1613,7 +1614,6 @@ ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) ipfw_insn *cmd; int cmdlen, i, l; uint16_t kidx; - uint8_t type; IPFW_UH_WLOCK(ch); @@ -1636,7 +1636,7 @@ ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &kidx, &type) != 0) + if (classify_opcode_kidx(cmd, &kidx) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); @@ -1920,7 +1920,7 @@ create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, * requesting to create existing table * which has the same type */ - if (compat == 0 || tc_new->no.type != tc->no.type) { + if (compat == 0 || tc_new->no.subtype != tc->no.subtype) { IPFW_UH_WUNLOCK(ch); free_table_config(ni, tc); return (EEXIST); @@ -1940,6 +1940,7 @@ create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, return (EBUSY); } tc->no.kidx = kidx; + tc->no.etlv = IPFW_TLV_TBL_NAME; IPFW_WLOCK(ch); link_table(ch, tc); @@ -1977,6 +1978,13 @@ objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) ntlv_to_ti(&oh->ntlv, ti); } +struct namedobj_instance * +ipfw_get_table_objhash(struct ip_fw_chain *ch) +{ + + return (CHAIN_TO_NI(ch)); +} + /* * Exports basic table info as name TLV. * Used inside dump_static_rules() to provide info @@ -2009,40 +2017,6 @@ ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, return (0); } -/* - * Marks every table kidx used in @rule with bit in @bmask. - * Used to generate bitmask of referenced tables for given ruleset. - * - * Returns number of newly-referenced tables. - */ -int -ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, - uint32_t *bmask) -{ - int cmdlen, l, count; - ipfw_insn *cmd; - uint16_t kidx; - uint8_t type; - - l = rule->cmd_len; - cmd = rule->cmd; - cmdlen = 0; - count = 0; - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - if (classify_table_opcode(cmd, &kidx, &type) != 0) - continue; - - if ((bmask[kidx / 32] & (1 << (kidx % 32))) == 0) - count++; - - bmask[kidx / 32] |= 1 << (kidx % 32); - } - - return (count); -} - struct dump_args { struct ip_fw_chain *ch; struct table_info *ti; @@ -2111,7 +2085,7 @@ export_table_info(struct ip_fw_chain *ch, struct table_config *tc, struct table_info *ti; struct table_algo *ta; - i->type = tc->no.type; + i->type = tc->no.subtype; i->tflags = tc->tflags; i->vmask = tc->vmask; i->set = tc->no.set; @@ -2296,7 +2270,7 @@ dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, xtbl->cnt = count; xtbl->size = sz; - xtbl->type = tc->no.type; + xtbl->type = tc->no.subtype; xtbl->tbl = ti.uidx; if (sd->valsize < sz) { @@ -2440,7 +2414,7 @@ ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, ta = tc->ta; /* This dump format supports IPv4 only */ - if (tc->no.type != IPFW_TABLE_ADDR) + if (tc->no.subtype != IPFW_TABLE_ADDR) return (0); memset(&da, 0, sizeof(da)); @@ -2531,7 +2505,7 @@ dump_table_xentry(void *e, void *arg) pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); xent->value = ipfw_export_table_value_legacy(pval); /* Apply some hacks */ - if (tc->no.type == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { + if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; xent->flags = IPFW_TCF_INET; } else @@ -2781,114 +2755,157 @@ list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3, return (0); } -/* - * Tables rewriting code - */ - -/* - * Determine table number and lookup type for @cmd. - * Fill @tbl and @type with appropriate values. - * Returns 0 for relevant opcodes, 1 otherwise. - */ static int -classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { - ipfw_insn_if *cmdif; - int skip; - uint16_t v; - - skip = 1; - - switch (cmd->opcode) { - case O_IP_SRC_LOOKUP: - case O_IP_DST_LOOKUP: - /* Basic IPv4/IPv6 or u32 lookups */ - *puidx = cmd->arg1; - /* Assume ADDR by default */ - *ptype = IPFW_TABLE_ADDR; - skip = 0; + /* Basic IPv4/IPv6 or u32 lookups */ + *puidx = cmd->arg1; + /* Assume ADDR by default */ + *ptype = IPFW_TABLE_ADDR; + int v; - if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { - /* - * generic lookup. The key must be - * in 32bit big-endian format. - */ - v = ((ipfw_insn_u32 *)cmd)->d[1]; - switch (v) { - case 0: - case 1: - /* IPv4 src/dst */ - break; - case 2: - case 3: - /* src/dst port */ - *ptype = IPFW_TABLE_NUMBER; - break; - case 4: - /* uid/gid */ - *ptype = IPFW_TABLE_NUMBER; - break; - case 5: - /* jid */ - *ptype = IPFW_TABLE_NUMBER; - break; - case 6: - /* dscp */ - *ptype = IPFW_TABLE_NUMBER; - break; - } - } - break; - case O_XMIT: - case O_RECV: - case O_VIA: - /* Interface table, possibly */ - cmdif = (ipfw_insn_if *)cmd; - if (cmdif->name[0] != '\1') + if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { + /* + * generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + switch (v) { + case 0: + case 1: + /* IPv4 src/dst */ break; - - *ptype = IPFW_TABLE_INTERFACE; - *puidx = cmdif->p.kidx; - skip = 0; - break; - case O_IP_FLOW_LOOKUP: - *puidx = cmd->arg1; - *ptype = IPFW_TABLE_FLOW; - skip = 0; - break; + case 2: + case 3: + /* src/dst port */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 4: + /* uid/gid */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 5: + /* jid */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 6: + /* dscp */ + *ptype = IPFW_TABLE_NUMBER; + break; + } } - return (skip); + return (0); +} + +static int +classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + ipfw_insn_if *cmdif; + + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + if (cmdif->name[0] != '\1') + return (1); + + *ptype = IPFW_TABLE_INTERFACE; + *puidx = cmdif->p.kidx; + + return (0); +} + +static int +classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + + *puidx = cmd->arg1; + *ptype = IPFW_TABLE_FLOW; + + return (0); +} + +static void +update_arg1(ipfw_insn *cmd, uint16_t idx) +{ + + cmd->arg1 = idx; } -/* - * Sets new table value for given opcode. - * Assume the same opcodes as classify_table_opcode() - */ static void -update_table_opcode(ipfw_insn *cmd, uint16_t idx) +update_via(ipfw_insn *cmd, uint16_t idx) { ipfw_insn_if *cmdif; - switch (cmd->opcode) { - case O_IP_SRC_LOOKUP: - case O_IP_DST_LOOKUP: - /* Basic IPv4/IPv6 or u32 lookups */ - cmd->arg1 = idx; - break; - case O_XMIT: - case O_RECV: - case O_VIA: - /* Interface table, possibly */ - cmdif = (ipfw_insn_if *)cmd; - cmdif->p.kidx = idx; - break; - case O_IP_FLOW_LOOKUP: - cmd->arg1 = idx; - break; - } + cmdif = (ipfw_insn_if *)cmd; + cmdif->p.kidx = idx; } +static int +table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, + struct named_object **pno) +{ + struct table_config *tc; + int error; + + IPFW_UH_WLOCK_ASSERT(ch); + + error = find_table_err(CHAIN_TO_NI(ch), ti, &tc); + if (error != 0) + return (error); + + *pno = &tc->no; + return (0); +} + +/* XXX: sets-sets! */ +static struct named_object * +table_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +{ + struct namedobj_instance *ni; + struct table_config *tc; + + IPFW_UH_WLOCK_ASSERT(ch); + ni = CHAIN_TO_NI(ch); + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx); + KASSERT(tc != NULL, ("Table with index %d not found", idx)); + + return (&tc->no); +} + +static struct opcode_obj_rewrite opcodes[] = { + { + O_IP_SRC_LOOKUP, IPFW_TLV_TBL_NAME, + classify_srcdst, update_arg1, + table_findbyname, table_findbykidx, create_table_compat + }, + { + O_IP_DST_LOOKUP, IPFW_TLV_TBL_NAME, + classify_srcdst, update_arg1, + table_findbyname, table_findbykidx, create_table_compat + }, + { + O_IP_FLOW_LOOKUP, IPFW_TLV_TBL_NAME, + classify_flow, update_arg1, + table_findbyname, table_findbykidx, create_table_compat + }, + { + O_XMIT, IPFW_TLV_TBL_NAME, + classify_via, update_via, + table_findbyname, table_findbykidx, create_table_compat + }, + { + O_RECV, IPFW_TLV_TBL_NAME, + classify_via, update_via, + table_findbyname, table_findbykidx, create_table_compat + }, + { + O_VIA, IPFW_TLV_TBL_NAME, + classify_via, update_via, + table_findbyname, table_findbykidx, create_table_compat + }, +}; + + /* * Checks table name for validity. * Enforce basic length checks, the rest @@ -2960,10 +2977,11 @@ find_name_tlv(void *tlvs, int len, uint16_t uidx) * or name in ntlv. * Note @ti structure contains unchecked data from userland. * - * Returns pointer to table_config or NULL. + * Returns 0 in success and fills in @tc with found config */ -static struct table_config * -find_table(struct namedobj_instance *ni, struct tid_info *ti) +static int +find_table_err(struct namedobj_instance *ni, struct tid_info *ti, + struct table_config **tc) { char *name, bname[16]; struct named_object *no; @@ -2973,7 +2991,7 @@ find_table(struct namedobj_instance *ni, struct tid_info *ti) if (ti->tlvs != NULL) { ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); if (ntlv == NULL) - return (NULL); + return (EINVAL); name = ntlv->name; /* @@ -2989,8 +3007,27 @@ find_table(struct namedobj_instance *ni, struct tid_info *ti) } no = ipfw_objhash_lookup_name(ni, set, name); + *tc = (struct table_config *)no; + + return (0); +} + +/* + * Finds table config based on either legacy index + * or name in ntlv. + * Note @ti structure contains unchecked data from userland. + * + * Returns pointer to table_config or NULL. + */ +static struct table_config * +find_table(struct namedobj_instance *ni, struct tid_info *ti) +{ + struct table_config *tc; + + if (find_table_err(ni, ti, &tc) != 0) + return (NULL); - return ((struct table_config *)no); + return (tc); } /* @@ -3016,6 +3053,7 @@ alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, name = ntlv->name; set = ntlv->set; } else { + /* Compat part: convert number to string representation */ snprintf(bname, sizeof(bname), "%d", ti->uidx); name = bname; set = 0; @@ -3023,7 +3061,7 @@ alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); tc->no.name = tc->tablename; - tc->no.type = ta->type; + tc->no.subtype = ta->type; tc->no.set = set; tc->tflags = tflags; tc->ta = ta; @@ -3031,11 +3069,6 @@ alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, /* Set "shared" value type by default */ tc->vshared = 1; - if (ti->tlvs == NULL) { - tc->no.compat = 1; - tc->no.uidx = ti->uidx; - } - /* Preallocate data structures for new tables */ error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); if (error != 0) { @@ -3211,7 +3244,6 @@ ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, struct namedobj_instance *ni; int bad, i, l, cmdlen; uint16_t kidx; - uint8_t type; ipfw_insn *cmd; IPFW_UH_WLOCK_ASSERT(ch); @@ -3229,7 +3261,7 @@ ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &kidx, &type) != 0) + if (classify_opcode_kidx(cmd, &kidx) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, @@ -3252,7 +3284,7 @@ ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &kidx, &type) != 0) + if (classify_opcode_kidx(cmd, &kidx) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, @@ -3291,7 +3323,7 @@ ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &kidx, &type) != 0) + if (classify_opcode_kidx(cmd, &kidx) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, @@ -3313,215 +3345,68 @@ ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, } /* - * Finds and bumps refcount for tables referenced by given @rule. + * Finds and bumps refcount for objects referenced by given @rule. * Auto-creates non-existing tables. * Fills in @oib array with userland/kernel indexes. - * First free oidx pointer is saved back in @oib. * * Returns 0 on success. */ static int -find_ref_rule_tables(struct ip_fw_chain *ch, struct ip_fw *rule, - struct rule_check_info *ci, struct obj_idx **oib, struct tid_info *ti) +ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, + struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti) { - struct table_config *tc; - struct namedobj_instance *ni; - struct named_object *no; int cmdlen, error, l, numnew; - uint16_t kidx; ipfw_insn *cmd; - struct obj_idx *pidx, *pidx_first, *p; + struct obj_idx *pidx; + int found, unresolved; - pidx_first = *oib; - pidx = pidx_first; + pidx = oib; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; error = 0; numnew = 0; + found = 0; + unresolved = 0; IPFW_UH_WLOCK(ch); - ni = CHAIN_TO_NI(ch); /* Increase refcount on each existing referenced table. */ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &ti->uidx, &ti->type) != 0) - continue; - - pidx->uidx = ti->uidx; - pidx->type = ti->type; - - if ((tc = find_table(ni, ti)) != NULL) { - if (tc->no.type != ti->type) { - /* Incompatible types */ - error = EINVAL; - break; - } - - /* Reference found table and save kidx */ - tc->no.refcnt++; - pidx->kidx = tc->no.kidx; + error = ref_opcode_object(ch, cmd, ti, pidx, &found, &unresolved); + if (error != 0) + break; + if (found || unresolved) { + pidx->off = rule->cmd_len - l; pidx++; - continue; } - /* * Compability stuff for old clients: - * prepare to manually create non-existing tables. + * prepare to manually create non-existing objects. */ - pidx++; - numnew++; + if (unresolved) + numnew++; } if (error != 0) { /* Unref everything we have already done */ - for (p = *oib; p < pidx; p++) { - if (p->kidx == 0) - continue; - - /* Find & unref by existing idx */ - no = ipfw_objhash_lookup_kidx(ni, p->kidx); - KASSERT(no != NULL, ("Ref'd table %d disappeared", - p->kidx)); - - no->refcnt--; - } - } - - IPFW_UH_WUNLOCK(ch); - - if (numnew == 0) { - *oib = pidx; - return (error); - } - - /* - * Compatibility stuff: do actual creation for non-existing, - * but referenced tables. - */ - for (p = pidx_first; p < pidx; p++) { - if (p->kidx != 0) - continue; - - ti->uidx = p->uidx; - ti->type = p->type; - ti->atype = 0; - - error = create_table_compat(ch, ti, &kidx); - if (error == 0) { - p->kidx = kidx; - continue; - } - - /* Error. We have to drop references */ - IPFW_UH_WLOCK(ch); - for (p = pidx_first; p < pidx; p++) { - if (p->kidx == 0) - continue; - - /* Find & unref by existing idx */ - no = ipfw_objhash_lookup_kidx(ni, p->kidx); - KASSERT(no != NULL, ("Ref'd table %d disappeared", - p->kidx)); - - no->refcnt--; - } + unref_oib_objects(ch, rule->cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); - return (error); } - *oib = pidx; - - return (error); -} - -/* - * Remove references from every table used in @rule. - */ -void -ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule) -{ - int cmdlen, l; - ipfw_insn *cmd; - struct namedobj_instance *ni; - struct named_object *no; - uint16_t kidx; - uint8_t type; - - IPFW_UH_WLOCK_ASSERT(chain); - ni = CHAIN_TO_NI(chain); - - l = rule->cmd_len; - cmd = rule->cmd; - cmdlen = 0; - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - if (classify_table_opcode(cmd, &kidx, &type) != 0) - continue; - - no = ipfw_objhash_lookup_kidx(ni, kidx); - - KASSERT(no != NULL, ("table id %d not found", kidx)); - KASSERT(no->type == type, ("wrong type %d (%d) for table id %d", - no->type, type, kidx)); - KASSERT(no->refcnt > 0, ("refcount for table %d is %d", - kidx, no->refcnt)); - - no->refcnt--; - } -} - -/* - * Compatibility function for old ipfw(8) binaries. - * Rewrites table kernel indices with userland ones. - * Convert tables matching '/^\d+$/' to their atoi() value. - * Use number 65535 for other tables. - * - * Returns 0 on success. - */ -int -ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, struct ip_fw_rule0 *rule) -{ - int cmdlen, error, l; - ipfw_insn *cmd; - uint16_t kidx, uidx; - uint8_t type; - struct named_object *no; - struct namedobj_instance *ni; - - ni = CHAIN_TO_NI(chain); - error = 0; - - l = rule->cmd_len; - cmd = rule->cmd; - cmdlen = 0; - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - if (classify_table_opcode(cmd, &kidx, &type) != 0) - continue; + IPFW_UH_WUNLOCK(ch); - if ((no = ipfw_objhash_lookup_kidx(ni, kidx)) == NULL) - return (1); + found = pidx - oib; + KASSERT(found == ci->object_opcodes, + ("refcount inconsistency: found: %d total: %d", + found, ci->object_opcodes)); - uidx = no->uidx; - if (no->compat == 0) { - - /* - * We are called via legacy opcode. - * Save error and show table as fake number - * not to make ipfw(8) hang. - */ - uidx = 65535; - error = 2; - } - - update_table_opcode(cmd, uidx); - } + /* Perform auto-creation for non-existing objects */ + if (numnew != 0) + error = create_objects_compat(ch, rule->cmd, oib, pidx, ti); return (error); } @@ -3534,31 +3419,27 @@ ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, struct ip_fw_rule0 *rule) * Returns 0 on success and appropriate error code otherwise. */ int -ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, +ipfw_rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) { - int cmdlen, error, l; + int error; ipfw_insn *cmd; - uint16_t uidx; uint8_t type; - struct namedobj_instance *ni; struct obj_idx *p, *pidx_first, *pidx_last; struct tid_info ti; - ni = CHAIN_TO_NI(chain); - /* * Prepare an array for storing opcode indices. * Use stack allocation by default. */ - if (ci->table_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { + if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { /* Stack */ pidx_first = ci->obuf; } else - pidx_first = malloc(ci->table_opcodes * sizeof(struct obj_idx), + pidx_first = malloc(ci->object_opcodes * sizeof(struct obj_idx), M_IPFW, M_WAITOK | M_ZERO); - pidx_last = pidx_first; + pidx_last = pidx_first + ci->object_opcodes; error = 0; type = 0; memset(&ti, 0, sizeof(ti)); @@ -3573,28 +3454,18 @@ ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); } - /* Reference all used tables */ - error = find_ref_rule_tables(chain, ci->krule, ci, &pidx_last, &ti); + /* Reference all used tables and other objects */ + error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti); if (error != 0) goto free; - IPFW_UH_WLOCK(chain); - /* Perform rule rewrite */ - l = ci->krule->cmd_len; - cmd = ci->krule->cmd; - cmdlen = 0; p = pidx_first; - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - if (classify_table_opcode(cmd, &uidx, &type) != 0) - continue; - update_table_opcode(cmd, p->kidx); - p++; + for (p = pidx_first; p < pidx_last; p++) { + cmd = ci->krule->cmd + p->off; + update_opcode_kidx(cmd, p->kidx); } - IPFW_UH_WUNLOCK(chain); - free: if (pidx_first != ci->obuf) free(pidx_first, M_IPFW); @@ -3641,6 +3512,7 @@ ipfw_destroy_tables(struct ip_fw_chain *ch, int last) { IPFW_DEL_SOPT_HANDLER(last, scodes); + IPFW_DEL_OBJ_REWRITER(last, opcodes); /* Remove all tables from working set */ IPFW_UH_WLOCK(ch); @@ -3678,6 +3550,7 @@ ipfw_init_tables(struct ip_fw_chain *ch, int first) ipfw_table_value_init(ch, first); ipfw_table_algo_init(ch); + IPFW_ADD_OBJ_REWRITER(first, opcodes); IPFW_ADD_SOPT_HANDLER(first, scodes); return (0); } diff --git a/sys/netpfil/ipfw/ip_fw_table.h b/sys/netpfil/ipfw/ip_fw_table.h index 028e450..ca49fd4 100644 --- a/sys/netpfil/ipfw/ip_fw_table.h +++ b/sys/netpfil/ipfw/ip_fw_table.h @@ -53,16 +53,6 @@ struct table_info { u_long data; /* Hints for given func */ }; -/* Internal structures for handling sockopt data */ -struct tid_info { - uint32_t set; /* table set */ - uint16_t uidx; /* table index */ - uint8_t type; /* table type */ - uint8_t atype; - void *tlvs; /* Pointer to first TLV */ - int tlen; /* Total TLV size block */ -}; - struct table_value; struct tentry_info { void *paddr; @@ -189,13 +179,12 @@ void rollback_table_values(struct tableop_state *ts); int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci); -int ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, - struct ip_fw_rule0 *rule); int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, uint32_t *bmask); int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, struct sockopt_data *sd); void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule); +struct namedobj_instance *ipfw_get_table_objhash(struct ip_fw_chain *ch); /* utility functions */ int ipfw_check_table_name(char *name); diff --git a/sys/opencrypto/cryptodeflate.c b/sys/opencrypto/cryptodeflate.c index 0ad86e3..c55210d 100644 --- a/sys/opencrypto/cryptodeflate.c +++ b/sys/opencrypto/cryptodeflate.c @@ -29,7 +29,7 @@ /* * This file contains a wrapper around the deflate algo compression - * functions using the zlib library (see net/zlib.{c,h}) + * functions using the zlib library (see libkern/zlib.c and sys/zlib.h}) */ #include <sys/cdefs.h> @@ -42,7 +42,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/sdt.h> #include <sys/systm.h> -#include <net/zlib.h> +#include <sys/zlib.h> #include <opencrypto/cryptodev.h> #include <opencrypto/deflate.h> diff --git a/sys/opencrypto/deflate.h b/sys/opencrypto/deflate.h index dcf7a84..d31a3bf 100644 --- a/sys/opencrypto/deflate.h +++ b/sys/opencrypto/deflate.h @@ -36,7 +36,7 @@ #ifndef _CRYPTO_DEFLATE_H_ #define _CRYPTO_DEFLATE_H_ -#include <net/zlib.h> +#include <sys/zlib.h> #define Z_METHOD 8 #define Z_MEMLEVEL 8 diff --git a/sys/powerpc/aim/machdep.c b/sys/powerpc/aim/aim_machdep.c index 1c22220..ab09ab6 100644 --- a/sys/powerpc/aim/machdep.c +++ b/sys/powerpc/aim/aim_machdep.c @@ -129,106 +129,14 @@ __FBSDID("$FreeBSD$"); #include <dev/ofw/openfirm.h> -int cold = 1; #ifdef __powerpc64__ extern int n_slbs; -int cacheline_size = 128; -#else -int cacheline_size = 32; #endif -int hw_direct_map = 1; - -extern void *ap_pcpu; - -struct pcpu __pcpu[MAXCPU]; - -static struct trapframe frame0; - -char machine[] = "powerpc"; -SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); - -static void cpu_startup(void *); -SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); - -SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, - CTLFLAG_RD, &cacheline_size, 0, ""); - -uintptr_t powerpc_init(vm_offset_t, vm_offset_t, vm_offset_t, void *); - -long Maxmem = 0; -long realmem = 0; #ifndef __powerpc64__ struct bat battable[16]; #endif -struct kva_md_info kmi; - -static void -cpu_startup(void *dummy) -{ - - /* - * Initialise the decrementer-based clock. - */ - decr_init(); - - /* - * Good {morning,afternoon,evening,night}. - */ - cpu_setup(PCPU_GET(cpuid)); - -#ifdef PERFMON - perfmon_init(); -#endif - printf("real memory = %ld (%ld MB)\n", ptoa(physmem), - ptoa(physmem) / 1048576); - realmem = physmem; - - if (bootverbose) - printf("available KVA = %zd (%zd MB)\n", - virtual_end - virtual_avail, - (virtual_end - virtual_avail) / 1048576); - - /* - * Display any holes after the first chunk of extended memory. - */ - if (bootverbose) { - int indx; - - printf("Physical memory chunk(s):\n"); - for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { - vm_offset_t size1 = - phys_avail[indx + 1] - phys_avail[indx]; - - #ifdef __powerpc64__ - printf("0x%016lx - 0x%016lx, %ld bytes (%ld pages)\n", - #else - printf("0x%08x - 0x%08x, %d bytes (%ld pages)\n", - #endif - phys_avail[indx], phys_avail[indx + 1] - 1, size1, - size1 / PAGE_SIZE); - } - } - - vm_ksubmap_init(&kmi); - - printf("avail memory = %ld (%ld MB)\n", ptoa(vm_cnt.v_free_count), - ptoa(vm_cnt.v_free_count) / 1048576); - - /* - * Set up buffers, so they can be used to read disk labels. - */ - bufinit(); - vm_pager_bufferinit(); -} - -extern vm_offset_t __startkernel, __endkernel; -extern unsigned char __bss_start[]; -extern unsigned char __sbss_start[]; -extern unsigned char __sbss_end[]; -extern unsigned char _end[]; - #ifndef __powerpc64__ /* Bits for running on 64-bit systems in 32-bit mode. */ extern void *testppc64, *testppc64size; @@ -252,121 +160,25 @@ extern void *imisstrap, *imisssize; extern void *dlmisstrap, *dlmisssize; extern void *dsmisstrap, *dsmisssize; -uintptr_t -powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp) +extern void *ap_pcpu; + +void aim_cpu_init(vm_offset_t toc); + +void +aim_cpu_init(vm_offset_t toc) { - struct pcpu *pc; - vm_offset_t startkernel, endkernel; size_t trap_offset, trapsize; vm_offset_t trap; - void *kmdp; - char *env; register_t msr, scratch; uint8_t *cache_check; int cacheline_warn; #ifndef __powerpc64__ int ppc64; #endif -#ifdef DDB - vm_offset_t ksym_start; - vm_offset_t ksym_end; -#endif - kmdp = NULL; trap_offset = 0; cacheline_warn = 0; - /* First guess at start/end kernel positions */ - startkernel = __startkernel; - endkernel = __endkernel; - - /* Check for ePAPR loader, which puts a magic value into r6 */ - if (mdp == (void *)0x65504150) - mdp = NULL; - - /* - * Parse metadata if present and fetch parameters. Must be done - * before console is inited so cninit gets the right value of - * boothowto. - */ - if (mdp != NULL) { - preload_metadata = mdp; - kmdp = preload_search_by_type("elf kernel"); - if (kmdp != NULL) { - boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); - kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); - endkernel = ulmax(endkernel, MD_FETCH(kmdp, - MODINFOMD_KERNEND, vm_offset_t)); -#ifdef DDB - ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); - ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); - db_fetch_ksymtab(ksym_start, ksym_end); -#endif - } - } else { - bzero(__sbss_start, __sbss_end - __sbss_start); - bzero(__bss_start, _end - __bss_start); - } - - /* Store boot environment state */ - OF_initial_setup((void *)fdt, NULL, (int (*)(void *))ofentry); - - /* - * Init params/tunables that can be overridden by the loader - */ - init_param1(); - - /* - * Start initializing proc0 and thread0. - */ - proc_linkup0(&proc0, &thread0); - thread0.td_frame = &frame0; - - /* - * Set up per-cpu data. - */ - pc = __pcpu; - pcpu_init(pc, 0, sizeof(struct pcpu)); - pc->pc_curthread = &thread0; -#ifdef __powerpc64__ - __asm __volatile("mr 13,%0" :: "r"(pc->pc_curthread)); -#else - __asm __volatile("mr 2,%0" :: "r"(pc->pc_curthread)); -#endif - pc->pc_cpuid = 0; - - __asm __volatile("mtsprg 0, %0" :: "r"(pc)); - - /* - * Init mutexes, which we use heavily in PMAP - */ - - mutex_init(); - - /* - * Install the OF client interface - */ - - OF_bootstrap(); - - /* - * Initialize the console before printing anything. - */ - cninit(); - - /* - * Complain if there is no metadata. - */ - if (mdp == NULL || kmdp == NULL) { - printf("powerpc_init: no loader metadata.\n"); - } - - /* - * Init KDB - */ - - kdb_init(); - /* Various very early CPU fix ups */ switch (mfpvr() >> 16) { /* @@ -431,9 +243,6 @@ powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp) cacheline_size = 32; } - /* Make sure the kernel icache is valid before we go too much further */ - __syncicache((caddr_t)startkernel, endkernel - startkernel); - #ifndef __powerpc64__ /* * Figure out whether we need to use the 64 bit PMAP. This works by @@ -552,12 +361,6 @@ powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp) } /* - * Choose a platform module so we can get the physical memory map. - */ - - platform_probe_and_attach(); - - /* * Initialise virtual memory. Use BUS_PROBE_GENERIC priority * in case the platform module had a better idea of what we * should do. @@ -566,96 +369,6 @@ powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp) pmap_mmu_install(MMU_TYPE_G5, BUS_PROBE_GENERIC); else pmap_mmu_install(MMU_TYPE_OEA, BUS_PROBE_GENERIC); - - pmap_bootstrap(startkernel, endkernel); - mtmsr(PSL_KERNSET & ~PSL_EE); - - /* - * Initialize params/tunables that are derived from memsize - */ - init_param2(physmem); - - /* - * Grab booted kernel's name - */ - env = kern_getenv("kernelname"); - if (env != NULL) { - strlcpy(kernelname, env, sizeof(kernelname)); - freeenv(env); - } - - /* - * Finish setting up thread0. - */ - thread0.td_pcb = (struct pcb *) - ((thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - - sizeof(struct pcb)) & ~15UL); - bzero((void *)thread0.td_pcb, sizeof(struct pcb)); - pc->pc_curpcb = thread0.td_pcb; - - /* Initialise the message buffer. */ - msgbufinit(msgbufp, msgbufsize); - -#ifdef KDB - if (boothowto & RB_KDB) - kdb_enter(KDB_WHY_BOOTFLAGS, - "Boot flags requested debugger"); -#endif - - return (((uintptr_t)thread0.td_pcb - - (sizeof(struct callframe) - 3*sizeof(register_t))) & ~15UL); -} - -void -bzero(void *buf, size_t len) -{ - caddr_t p; - - p = buf; - - while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { - *p++ = 0; - len--; - } - - while (len >= sizeof(u_long) * 8) { - *(u_long*) p = 0; - *((u_long*) p + 1) = 0; - *((u_long*) p + 2) = 0; - *((u_long*) p + 3) = 0; - len -= sizeof(u_long) * 8; - *((u_long*) p + 4) = 0; - *((u_long*) p + 5) = 0; - *((u_long*) p + 6) = 0; - *((u_long*) p + 7) = 0; - p += sizeof(u_long) * 8; - } - - while (len >= sizeof(u_long)) { - *(u_long*) p = 0; - len -= sizeof(u_long); - p += sizeof(u_long); - } - - while (len) { - *p++ = 0; - len--; - } -} - -void -cpu_boot(int howto) -{ -} - -/* - * Flush the D-cache for non-DMA I/O so that the I-cache can - * be made coherent later. - */ -void -cpu_flush_dcache(void *ptr, size_t len) -{ - /* TBD */ } /* @@ -669,17 +382,6 @@ cpu_halt(void) } int -ptrace_set_pc(struct thread *td, unsigned long addr) -{ - struct trapframe *tf; - - tf = td->td_frame; - tf->srr0 = (register_t)addr; - - return (0); -} - -int ptrace_single_step(struct thread *td) { struct trapframe *tf; @@ -727,41 +429,7 @@ memcpy(pcpu->pc_slb, PCPU_GET(slb), sizeof(pcpu->pc_slb)); #endif } -void -spinlock_enter(void) -{ - struct thread *td; - register_t msr; - - td = curthread; - if (td->td_md.md_spinlock_count == 0) { - __asm __volatile("or 2,2,2"); /* Set high thread priority */ - msr = intr_disable(); - td->td_md.md_spinlock_count = 1; - td->td_md.md_saved_msr = msr; - } else - td->td_md.md_spinlock_count++; - critical_enter(); -} - -void -spinlock_exit(void) -{ - struct thread *td; - register_t msr; - - td = curthread; - critical_exit(); - msr = td->td_md.md_saved_msr; - td->td_md.md_spinlock_count--; - if (td->td_md.md_spinlock_count == 0) { - intr_restore(msr); - __asm __volatile("or 6,6,6"); /* Set normal thread priority */ - } -} - #ifndef __powerpc64__ - uint64_t va_to_vsid(pmap_t pm, vm_offset_t va) { @@ -945,3 +613,4 @@ cpu_sleep() enable_vec(curthread); powerpc_sync(); } + diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 23bd449..c45f941 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -137,8 +137,6 @@ struct ofw_map { extern unsigned char _etext[]; extern unsigned char _end[]; -extern int ofw_real_mode; - /* * Map of physical memory regions. */ @@ -852,8 +850,7 @@ moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend */ chosen = OF_finddevice("/chosen"); - if (!ofw_real_mode && chosen != -1 && - OF_getprop(chosen, "mmu", &mmui, 4) != -1) { + if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) diff --git a/sys/powerpc/booke/machdep.c b/sys/powerpc/booke/booke_machdep.c index 943063c..2a27775 100644 --- a/sys/powerpc/booke/machdep.c +++ b/sys/powerpc/booke/booke_machdep.c @@ -163,6 +163,7 @@ extern unsigned char __bss_start[]; extern unsigned char __sbss_start[]; extern unsigned char __sbss_end[]; extern unsigned char _end[]; +extern vm_offset_t __endkernel; /* * Bootinfo is passed to us by legacy loaders. Save the address of the @@ -170,25 +171,6 @@ extern unsigned char _end[]; */ uint32_t *bootinfo; -struct kva_md_info kmi; -struct pcpu __pcpu[MAXCPU]; -struct trapframe frame0; -int cold = 1; -long realmem = 0; -long Maxmem = 0; -char machine[] = "powerpc"; -SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); - -int cacheline_size = 32; - -SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, - CTLFLAG_RD, &cacheline_size, 0, ""); - -int hw_direct_map = 0; - -static void cpu_booke_startup(void *); -SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_booke_startup, NULL); - void print_kernel_section_addr(void); void print_kenv(void); u_int booke_init(uint32_t, uint32_t); @@ -219,6 +201,16 @@ extern void *int_performance_counter; ("Handler " #handler " too far from interrupt vector base")); \ mtspr(ivor, (uintptr_t)(&handler) & 0xffffUL); +uintptr_t powerpc_init(vm_offset_t fdt, vm_offset_t, vm_offset_t, void *mdp); +void booke_cpu_init(void); + +void +booke_cpu_init(void) +{ + + pmap_mmu_install(MMU_TYPE_BOOKE, BUS_PROBE_GENERIC); +} + void ivor_setup(void) { @@ -244,90 +236,6 @@ ivor_setup(void) #endif } -static void -cpu_booke_startup(void *dummy) -{ - int indx; - unsigned long size; - - /* Initialise the decrementer-based clock. */ - decr_init(); - - /* Good {morning,afternoon,evening,night}. */ - cpu_setup(PCPU_GET(cpuid)); - - printf("real memory = %lu (%ld MB)\n", ptoa(physmem), - ptoa(physmem) / 1048576); - realmem = physmem; - - /* Display any holes after the first chunk of extended memory. */ - if (bootverbose) { - printf("Physical memory chunk(s):\n"); - for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { - size = phys_avail[indx + 1] - phys_avail[indx]; - - printf("0x%08x - 0x%08x, %lu bytes (%lu pages)\n", - phys_avail[indx], phys_avail[indx + 1] - 1, - size, size / PAGE_SIZE); - } - } - - vm_ksubmap_init(&kmi); - - printf("avail memory = %lu (%ld MB)\n", ptoa(vm_cnt.v_free_count), - ptoa(vm_cnt.v_free_count) / 1048576); - - /* Set up buffers, so they can be used to read disk labels. */ - bufinit(); - vm_pager_bufferinit(); -} - -static char * -kenv_next(char *cp) -{ - - if (cp != NULL) { - while (*cp != 0) - cp++; - cp++; - if (*cp == 0) - cp = NULL; - } - return (cp); -} - -void -print_kenv(void) -{ - int len; - char *cp; - - debugf("loader passed (static) kenv:\n"); - if (kern_envp == NULL) { - debugf(" no env, null ptr\n"); - return; - } - debugf(" kern_envp = 0x%08x\n", (u_int32_t)kern_envp); - - len = 0; - for (cp = kern_envp; cp != NULL; cp = kenv_next(cp)) - debugf(" %x %s\n", (u_int32_t)cp, cp); -} - -void -print_kernel_section_addr(void) -{ - - debugf("kernel image addresses:\n"); - debugf(" kernel_text = 0x%08x\n", (uint32_t)kernel_text); - debugf(" _etext (sdata) = 0x%08x\n", (uint32_t)_etext); - debugf(" _edata = 0x%08x\n", (uint32_t)_edata); - debugf(" __sbss_start = 0x%08x\n", (uint32_t)__sbss_start); - debugf(" __sbss_end = 0x%08x\n", (uint32_t)__sbss_end); - debugf(" __sbss_start = 0x%08x\n", (uint32_t)__bss_start); - debugf(" _end = 0x%08x\n", (uint32_t)_end); -} - static int booke_check_for_fdt(uint32_t arg1, vm_offset_t *dtbp) { @@ -345,24 +253,20 @@ booke_check_for_fdt(uint32_t arg1, vm_offset_t *dtbp) return (0); } -u_int +uintptr_t booke_init(uint32_t arg1, uint32_t arg2) { - struct pcpu *pc; - void *kmdp, *mdp; + uintptr_t ret; + void *mdp; vm_offset_t dtbp, end; -#ifdef DDB - vm_offset_t ksym_start; - vm_offset_t ksym_end; -#endif - - kmdp = NULL; end = (uintptr_t)_end; dtbp = (vm_offset_t)NULL; /* Set up TLB initially */ bootinfo = NULL; + bzero(__sbss_start, __sbss_end - __sbss_start); + bzero(__bss_start, _end - __bss_start); tlb1_init(); /* @@ -391,152 +295,22 @@ booke_init(uint32_t arg1, uint32_t arg2) memmove((void *)end, (void *)dtbp, fdt_totalsize((void *)dtbp)); dtbp = end; end += fdt_totalsize((void *)dtbp); + __endkernel = end; mdp = NULL; } else if (arg1 > (uintptr_t)kernel_text) /* FreeBSD loader */ mdp = (void *)arg1; else /* U-Boot */ mdp = NULL; - /* - * Parse metadata and fetch parameters. - */ - if (mdp != NULL) { - preload_metadata = mdp; - kmdp = preload_search_by_type("elf kernel"); - if (kmdp != NULL) { - boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); - kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); - dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); - end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); - - bootinfo = (uint32_t *)preload_search_info(kmdp, - MODINFO_METADATA | MODINFOMD_BOOTINFO); - -#ifdef DDB - ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); - ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); - db_fetch_ksymtab(ksym_start, ksym_end); -#endif - } - } else { - bzero(__sbss_start, __sbss_end - __sbss_start); - bzero(__bss_start, _end - __bss_start); - } - -#if defined(FDT_DTB_STATIC) - /* - * In case the device tree blob was not retrieved (from metadata) try - * to use the statically embedded one. - */ - if (dtbp == (vm_offset_t)NULL) - dtbp = (vm_offset_t)&fdt_static_dtb; -#endif - - if (OF_install(OFW_FDT, 0) == FALSE) - while (1); - - if (OF_init((void *)dtbp) != 0) - while (1); - - OF_interpret("perform-fixup", 0); - /* Reset TLB1 to get rid of temporary mappings */ tlb1_init(); - /* Reset Time Base */ - mttb(0); - - /* Init params/tunables that can be overridden by the loader. */ - init_param1(); - - /* Start initializing proc0 and thread0. */ - proc_linkup0(&proc0, &thread0); - thread0.td_frame = &frame0; - - /* Set up per-cpu data and store the pointer in SPR general 0. */ - pc = &__pcpu[0]; - pcpu_init(pc, 0, sizeof(struct pcpu)); - pc->pc_curthread = &thread0; -#ifdef __powerpc64__ - __asm __volatile("mr 13,%0" :: "r"(pc->pc_curthread)); -#else - __asm __volatile("mr 2,%0" :: "r"(pc->pc_curthread)); -#endif - __asm __volatile("mtsprg 0, %0" :: "r"(pc)); - - /* Initialize system mutexes. */ - mutex_init(); - - /* Initialize the console before printing anything. */ - cninit(); - - /* Print out some debug info... */ - debugf("%s: console initialized\n", __func__); - debugf(" arg3 mdp = 0x%08x\n", (u_int32_t)mdp); - debugf(" end = 0x%08x\n", (u_int32_t)end); - debugf(" boothowto = 0x%08x\n", boothowto); -#ifdef MPC85XX - debugf(" kernel ccsrbar = 0x%08x\n", CCSRBAR_VA); -#endif - debugf(" MSR = 0x%08x\n", mfmsr()); -#if defined(BOOKE_E500) - debugf(" HID0 = 0x%08x\n", mfspr(SPR_HID0)); - debugf(" HID1 = 0x%08x\n", mfspr(SPR_HID1)); - debugf(" BUCSR = 0x%08x\n", mfspr(SPR_BUCSR)); -#endif - - debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp); - - print_kernel_section_addr(); - print_kenv(); -#if defined(BOOKE_E500) - //tlb1_print_entries(); - //tlb1_print_tlbentries(); -#endif - - kdb_init(); - -#ifdef KDB - if (boothowto & RB_KDB) - kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); -#endif - - /* Initialise platform module */ - platform_probe_and_attach(); - - /* Initialise virtual memory. */ - pmap_mmu_install(MMU_TYPE_BOOKE, 0); - pmap_bootstrap((uintptr_t)kernel_text, end); - debugf("MSR = 0x%08x\n", mfmsr()); -#if defined(BOOKE_E500) - //tlb1_print_entries(); - //tlb1_print_tlbentries(); -#endif - - /* Initialize params/tunables that are derived from memsize. */ - init_param2(physmem); - - /* Finish setting up thread0. */ - thread0.td_pcb = (struct pcb *) - ((thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - - sizeof(struct pcb)) & ~15); - bzero((void *)thread0.td_pcb, sizeof(struct pcb)); - pc->pc_curpcb = thread0.td_pcb; - - /* Initialise the message buffer. */ - msgbufinit(msgbufp, msgbufsize); - - /* Enable Machine Check interrupt. */ - mtmsr(mfmsr() | PSL_ME); - isync(); + ret = powerpc_init(dtbp, 0, 0, mdp); /* Enable L1 caches */ booke_enable_l1_cache(); - debugf("%s: SP = 0x%08x\n", __func__, - ((uintptr_t)thread0.td_pcb - 16) & ~15); - - return (((uintptr_t)thread0.td_pcb - 16) & ~15); + return (ret); } #define RES_GRANULE 32 @@ -560,63 +334,6 @@ cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) #endif } -/* - * Flush the D-cache for non-DMA I/O so that the I-cache can - * be made coherent later. - */ -void -cpu_flush_dcache(void *ptr, size_t len) -{ - register_t addr, off; - - /* - * Align the address to a cacheline and adjust the length - * accordingly. Then round the length to a multiple of the - * cacheline for easy looping. - */ - addr = (uintptr_t)ptr; - off = addr & (cacheline_size - 1); - addr -= off; - len = (len + off + cacheline_size - 1) & ~(cacheline_size - 1); - - while (len > 0) { - __asm __volatile ("dcbf 0,%0" :: "r"(addr)); - __asm __volatile ("sync"); - addr += cacheline_size; - len -= cacheline_size; - } -} - -void -spinlock_enter(void) -{ - struct thread *td; - register_t msr; - - td = curthread; - if (td->td_md.md_spinlock_count == 0) { - msr = intr_disable(); - td->td_md.md_spinlock_count = 1; - td->td_md.md_saved_msr = msr; - } else - td->td_md.md_spinlock_count++; - critical_enter(); -} - -void -spinlock_exit(void) -{ - struct thread *td; - register_t msr; - - td = curthread; - critical_exit(); - msr = td->td_md.md_saved_msr; - td->td_md.md_spinlock_count--; - if (td->td_md.md_spinlock_count == 0) - intr_restore(msr); -} - /* Shutdown the CPU as much as possible. */ void cpu_halt(void) @@ -628,17 +345,6 @@ cpu_halt(void) } int -ptrace_set_pc(struct thread *td, unsigned long addr) -{ - struct trapframe *tf; - - tf = td->td_frame; - tf->srr0 = (register_t)addr; - - return (0); -} - -int ptrace_single_step(struct thread *td) { struct trapframe *tf; @@ -680,40 +386,3 @@ kdb_cpu_set_singlestep(void) kdb_frame->srr1 |= PSL_DE; } -void -bzero(void *buf, size_t len) -{ - caddr_t p; - - p = buf; - - while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { - *p++ = 0; - len--; - } - - while (len >= sizeof(u_long) * 8) { - *(u_long*) p = 0; - *((u_long*) p + 1) = 0; - *((u_long*) p + 2) = 0; - *((u_long*) p + 3) = 0; - len -= sizeof(u_long) * 8; - *((u_long*) p + 4) = 0; - *((u_long*) p + 5) = 0; - *((u_long*) p + 6) = 0; - *((u_long*) p + 7) = 0; - p += sizeof(u_long) * 8; - } - - while (len >= sizeof(u_long)) { - *(u_long*) p = 0; - len -= sizeof(u_long); - p += sizeof(u_long); - } - - while (len) { - *p++ = 0; - len--; - } -} - diff --git a/sys/powerpc/booke/pmap.c b/sys/powerpc/booke/pmap.c index 2c6df63..c5e3bf3 100644 --- a/sys/powerpc/booke/pmap.c +++ b/sys/powerpc/booke/pmap.c @@ -1031,13 +1031,8 @@ mmu_booke_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t kernelend) * Align kernel start and end address (kernel image). * Note that kernel end does not necessarily relate to kernsize. * kernsize is the size of the kernel that is actually mapped. - * Also note that "start - 1" is deliberate. With SMP, the - * entry point is exactly a page from the actual load address. - * As such, trunc_page() has no effect and we're off by a page. - * Since we always have the ELF header between the load address - * and the entry point, we can safely subtract 1 to compensate. */ - kernstart = trunc_page(start - 1); + kernstart = trunc_page(start); data_start = round_page(kernelend); data_end = data_start; diff --git a/sys/powerpc/conf/GENERIC b/sys/powerpc/conf/GENERIC index e3939be..b5bf850 100644 --- a/sys/powerpc/conf/GENERIC +++ b/sys/powerpc/conf/GENERIC @@ -32,6 +32,7 @@ options PSIM #GDB PSIM ppc simulator options MAMBO #IBM Mambo Full System Simulator options PSERIES #PAPR-compliant systems +options FDT options SCHED_ULE #ULE scheduler options PREEMPTION #Enable kernel thread preemption options INET #InterNETworking diff --git a/sys/powerpc/ofw/ofw_machdep.c b/sys/powerpc/ofw/ofw_machdep.c index 05af094..31813fb 100644 --- a/sys/powerpc/ofw/ofw_machdep.c +++ b/sys/powerpc/ofw/ofw_machdep.c @@ -62,11 +62,12 @@ __FBSDID("$FreeBSD$"); #include <machine/ofw_machdep.h> #include <machine/trap.h> +static void *fdt; +int ofw_real_mode; + #ifdef AIM extern register_t ofmsr[5]; extern void *openfirmware_entry; -static void *fdt; -int ofw_real_mode; char save_trap_init[0x2f00]; /* EXC_LAST */ char save_trap_of[0x2f00]; /* EXC_LAST */ @@ -336,10 +337,10 @@ ofw_mem_regions(struct mem_region *memp, int *memsz, *availsz = asz; } -#ifdef AIM void OF_initial_setup(void *fdt_ptr, void *junk, int (*openfirm)(void *)) { +#ifdef AIM ofmsr[0] = mfmsr(); #ifdef __powerpc64__ ofmsr[0] &= ~PSL_SF; @@ -348,22 +349,25 @@ OF_initial_setup(void *fdt_ptr, void *junk, int (*openfirm)(void *)) __asm __volatile("mfsprg1 %0" : "=&r"(ofmsr[2])); __asm __volatile("mfsprg2 %0" : "=&r"(ofmsr[3])); __asm __volatile("mfsprg3 %0" : "=&r"(ofmsr[4])); + openfirmware_entry = openfirm; if (ofmsr[0] & PSL_DR) ofw_real_mode = 0; else ofw_real_mode = 1; + ofw_save_trap_vec(save_trap_init); +#else + ofw_real_mode = 1; +#endif + fdt = fdt_ptr; - openfirmware_entry = openfirm; #ifdef FDT_DTB_STATIC /* Check for a statically included blob */ if (fdt == NULL) fdt = &fdt_static_dtb; #endif - - ofw_save_trap_vec(save_trap_init); } boolean_t @@ -371,6 +375,7 @@ OF_bootstrap() { boolean_t status = FALSE; +#ifdef AIM if (openfirmware_entry != NULL) { if (ofw_real_mode) { status = OF_install(OFW_STD_REAL, 0); @@ -386,18 +391,22 @@ OF_bootstrap() return status; OF_init(openfirmware); - } else if (fdt != NULL) { + } else +#endif + if (fdt != NULL) { status = OF_install(OFW_FDT, 0); if (status != TRUE) return status; OF_init(fdt); + OF_interpret("perform-fixup", 0); } return (status); } +#ifdef AIM void ofw_quiesce(void) { diff --git a/sys/powerpc/powerpc/busdma_machdep.c b/sys/powerpc/powerpc/busdma_machdep.c index bd226c8..9ea51ce 100644 --- a/sys/powerpc/powerpc/busdma_machdep.c +++ b/sys/powerpc/powerpc/busdma_machdep.c @@ -1121,8 +1121,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. */ - bpage->vaddr |= vaddr & PAGE_MASK; - bpage->busaddr |= vaddr & PAGE_MASK; + bpage->vaddr |= addr & PAGE_MASK; + bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; bpage->dataaddr = addr; diff --git a/sys/powerpc/powerpc/machdep.c b/sys/powerpc/powerpc/machdep.c new file mode 100644 index 0000000..2bc9496 --- /dev/null +++ b/sys/powerpc/powerpc/machdep.c @@ -0,0 +1,502 @@ +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (C) 2001 Benno Rice + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" +#include "opt_ddb.h" +#include "opt_kstack_pages.h" +#include "opt_platform.h" + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/bus.h> +#include <sys/cons.h> +#include <sys/cpu.h> +#include <sys/eventhandler.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/ptrace.h> +#include <sys/reboot.h> +#include <sys/rwlock.h> +#include <sys/signalvar.h> +#include <sys/syscallsubr.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/ucontext.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> + +#include <net/netisr.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> + +#include <machine/altivec.h> +#ifndef __powerpc64__ +#include <machine/bat.h> +#endif +#include <machine/cpu.h> +#include <machine/elf.h> +#include <machine/fpu.h> +#include <machine/hid.h> +#include <machine/kdb.h> +#include <machine/md_var.h> +#include <machine/metadata.h> +#include <machine/mmuvar.h> +#include <machine/pcb.h> +#include <machine/reg.h> +#include <machine/sigframe.h> +#include <machine/spr.h> +#include <machine/trap.h> +#include <machine/vmparam.h> +#include <machine/ofw_machdep.h> + +#include <ddb/ddb.h> + +#include <dev/ofw/openfirm.h> + +int cold = 1; +#ifdef __powerpc64__ +int cacheline_size = 128; +#else +int cacheline_size = 32; +#endif +int hw_direct_map = 1; + +extern void *ap_pcpu; + +struct pcpu __pcpu[MAXCPU]; + +static struct trapframe frame0; + +char machine[] = "powerpc"; +SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); + +static void cpu_startup(void *); +SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); + +SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, + CTLFLAG_RD, &cacheline_size, 0, ""); + +uintptr_t powerpc_init(vm_offset_t, vm_offset_t, vm_offset_t, void *); + +long Maxmem = 0; +long realmem = 0; + +struct kva_md_info kmi; + +static void +cpu_startup(void *dummy) +{ + + /* + * Initialise the decrementer-based clock. + */ + decr_init(); + + /* + * Good {morning,afternoon,evening,night}. + */ + cpu_setup(PCPU_GET(cpuid)); + +#ifdef PERFMON + perfmon_init(); +#endif + printf("real memory = %ld (%ld MB)\n", ptoa(physmem), + ptoa(physmem) / 1048576); + realmem = physmem; + + if (bootverbose) + printf("available KVA = %zd (%zd MB)\n", + virtual_end - virtual_avail, + (virtual_end - virtual_avail) / 1048576); + + /* + * Display any holes after the first chunk of extended memory. + */ + if (bootverbose) { + int indx; + + printf("Physical memory chunk(s):\n"); + for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { + vm_offset_t size1 = + phys_avail[indx + 1] - phys_avail[indx]; + + #ifdef __powerpc64__ + printf("0x%016lx - 0x%016lx, %ld bytes (%ld pages)\n", + #else + printf("0x%08x - 0x%08x, %d bytes (%ld pages)\n", + #endif + phys_avail[indx], phys_avail[indx + 1] - 1, size1, + size1 / PAGE_SIZE); + } + } + + vm_ksubmap_init(&kmi); + + printf("avail memory = %ld (%ld MB)\n", ptoa(vm_cnt.v_free_count), + ptoa(vm_cnt.v_free_count) / 1048576); + + /* + * Set up buffers, so they can be used to read disk labels. + */ + bufinit(); + vm_pager_bufferinit(); +} + +extern vm_offset_t __startkernel, __endkernel; +extern unsigned char __bss_start[]; +extern unsigned char __sbss_start[]; +extern unsigned char __sbss_end[]; +extern unsigned char _end[]; + +void aim_cpu_init(vm_offset_t toc); +void booke_cpu_init(void); + +uintptr_t +powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp) +{ + struct pcpu *pc; + vm_offset_t startkernel, endkernel; + void *kmdp; + char *env; +#ifdef DDB + vm_offset_t ksym_start; + vm_offset_t ksym_end; +#endif + + kmdp = NULL; + + /* First guess at start/end kernel positions */ + startkernel = __startkernel; + endkernel = __endkernel; + + /* Check for ePAPR loader, which puts a magic value into r6 */ + if (mdp == (void *)0x65504150) + mdp = NULL; + + /* + * Parse metadata if present and fetch parameters. Must be done + * before console is inited so cninit gets the right value of + * boothowto. + */ + if (mdp != NULL) { + preload_metadata = mdp; + kmdp = preload_search_by_type("elf kernel"); + if (kmdp != NULL) { + boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); + kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); + endkernel = ulmax(endkernel, MD_FETCH(kmdp, + MODINFOMD_KERNEND, vm_offset_t)); +#ifdef DDB + ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); + ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); + db_fetch_ksymtab(ksym_start, ksym_end); +#endif + } + } else { + bzero(__sbss_start, __sbss_end - __sbss_start); + bzero(__bss_start, _end - __bss_start); + } +#ifdef BOOKE + tlb1_init(); +#endif + + /* Store boot environment state */ + OF_initial_setup((void *)fdt, NULL, (int (*)(void *))ofentry); + + /* + * Init params/tunables that can be overridden by the loader + */ + init_param1(); + + /* + * Start initializing proc0 and thread0. + */ + proc_linkup0(&proc0, &thread0); + thread0.td_frame = &frame0; + + /* + * Set up per-cpu data. + */ + pc = __pcpu; + pcpu_init(pc, 0, sizeof(struct pcpu)); + pc->pc_curthread = &thread0; +#ifdef __powerpc64__ + __asm __volatile("mr 13,%0" :: "r"(pc->pc_curthread)); +#else + __asm __volatile("mr 2,%0" :: "r"(pc->pc_curthread)); +#endif + pc->pc_cpuid = 0; + + __asm __volatile("mtsprg 0, %0" :: "r"(pc)); + + /* + * Init mutexes, which we use heavily in PMAP + */ + + mutex_init(); + + /* + * Install the OF client interface + */ + + OF_bootstrap(); + + /* + * Initialize the console before printing anything. + */ + cninit(); + + /* + * Complain if there is no metadata. + */ + if (mdp == NULL || kmdp == NULL) { + printf("powerpc_init: no loader metadata.\n"); + } + + /* + * Init KDB + */ + + kdb_init(); + +#ifdef AIM + aim_cpu_init(toc); +#else /* BOOKE */ + booke_cpu_init(); + + /* Make sure the kernel icache is valid before we go too much further */ + __syncicache((caddr_t)startkernel, endkernel - startkernel); +#endif + + /* + * Choose a platform module so we can get the physical memory map. + */ + + platform_probe_and_attach(); + + /* + * Bring up MMU + */ + pmap_bootstrap(startkernel, endkernel); + mtmsr(PSL_KERNSET & ~PSL_EE); + + /* + * Initialize params/tunables that are derived from memsize + */ + init_param2(physmem); + + /* + * Grab booted kernel's name + */ + env = kern_getenv("kernelname"); + if (env != NULL) { + strlcpy(kernelname, env, sizeof(kernelname)); + freeenv(env); + } + + /* + * Finish setting up thread0. + */ + thread0.td_pcb = (struct pcb *) + ((thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - + sizeof(struct pcb)) & ~15UL); + bzero((void *)thread0.td_pcb, sizeof(struct pcb)); + pc->pc_curpcb = thread0.td_pcb; + + /* Initialise the message buffer. */ + msgbufinit(msgbufp, msgbufsize); + +#ifdef KDB + if (boothowto & RB_KDB) + kdb_enter(KDB_WHY_BOOTFLAGS, + "Boot flags requested debugger"); +#endif + + return (((uintptr_t)thread0.td_pcb - + (sizeof(struct callframe) - 3*sizeof(register_t))) & ~15UL); +} + +void +bzero(void *buf, size_t len) +{ + caddr_t p; + + p = buf; + + while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { + *p++ = 0; + len--; + } + + while (len >= sizeof(u_long) * 8) { + *(u_long*) p = 0; + *((u_long*) p + 1) = 0; + *((u_long*) p + 2) = 0; + *((u_long*) p + 3) = 0; + len -= sizeof(u_long) * 8; + *((u_long*) p + 4) = 0; + *((u_long*) p + 5) = 0; + *((u_long*) p + 6) = 0; + *((u_long*) p + 7) = 0; + p += sizeof(u_long) * 8; + } + + while (len >= sizeof(u_long)) { + *(u_long*) p = 0; + len -= sizeof(u_long); + p += sizeof(u_long); + } + + while (len) { + *p++ = 0; + len--; + } +} + +/* + * Flush the D-cache for non-DMA I/O so that the I-cache can + * be made coherent later. + */ +void +cpu_flush_dcache(void *ptr, size_t len) +{ + register_t addr, off; + + /* + * Align the address to a cacheline and adjust the length + * accordingly. Then round the length to a multiple of the + * cacheline for easy looping. + */ + addr = (uintptr_t)ptr; + off = addr & (cacheline_size - 1); + addr -= off; + len = (len + off + cacheline_size - 1) & ~(cacheline_size - 1); + + while (len > 0) { + __asm __volatile ("dcbf 0,%0" :: "r"(addr)); + __asm __volatile ("sync"); + addr += cacheline_size; + len -= cacheline_size; + } +} + +int +ptrace_set_pc(struct thread *td, unsigned long addr) +{ + struct trapframe *tf; + + tf = td->td_frame; + tf->srr0 = (register_t)addr; + + return (0); +} + +void +spinlock_enter(void) +{ + struct thread *td; + register_t msr; + + td = curthread; + if (td->td_md.md_spinlock_count == 0) { + __asm __volatile("or 2,2,2"); /* Set high thread priority */ + msr = intr_disable(); + td->td_md.md_spinlock_count = 1; + td->td_md.md_saved_msr = msr; + } else + td->td_md.md_spinlock_count++; + critical_enter(); +} + +void +spinlock_exit(void) +{ + struct thread *td; + register_t msr; + + td = curthread; + critical_exit(); + msr = td->td_md.md_saved_msr; + td->td_md.md_spinlock_count--; + if (td->td_md.md_spinlock_count == 0) { + intr_restore(msr); + __asm __volatile("or 6,6,6"); /* Set normal thread priority */ + } +} + diff --git a/sys/powerpc/aim/uma_machdep.c b/sys/powerpc/powerpc/uma_machdep.c index 1c27e3e..1c27e3e 100644 --- a/sys/powerpc/aim/uma_machdep.c +++ b/sys/powerpc/powerpc/uma_machdep.c diff --git a/sys/sparc64/pci/apb.c b/sys/sparc64/pci/apb.c index 9b3fa44..c0d6172 100644 --- a/sys/sparc64/pci/apb.c +++ b/sys/sparc64/pci/apb.c @@ -171,20 +171,14 @@ apb_attach(device_t dev) * Get current bridge configuration. */ sc->sc_bsc.ops_pcib_sc.domain = pci_get_domain(dev); - sc->sc_bsc.ops_pcib_sc.secstat = - pci_read_config(dev, PCIR_SECSTAT_1, 2); - sc->sc_bsc.ops_pcib_sc.command = - pci_read_config(dev, PCIR_COMMAND, 2); - sc->sc_bsc.ops_pcib_sc.pribus = - pci_read_config(dev, PCIR_PRIBUS_1, 1); + sc->sc_bsc.ops_pcib_sc.pribus = pci_get_bus(dev); + pci_write_config(dev, PCIR_PRIBUS_1, sc->sc_bsc.ops_pcib_sc.pribus, 1); sc->sc_bsc.ops_pcib_sc.bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1); sc->sc_bsc.ops_pcib_sc.bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1); sc->sc_bsc.ops_pcib_sc.bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2); - sc->sc_bsc.ops_pcib_sc.seclat = - pci_read_config(dev, PCIR_SECLAT_1, 1); sc->sc_iomap = pci_read_config(dev, APBR_IOMAP, 1); sc->sc_memmap = pci_read_config(dev, APBR_MEMMAP, 1); diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h index 553a899..ab7d59d 100644 --- a/sys/sys/cdefs.h +++ b/sys/sys/cdefs.h @@ -388,6 +388,12 @@ #define __alloc_size(x) #endif +#if __has_attribute(alloc_align) || __GNUC_PREREQ__(4, 9) +#define __alloc_align(x) __attribute__((__alloc_align__(x))) +#else +#define __alloc_align(x) +#endif + /* XXX: should use `#if __STDC_VERSION__ < 199901'. */ #if !__GNUC_PREREQ__(2, 7) && !defined(__INTEL_COMPILER) #define __func__ NULL @@ -763,6 +769,24 @@ #endif /* + * Type Safety Checking + * + * Clang provides additional attributes to enable checking type safety + * properties that cannot be enforced by the C type system. + */ + +#if __has_attribute(argument_with_type_tag) && \ + __has_attribute(type_tag_for_datatype) && !defined(lint) +#define __arg_type_tag(arg_kind, arg_idx, type_tag_idx) \ + __attribute__((__argument_with_type_tag__(arg_kind, arg_idx, type_tag_idx))) +#define __datatype_type_tag(kind, type) \ + __attribute__((__type_tag_for_datatype__(kind, type))) +#else +#define __arg_type_tag(arg_kind, arg_idx, type_tag_idx) +#define __datatype_type_tag(kind, type) +#endif + +/* * Lock annotations. * * Clang provides support for doing basic thread-safety tests at diff --git a/sys/sys/nv.h b/sys/sys/nv.h index 0876e57..5c342dc 100644 --- a/sys/sys/nv.h +++ b/sys/sys/nv.h @@ -75,6 +75,7 @@ nvlist_t *nvlist_create(int flags); void nvlist_destroy(nvlist_t *nvl); int nvlist_error(const nvlist_t *nvl); bool nvlist_empty(const nvlist_t *nvl); +int nvlist_flags(const nvlist_t *nvl); void nvlist_set_error(nvlist_t *nvl, int error); nvlist_t *nvlist_clone(const nvlist_t *nvl); @@ -194,129 +195,6 @@ void nvlist_free_descriptor(nvlist_t *nvl, const char *name); #endif void nvlist_free_binary(nvlist_t *nvl, const char *name); -/* - * Below are the same functions, but which operate on format strings and - * variable argument lists. - * - * Functions that are not inserting a new pair into the nvlist cannot handle - * a failure to allocate the memory to hold the new name. Therefore these - * functions are not provided in the kernel. - */ - -#ifndef _KERNEL -bool nvlist_existsf(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_type(const nvlist_t *nvl, int type, const char *namefmt, ...) __printflike(3, 4); - -bool nvlist_existsf_null(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_bool(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_number(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_string(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_nvlist(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_descriptor(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -bool nvlist_existsf_binary(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); - -bool nvlist_existsv(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_type(const nvlist_t *nvl, int type, const char *namefmt, va_list nameap) __printflike(3, 0); - -bool nvlist_existsv_null(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_bool(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_number(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_string(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_nvlist(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_descriptor(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -bool nvlist_existsv_binary(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -#endif - -void nvlist_addf_null(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_addf_bool(nvlist_t *nvl, bool value, const char *namefmt, ...) __printflike(3, 4); -void nvlist_addf_number(nvlist_t *nvl, uint64_t value, const char *namefmt, ...) __printflike(3, 4); -void nvlist_addf_string(nvlist_t *nvl, const char *value, const char *namefmt, ...) __printflike(3, 4); -void nvlist_addf_nvlist(nvlist_t *nvl, const nvlist_t *value, const char *namefmt, ...) __printflike(3, 4); -#ifndef _KERNEL -void nvlist_addf_descriptor(nvlist_t *nvl, int value, const char *namefmt, ...) __printflike(3, 4); -#endif -void nvlist_addf_binary(nvlist_t *nvl, const void *value, size_t size, const char *namefmt, ...) __printflike(4, 5); - -#if !defined(_KERNEL) || defined(_VA_LIST_DECLARED) -void nvlist_addv_null(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_addv_bool(nvlist_t *nvl, bool value, const char *namefmt, va_list nameap) __printflike(3, 0); -void nvlist_addv_number(nvlist_t *nvl, uint64_t value, const char *namefmt, va_list nameap) __printflike(3, 0); -void nvlist_addv_string(nvlist_t *nvl, const char *value, const char *namefmt, va_list nameap) __printflike(3, 0); -void nvlist_addv_nvlist(nvlist_t *nvl, const nvlist_t *value, const char *namefmt, va_list nameap) __printflike(3, 0); -#ifndef _KERNEL -void nvlist_addv_descriptor(nvlist_t *nvl, int value, const char *namefmt, va_list nameap) __printflike(3, 0); -#endif -void nvlist_addv_binary(nvlist_t *nvl, const void *value, size_t size, const char *namefmt, va_list nameap) __printflike(4, 0); -#endif - -void nvlist_movef_string(nvlist_t *nvl, char *value, const char *namefmt, ...) __printflike(3, 4); -void nvlist_movef_nvlist(nvlist_t *nvl, nvlist_t *value, const char *namefmt, ...) __printflike(3, 4); -#ifndef _KERNEL -void nvlist_movef_descriptor(nvlist_t *nvl, int value, const char *namefmt, ...) __printflike(3, 4); -#endif -void nvlist_movef_binary(nvlist_t *nvl, void *value, size_t size, const char *namefmt, ...) __printflike(4, 5); - -#if !defined(_KERNEL) || defined(_VA_LIST_DECLARED) -void nvlist_movev_string(nvlist_t *nvl, char *value, const char *namefmt, va_list nameap) __printflike(3, 0); -void nvlist_movev_nvlist(nvlist_t *nvl, nvlist_t *value, const char *namefmt, va_list nameap) __printflike(3, 0); -#ifndef _KERNEL -void nvlist_movev_descriptor(nvlist_t *nvl, int value, const char *namefmt, va_list nameap) __printflike(3, 0); -#endif -void nvlist_movev_binary(nvlist_t *nvl, void *value, size_t size, const char *namefmt, va_list nameap) __printflike(4, 0); -#endif - -#ifndef _KERNEL -bool nvlist_getf_bool(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -uint64_t nvlist_getf_number(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -const char *nvlist_getf_string(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -const nvlist_t *nvlist_getf_nvlist(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -int nvlist_getf_descriptor(const nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -const void *nvlist_getf_binary(const nvlist_t *nvl, size_t *sizep, const char *namefmt, ...) __printflike(3, 4); - -bool nvlist_getv_bool(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -uint64_t nvlist_getv_number(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -const char *nvlist_getv_string(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -const nvlist_t *nvlist_getv_nvlist(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -int nvlist_getv_descriptor(const nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -const void *nvlist_getv_binary(const nvlist_t *nvl, size_t *sizep, const char *namefmt, va_list nameap) __printflike(3, 0); - -bool nvlist_takef_bool(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -uint64_t nvlist_takef_number(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -char *nvlist_takef_string(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -nvlist_t *nvlist_takef_nvlist(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -int nvlist_takef_descriptor(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void *nvlist_takef_binary(nvlist_t *nvl, size_t *sizep, const char *namefmt, ...) __printflike(3, 4); - -bool nvlist_takev_bool(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -uint64_t nvlist_takev_number(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -char *nvlist_takev_string(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -nvlist_t *nvlist_takev_nvlist(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -int nvlist_takev_descriptor(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void *nvlist_takev_binary(nvlist_t *nvl, size_t *sizep, const char *namefmt, va_list nameap) __printflike(3, 0); - -void nvlist_freef(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_type(nvlist_t *nvl, int type, const char *namefmt, ...) __printflike(3, 4); - -void nvlist_freef_null(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_bool(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_number(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_string(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_nvlist(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_descriptor(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); -void nvlist_freef_binary(nvlist_t *nvl, const char *namefmt, ...) __printflike(2, 3); - -void nvlist_freev(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_type(nvlist_t *nvl, int type, const char *namefmt, va_list nameap) __printflike(3, 0); - -void nvlist_freev_null(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_bool(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_number(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_string(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_nvlist(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_descriptor(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -void nvlist_freev_binary(nvlist_t *nvl, const char *namefmt, va_list nameap) __printflike(2, 0); -#endif /* _KERNEL */ - __END_DECLS #endif /* !_NV_H_ */ diff --git a/sys/sys/nv_impl.h b/sys/sys/nv_impl.h index da90e79..c088c3d 100644 --- a/sys/sys/nv_impl.h +++ b/sys/sys/nv_impl.h @@ -41,23 +41,24 @@ typedef struct nvpair nvpair_t; #define NV_TYPE_NVLIST_UP 255 -#define NV_TYPE_FIRST NV_TYPE_NULL -#define NV_TYPE_LAST NV_TYPE_BINARY +#define NV_TYPE_FIRST NV_TYPE_NULL +#define NV_TYPE_LAST NV_TYPE_BINARY #define NV_FLAG_BIG_ENDIAN 0x80 #ifdef _KERNEL -#define nv_malloc(size) malloc((size), M_NVLIST, M_NOWAIT) +#define nv_malloc(size) malloc((size), M_NVLIST, M_WAITOK) #define nv_calloc(n, size) malloc((n) * (size), M_NVLIST, \ - M_NOWAIT | M_ZERO) + M_WAITOK | M_ZERO) #define nv_realloc(buf, size) realloc((buf), (size), M_NVLIST, \ - M_NOWAIT) + M_WAITOK) #define nv_free(buf) free((buf), M_NVLIST) #define nv_strdup(buf) strdup((buf), M_NVLIST) #define nv_vasprintf(ptr, ...) vasprintf(ptr, M_NVLIST, __VA_ARGS__) -#define SAVE_ERRNO(var) ((void)(var)) -#define RESTORE_ERRNO(var) ((void)(var)) +#define ERRNO_SET(var) do { } while (0) +#define ERRNO_SAVE() do { do { } while(0) +#define ERRNO_RESTORE() } while (0) #define ERRNO_OR_DEFAULT(default) (default) @@ -70,8 +71,14 @@ typedef struct nvpair nvpair_t; #define nv_strdup(buf) strdup((buf)) #define nv_vasprintf(ptr, ...) vasprintf(ptr, __VA_ARGS__) -#define SAVE_ERRNO(var) (var) = errno -#define RESTORE_ERRNO(var) errno = (var) +#define ERRNO_SET(var) do { errno = (var); } while (0) +#define ERRNO_SAVE() do { \ + int _serrno; \ + \ + _serrno = errno + +#define ERRNO_RESTORE() errno = _serrno; \ + } while (0) #define ERRNO_OR_DEFAULT(default) (errno == 0 ? (default) : errno) @@ -128,30 +135,4 @@ const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep); void nvpair_free(nvpair_t *nvp); -nvpair_t *nvpair_createf_null(const char *namefmt, ...) __printflike(1, 2); -nvpair_t *nvpair_createf_bool(bool value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_createf_number(uint64_t value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_createf_string(const char *value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_createf_nvlist(const nvlist_t *value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_createf_descriptor(int value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_createf_binary(const void *value, size_t size, const char *namefmt, ...) __printflike(3, 4); - -nvpair_t *nvpair_createv_null(const char *namefmt, va_list nameap) __printflike(1, 0); -nvpair_t *nvpair_createv_bool(bool value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_createv_number(uint64_t value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_createv_string(const char *value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_createv_nvlist(const nvlist_t *value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_createv_descriptor(int value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_createv_binary(const void *value, size_t size, const char *namefmt, va_list nameap) __printflike(3, 0); - -nvpair_t *nvpair_movef_string(char *value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_movef_nvlist(nvlist_t *value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_movef_descriptor(int value, const char *namefmt, ...) __printflike(2, 3); -nvpair_t *nvpair_movef_binary(void *value, size_t size, const char *namefmt, ...) __printflike(3, 4); - -nvpair_t *nvpair_movev_string(char *value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_movev_nvlist(nvlist_t *value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_movev_descriptor(int value, const char *namefmt, va_list nameap) __printflike(2, 0); -nvpair_t *nvpair_movev_binary(void *value, size_t size, const char *namefmt, va_list nameap) __printflike(3, 0); - #endif /* !_NV_IMPL_H_ */ diff --git a/sys/sys/nvlist_impl.h b/sys/sys/nvlist_impl.h index a59a90e..8aeac67 100644 --- a/sys/sys/nvlist_impl.h +++ b/sys/sys/nvlist_impl.h @@ -38,10 +38,6 @@ #include "nv.h" -void *nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep); -nvlist_t *nvlist_xunpack(const void *buf, size_t size, const int *fds, - size_t nfds); - nvpair_t *nvlist_get_nvpair_parent(const nvlist_t *nvl); const unsigned char *nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds, bool *isbep, size_t *leftp); diff --git a/sys/sys/param.h b/sys/sys/param.h index 33eaa71..867dbf1 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100070 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100071 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, @@ -233,10 +233,19 @@ * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * + * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must + * be >= MAXBSIZE and can be set differently for different + * architectures by defining it in <machine/param.h>. + * Making this larger allows NFS to do larger reads/writes. + * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). + * The default value here can be overridden on a per-architecture + * basis by defining it in <machine/param.h>. This should + * probably be done to increase its value, when MAXBCACHEBUF is + * defined as a larger value in <machine/param.h>. * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you @@ -248,7 +257,12 @@ * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ +#ifndef MAXBCACHEBUF +#define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ +#endif +#ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ +#endif #define BKVAMASK (BKVASIZE-1) /* diff --git a/sys/sys/procctl.h b/sys/sys/procctl.h index c57eee6..75dbf53 100644 --- a/sys/sys/procctl.h +++ b/sys/sys/procctl.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2013 Advanced Computing Technologies LLC + * Copyright (c) 2013 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/sys/racct.h b/sys/sys/racct.h index 966d27a..313d5d0 100644 --- a/sys/sys/racct.h +++ b/sys/sys/racct.h @@ -83,6 +83,10 @@ struct ucred; #define RACCT_DECAYING 0x20 extern int racct_types[]; +extern int racct_enable; + +#define ASSERT_RACCT_ENABLED() KASSERT(racct_enable, \ + ("%s called with !racct_enable", __func__)) /* * Amount stored in c_resources[] is 10**6 times bigger than what's diff --git a/sys/sys/seq.h b/sys/sys/seq.h index eaab86e..21067cb 100644 --- a/sys/sys/seq.h +++ b/sys/sys/seq.h @@ -79,7 +79,7 @@ typedef uint32_t seq_t; * on amd64 but still has unnecessary cost. */ static __inline int -atomic_load_rmb_int(volatile u_int *p) +atomic_load_rmb_int(volatile const u_int *p) { volatile u_int v; @@ -89,7 +89,7 @@ atomic_load_rmb_int(volatile u_int *p) } static __inline int -atomic_rmb_load_int(volatile u_int *p) +atomic_rmb_load_int(volatile const u_int *p) { volatile u_int v = 0; @@ -122,7 +122,7 @@ seq_write_end(seq_t *seqp) } static __inline seq_t -seq_read(seq_t *seqp) +seq_read(const seq_t *seqp) { seq_t ret; @@ -139,7 +139,7 @@ seq_read(seq_t *seqp) } static __inline seq_t -seq_consistent(seq_t *seqp, seq_t oldseq) +seq_consistent(const seq_t *seqp, seq_t oldseq) { return (atomic_rmb_load_int(seqp) == oldseq); diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 8a358ce..786414f 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -187,6 +187,7 @@ void *phashinit(int count, struct malloc_type *type, u_long *nentries); void g_waitidle(void); void panic(const char *, ...) __dead2 __printflike(1, 2); +void vpanic(const char *, __va_list) __dead2 __printflike(1, 0); void cpu_boot(int); void cpu_flush_dcache(void *, size_t); diff --git a/sys/net/zlib.h b/sys/sys/zlib.h index 16edae1..16edae1 100644 --- a/sys/net/zlib.h +++ b/sys/sys/zlib.h diff --git a/sys/net/zutil.h b/sys/sys/zutil.h index 74f0221..1431b6f 100644 --- a/sys/net/zutil.h +++ b/sys/sys/zutil.h @@ -17,7 +17,7 @@ #define ZEXPORT #ifdef _KERNEL -#include <net/zlib.h> +#include <sys/zlib.h> #else #include "zlib.h" #endif diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index c918cd7..c384064 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -112,8 +112,7 @@ static void ffs_blkfree_trim_task(void *ctx, int pending __unused); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif -static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int, - int); +static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, int, int); @@ -460,10 +459,16 @@ nospace: SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); static int doasyncfree = 1; -SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); +SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, +"do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; -SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); +SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, +"enable block reallocation"); + +static int maxclustersearch = 10; +SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, +0, "max number of cylinder group to search for contigous blocks"); #ifdef DEBUG static volatile int prtrealloc = 0; @@ -510,7 +515,7 @@ ffs_reallocblks_ufs1(ap) ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; - int i, len, start_lvl, end_lvl, ssize; + int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); @@ -597,18 +602,39 @@ ffs_reallocblks_ufs1(ap) ebap = (ufs1_daddr_t *)ebp->b_data; } /* - * Find the preferred location for the cluster. + * Find the preferred location for the cluster. If we have not + * previously failed at this endeavor, then follow our standard + * preference calculation. If we have failed at it, then pick up + * where we last ended our search. */ UFS_LOCK(ump); - pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); + if (ip->i_nextclustercg == -1) + pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); + else + pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. + * To avoid wasting too much time, we limit the number of cylinder + * groups that we will search. + */ + cg = dtog(fs, pref); + for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { + if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) + break; + cg += 1; + if (cg >= fs->fs_ncg) + cg = 0; + } + /* + * If we have failed in our search, record where we gave up for + * next time. Otherwise, fall back to our usual search citerion. */ - if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, len, ffs_clusteralloc)) == 0) { + if (newblk == 0) { + ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } + ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * @@ -737,7 +763,7 @@ ffs_reallocblks_ufs2(ap) ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; - int i, len, start_lvl, end_lvl, ssize; + int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); @@ -824,18 +850,39 @@ ffs_reallocblks_ufs2(ap) ebap = (ufs2_daddr_t *)ebp->b_data; } /* - * Find the preferred location for the cluster. + * Find the preferred location for the cluster. If we have not + * previously failed at this endeavor, then follow our standard + * preference calculation. If we have failed at it, then pick up + * where we last ended our search. */ UFS_LOCK(ump); - pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); + if (ip->i_nextclustercg == -1) + pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); + else + pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. + * To avoid wasting too much time, we limit the number of cylinder + * groups that we will search. + */ + cg = dtog(fs, pref); + for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { + if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) + break; + cg += 1; + if (cg >= fs->fs_ncg) + cg = 0; + } + /* + * If we have failed in our search, record where we gave up for + * next time. Otherwise, fall back to our usual search citerion. */ - if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, - len, len, ffs_clusteralloc)) == 0) { + if (newblk == 0) { + ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } + ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * @@ -1786,12 +1833,11 @@ gotit: * take the first one that we find following bpref. */ static ufs2_daddr_t -ffs_clusteralloc(ip, cg, bpref, len, unused) +ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int len; - int unused; { struct fs *fs; struct cg *cgp; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 88f952f..2a368bd 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1688,6 +1688,7 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags) ip->i_dev = dev; ip->i_number = ino; ip->i_ea_refs = 0; + ip->i_nextclustercg = -1; #ifdef QUOTA { int i; diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 37081d5..cd7c472 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -87,6 +87,8 @@ struct inode { daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ } i_un; + int i_nextclustercg; /* last cg searched for cluster */ + /* * Data for extended attribute modification. */ diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c index 08e8fc7..e35741e 100644 --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 55e02c4..15e5b24 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -198,11 +198,13 @@ swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) panic("swap_reserve: & PAGE_MASK"); #ifdef RACCT - PROC_LOCK(curproc); - error = racct_add(curproc, RACCT_SWAP, incr); - PROC_UNLOCK(curproc); - if (error != 0) - return (0); + if (racct_enable) { + PROC_LOCK(curproc); + error = racct_add(curproc, RACCT_SWAP, incr); + PROC_UNLOCK(curproc); + if (error != 0) + return (0); + } #endif res = 0; diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index 11ab24f..ad2a405 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -311,7 +311,7 @@ struct uma_zone { * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. */ - struct uma_cache uz_cpu[]; /* Per cpu caches */ + struct uma_cache uz_cpu[1]; /* Per cpu caches */ }; /* diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 13f46e1..0c84d44 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -348,6 +348,10 @@ RetryFault:; vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { + if (fs.vp != NULL) { + vput(fs.vp); + fs.vp = NULL; + } fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index b7e668b..bad9994 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -300,11 +300,11 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) return (vm); } +#ifdef RACCT static void vmspace_container_reset(struct proc *p) { -#ifdef RACCT PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); @@ -312,8 +312,8 @@ vmspace_container_reset(struct proc *p) racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); -#endif } +#endif static inline void vmspace_dofree(struct vmspace *vm) @@ -415,7 +415,10 @@ vmspace_exit(struct thread *td) pmap_activate(td); vmspace_dofree(vm); } - vmspace_container_reset(p); +#ifdef RACCT + if (racct_enable) + vmspace_container_reset(p); +#endif } /* Acquire reference to vmspace owned by another process. */ @@ -3652,14 +3655,16 @@ Retry: return (KERN_NO_SPACE); } #ifdef RACCT - PROC_LOCK(p); - if (is_procstack && - racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (is_procstack && racct_set(p, RACCT_STACK, + ctob(vm->vm_ssize) + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + return (KERN_NO_SPACE); + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - return (KERN_NO_SPACE); } - PROC_UNLOCK(p); #endif /* Round up the grow amount modulo sgrowsiz */ @@ -3685,15 +3690,17 @@ Retry: goto out; } #ifdef RACCT - PROC_LOCK(p); - if (racct_set(p, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (racct_set(p, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + rv = KERN_NO_SPACE; + goto out; + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - rv = KERN_NO_SPACE; - goto out; } - PROC_UNLOCK(p); #endif } /* If we would blow our VMEM resource limit, no go */ @@ -3703,14 +3710,16 @@ Retry: goto out; } #ifdef RACCT - PROC_LOCK(p); - if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + rv = KERN_NO_SPACE; + goto out; + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - rv = KERN_NO_SPACE; - goto out; } - PROC_UNLOCK(p); #endif if (vm_map_lock_upgrade(map)) @@ -3809,7 +3818,7 @@ Retry: out: #ifdef RACCT - if (rv != KERN_SUCCESS) { + if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 02634d6..1dd2479 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1122,16 +1122,18 @@ vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef RACCT - PROC_LOCK(proc); - error = racct_set(proc, RACCT_MEMLOCK, nsize); - PROC_UNLOCK(proc); - if (error != 0) - return (ENOMEM); + if (racct_enable) { + PROC_LOCK(proc); + error = racct_set(proc, RACCT_MEMLOCK, nsize); + PROC_UNLOCK(proc); + if (error != 0) + return (ENOMEM); + } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT - if (error != KERN_SUCCESS) { + if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); @@ -1179,11 +1181,13 @@ sys_mlockall(td, uap) PROC_UNLOCK(td->td_proc); } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); - PROC_UNLOCK(td->td_proc); - if (error != 0) - return (ENOMEM); + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); + PROC_UNLOCK(td->td_proc); + if (error != 0) + return (ENOMEM); + } #endif if (uap->how & MCL_FUTURE) { @@ -1205,7 +1209,7 @@ sys_mlockall(td, uap) error = (error == KERN_SUCCESS ? 0 : EAGAIN); } #ifdef RACCT - if (error != KERN_SUCCESS) { + if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); @@ -1247,7 +1251,7 @@ sys_munlockall(td, uap) error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT - if (error == KERN_SUCCESS) { + if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); @@ -1291,7 +1295,7 @@ sys_munlock(td, uap) error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT - if (error == KERN_SUCCESS) { + if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 64ca130..0aec01f 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -91,12 +91,14 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/kernel.h> #include <sys/limits.h> +#include <sys/linker.h> #include <sys/malloc.h> #include <sys/mman.h> #include <sys/msgbuf.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/rwlock.h> +#include <sys/sbuf.h> #include <sys/sysctl.h> #include <sys/vmmeter.h> #include <sys/vnode.h> @@ -142,6 +144,12 @@ static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); +static TAILQ_HEAD(, vm_page) blacklist_head; +static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); + + static uma_zone_t fakepg_zone; static struct vnode *vm_page_alloc_init(vm_page_t m); @@ -216,35 +224,153 @@ vm_set_page_size(void) } /* - * vm_page_blacklist_lookup: + * vm_page_blacklist_next: * - * See if a physical address in this page has been listed - * in the blacklist tunable. Entries in the tunable are - * separated by spaces or commas. If an invalid integer is - * encountered then the rest of the string is skipped. + * Find the next entry in the provided string of blacklist + * addresses. Entries are separated by space, comma, or newline. + * If an invalid integer is encountered then the rest of the + * string is skipped. Updates the list pointer to the next + * character, or NULL if the string is exhausted or invalid. */ -static int -vm_page_blacklist_lookup(char *list, vm_paddr_t pa) +static vm_paddr_t +vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; - for (pos = list; *pos != '\0'; pos = cp) { + if (list == NULL || *list == NULL) + return (0); + if (**list =='\0') { + *list = NULL; + return (0); + } + + /* + * If there's no end pointer then the buffer is coming from + * the kenv and we know it's null-terminated. + */ + if (end == NULL) + end = *list + strlen(*list); + + /* Ensure that strtoq() won't walk off the end */ + if (*end != '\0') { + if (*end == '\n' || *end == ' ' || *end == ',') + *end = '\0'; + else { + printf("Blacklist not terminated, skipping\n"); + *list = NULL; + return (0); + } + } + + for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); - if (*cp != '\0') { - if (*cp == ' ' || *cp == ',') { - cp++; - if (cp == pos) + if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { + if (bad == 0) { + if (++cp < end) continue; - } else - break; - } - if (pa == trunc_page(bad)) - return (1); + else + break; + } + } else + break; + if (*cp == '\0' || ++cp >= end) + *list = NULL; + else + *list = cp; + return (trunc_page(bad)); } + printf("Garbage in RAM blacklist, skipping\n"); + *list = NULL; return (0); } +/* + * vm_page_blacklist_check: + * + * Iterate through the provided string of blacklist addresses, pulling + * each entry out of the physical allocator free list and putting it + * onto a list for reporting via the vm.page_blacklist sysctl. + */ +static void +vm_page_blacklist_check(char *list, char *end) +{ + vm_paddr_t pa; + vm_page_t m; + char *next; + int ret; + + next = list; + while (next != NULL) { + if ((pa = vm_page_blacklist_next(&next, end)) == 0) + continue; + m = vm_phys_paddr_to_vm_page(pa); + if (m == NULL) + continue; + mtx_lock(&vm_page_queue_free_mtx); + ret = vm_phys_unfree_page(m); + mtx_unlock(&vm_page_queue_free_mtx); + if (ret == TRUE) { + TAILQ_INSERT_TAIL(&blacklist_head, m, listq); + if (bootverbose) + printf("Skipping page with pa 0x%jx\n", + (uintmax_t)pa); + } + } +} + +/* + * vm_page_blacklist_load: + * + * Search for a special module named "ram_blacklist". It'll be a + * plain text file provided by the user via the loader directive + * of the same name. + */ +static void +vm_page_blacklist_load(char **list, char **end) +{ + void *mod; + u_char *ptr; + u_int len; + + mod = NULL; + ptr = NULL; + + mod = preload_search_by_type("ram_blacklist"); + if (mod != NULL) { + ptr = preload_fetch_addr(mod); + len = preload_fetch_size(mod); + } + *list = ptr; + if (ptr != NULL) + *end = ptr + len; + else + *end = NULL; + return; +} + +static int +sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) +{ + vm_page_t m; + struct sbuf sbuf; + int error, first; + + first = 1; + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 128, req); + TAILQ_FOREACH(m, &blacklist_head, listq) { + sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", + (uintmax_t)m->phys_addr); + first = 0; + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + return (error); +} + static void vm_page_domain_init(struct vm_domain *vmd) { @@ -290,7 +416,7 @@ vm_page_startup(vm_offset_t vaddr) int i; vm_paddr_t pa; vm_paddr_t last_pa; - char *list; + char *list, *listend; vm_paddr_t end; vm_paddr_t biggestsize; vm_paddr_t low_water, high_water; @@ -305,14 +431,6 @@ vm_page_startup(vm_offset_t vaddr) phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } -#ifdef XEN - /* - * There is no obvious reason why i386 PV Xen needs vm_page structs - * created for these pseudo-physical addresses. XXX - */ - vm_phys_add_seg(0, phys_avail[0]); -#endif - low_water = phys_avail[0]; high_water = phys_avail[1]; @@ -477,20 +595,22 @@ vm_page_startup(vm_offset_t vaddr) */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; - list = kern_getenv("vm.blacklist"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { - if (list != NULL && - vm_page_blacklist_lookup(list, pa)) - printf("Skipping page with pa 0x%jx\n", - (uintmax_t)pa); - else - vm_phys_add_page(pa); + vm_phys_add_page(pa); pa += PAGE_SIZE; } } + + TAILQ_INIT(&blacklist_head); + vm_page_blacklist_load(&list, &listend); + vm_page_blacklist_check(list, listend); + + list = kern_getenv("vm.blacklist"); + vm_page_blacklist_check(list, NULL); + freeenv(list); #if VM_NRESERVLEVEL > 0 /* diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 6f50053..872ffd8 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1787,11 +1787,13 @@ vm_daemon(void) while (TRUE) { mtx_lock(&vm_daemon_mtx); + msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT - msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz); + racct_enable ? hz : 0 #else - msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0); + 0 #endif + ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); @@ -1866,33 +1868,40 @@ again: &vm->vm_map, limit); } #ifdef RACCT - rsize = IDX_TO_OFF(size); - PROC_LOCK(p); - racct_set(p, RACCT_RSS, rsize); - ravailable = racct_get_available(p, RACCT_RSS); - PROC_UNLOCK(p); - if (rsize > ravailable) { - /* - * Don't be overly aggressive; this might be - * an innocent process, and the limit could've - * been exceeded by some memory hog. Don't - * try to deactivate more than 1/4th of process' - * resident set size. - */ - if (attempts <= 8) { - if (ravailable < rsize - (rsize / 4)) - ravailable = rsize - (rsize / 4); - } - vm_pageout_map_deactivate_pages( - &vm->vm_map, OFF_TO_IDX(ravailable)); - /* Update RSS usage after paging out. */ - size = vmspace_resident_count(vm); + if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); + ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); - if (rsize > ravailable) - tryagain = 1; + if (rsize > ravailable) { + /* + * Don't be overly aggressive; this + * might be an innocent process, + * and the limit could've been exceeded + * by some memory hog. Don't try + * to deactivate more than 1/4th + * of process' resident set size. + */ + if (attempts <= 8) { + if (ravailable < rsize - + (rsize / 4)) { + ravailable = rsize - + (rsize / 4); + } + } + vm_pageout_map_deactivate_pages( + &vm->vm_map, + OFF_TO_IDX(ravailable)); + /* Update RSS usage after paging out. */ + size = vmspace_resident_count(vm); + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + racct_set(p, RACCT_RSS, rsize); + PROC_UNLOCK(p); + if (rsize > ravailable) + tryagain = 1; + } } #endif vmspace_free(vm); diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c index de9aa78..cc77ff7 100644 --- a/sys/vm/vm_unix.c +++ b/sys/vm/vm_unix.c @@ -130,35 +130,39 @@ sys_obreak(td, uap) goto done; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_set(td->td_proc, RACCT_DATA, new - base); - if (error != 0) { - PROC_UNLOCK(td->td_proc); - error = ENOMEM; - goto done; - } - error = racct_set(td->td_proc, RACCT_VMEM, - map->size + (new - old)); - if (error != 0) { - racct_set_force(td->td_proc, RACCT_DATA, old - base); - PROC_UNLOCK(td->td_proc); - error = ENOMEM; - goto done; - } - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - error = racct_set(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap)) + (new - old)); + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_set(td->td_proc, RACCT_DATA, new - base); + if (error != 0) { + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + error = racct_set(td->td_proc, RACCT_VMEM, + map->size + (new - old)); if (error != 0) { racct_set_force(td->td_proc, RACCT_DATA, old - base); - racct_set_force(td->td_proc, RACCT_VMEM, - map->size); PROC_UNLOCK(td->td_proc); error = ENOMEM; goto done; } + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + error = racct_set(td->td_proc, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap)) + + (new - old)); + if (error != 0) { + racct_set_force(td->td_proc, RACCT_DATA, + old - base); + racct_set_force(td->td_proc, RACCT_VMEM, + map->size); + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif prot = VM_PROT_RW; #ifdef COMPAT_FREEBSD32 @@ -170,14 +174,19 @@ sys_obreak(td, uap) rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set_force(td->td_proc, RACCT_DATA, old - base); - racct_set_force(td->td_proc, RACCT_VMEM, map->size); - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - racct_set_force(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap))); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set_force(td->td_proc, + RACCT_DATA, old - base); + racct_set_force(td->td_proc, + RACCT_VMEM, map->size); + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + racct_set_force(td->td_proc, + RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap))); + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif error = ENOMEM; goto done; @@ -205,14 +214,16 @@ sys_obreak(td, uap) } vm->vm_dsize -= btoc(old - new); #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set_force(td->td_proc, RACCT_DATA, new - base); - racct_set_force(td->td_proc, RACCT_VMEM, map->size); - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - racct_set_force(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap))); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set_force(td->td_proc, RACCT_DATA, new - base); + racct_set_force(td->td_proc, RACCT_VMEM, map->size); + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + racct_set_force(td->td_proc, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap))); + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif } done: diff --git a/sys/x86/acpica/srat.c b/sys/x86/acpica/srat.c index 8335a8c..8d2a741 100644 --- a/sys/x86/acpica/srat.c +++ b/sys/x86/acpica/srat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2010 Advanced Computing Technologies LLC + * Copyright (c) 2010 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index 818a831..0bd9fe5 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -454,6 +454,7 @@ void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); void xen_intr_handle_upcall(struct trapframe *frame); +void hv_vector_handler(struct trapframe *frame); extern int x2apic_mode; extern int lapic_eoi_suppression; diff --git a/sys/x86/include/mca.h b/sys/x86/include/mca.h index 29728b9..6420eb8 100644 --- a/sys/x86/include/mca.h +++ b/sys/x86/include/mca.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/x86/include/segments.h b/sys/x86/include/segments.h index a5c1ea4..65b5870 100644 --- a/sys/x86/include/segments.h +++ b/sys/x86/include/segments.h @@ -46,11 +46,7 @@ */ #define SEL_RPL_MASK 3 /* requester priv level */ #define ISPL(s) ((s)&3) /* priority level of a selector */ -#ifdef XEN -#define SEL_KPL 1 /* kernel priority level */ -#else #define SEL_KPL 0 /* kernel priority level */ -#endif #define SEL_UPL 3 /* user priority level */ #define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ #define SEL_LDT 4 /* local descriptor table */ @@ -244,11 +240,7 @@ union descriptor { #define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ #define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ #define GNDIS_SEL 18 /* For the NDIS layer */ -#ifdef XEN -#define NGDT 9 -#else #define NGDT 19 -#endif /* * Entries in the Local Descriptor Table (LDT) diff --git a/sys/x86/pci/qpi.c b/sys/x86/pci/qpi.c index 2f88891..ae29daa 100644 --- a/sys/x86/pci/qpi.c +++ b/sys/x86/pci/qpi.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2010 Advanced Computing Technologies LLC + * Copyright (c) 2010 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/x86/x86/busdma_bounce.c b/sys/x86/x86/busdma_bounce.c index 1438053..dcdeafa 100644 --- a/sys/x86/x86/busdma_bounce.c +++ b/sys/x86/x86/busdma_bounce.c @@ -147,11 +147,6 @@ static void _bus_dmamap_count_phys(bus_dma_tag_t dmat, bus_dmamap_t map, static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags); -#ifdef XEN -#undef pmap_kextract -#define pmap_kextract pmap_kextract_ma -#endif - /* * Allocate a device specific dma_tag. */ @@ -994,8 +989,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) { /* Page offset needs to be preserved. */ - bpage->vaddr |= vaddr & PAGE_MASK; - bpage->busaddr |= vaddr & PAGE_MASK; + bpage->vaddr |= addr & PAGE_MASK; + bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; bpage->dataaddr = addr; diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c index 846a123..f8d1f08 100644 --- a/sys/x86/x86/cpu_machdep.c +++ b/sys/x86/x86/cpu_machdep.c @@ -100,15 +100,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pager.h> #include <vm/vm_param.h> -#ifdef XEN -/* XEN includes */ -#include <xen/xen-os.h> -#include <xen/hypervisor.h> -#include <machine/xen/xenvar.h> -#include <machine/xen/xenfunc.h> -#include <xen/xen_intr.h> -#endif - /* * Machine dependent boot() routine * @@ -193,33 +184,6 @@ cpu_est_clockrate(int cpu_id, uint64_t *rate) return (0); } -#if defined(__i386__) && defined(XEN) - -static void -idle_block(void) -{ - - HYPERVISOR_sched_op(SCHEDOP_block, 0); -} - -void -cpu_halt(void) -{ - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -} - -int scheduler_running; - -static void -cpu_idle_hlt(sbintime_t sbt) -{ - - scheduler_running = 1; - enable_intr(); - idle_block(); -} - -#else /* * Shutdown the CPU as much as possible */ @@ -230,8 +194,6 @@ cpu_halt(void) halt(); } -#endif - void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ @@ -263,7 +225,6 @@ cpu_idle_acpi(sbintime_t sbt) } #endif /* !PC98 */ -#if !defined(__i386__) || !defined(XEN) static void cpu_idle_hlt(sbintime_t sbt) { @@ -295,7 +256,6 @@ cpu_idle_hlt(sbintime_t sbt) __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } -#endif static void cpu_idle_mwait(sbintime_t sbt) @@ -370,7 +330,7 @@ cpu_probe_amdc1e(void) } } -#if defined(__i386__) && (defined(PC98) || defined(XEN)) +#if defined(__i386__) && defined(PC98) void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; #else void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; @@ -379,17 +339,15 @@ void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { -#if !defined(__i386__) || !defined(XEN) uint64_t msr; -#endif sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); -#if defined(MP_WATCHDOG) && (!defined(__i386__) || !defined(XEN)) +#ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif -#if !defined(__i386__) || !defined(XEN) + /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { @@ -397,7 +355,6 @@ cpu_idle(int busy) goto out; } } -#endif /* If we have time - switch timers into idle mode. */ if (!busy) { @@ -405,14 +362,12 @@ cpu_idle(int busy) sbt = cpu_idleclock(); } -#if !defined(__i386__) || !defined(XEN) /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } -#endif /* Call main idle method. */ cpu_idle_fn(sbt); @@ -422,9 +377,7 @@ cpu_idle(int busy) cpu_activeclock(); critical_exit(); } -#if !defined(__i386__) || !defined(XEN) out: -#endif CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } diff --git a/sys/x86/x86/identcpu.c b/sys/x86/x86/identcpu.c index ca2b5ca..9f39d1c 100644 --- a/sys/x86/x86/identcpu.c +++ b/sys/x86/x86/identcpu.c @@ -1190,7 +1190,6 @@ hook_tsc_freq(void *arg __unused) SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL); -#ifndef XEN static const char *const vm_bnames[] = { "QEMU", /* QEMU */ "Plex86", /* Plex86 */ @@ -1281,7 +1280,6 @@ identify_hypervisor(void) freeenv(p); } } -#endif /* * Final stage of CPU identification. @@ -1314,9 +1312,7 @@ identify_cpu(void) cpu_feature2 = regs[2]; #endif -#ifndef XEN identify_hypervisor(); -#endif cpu_vendor_id = find_cpu_vendor_id(); /* diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c index b27df4a..d81c913 100644 --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -532,13 +532,6 @@ intr_shuffle_irqs(void *arg __unused) struct intsrc *isrc; int i; -#ifdef XEN - /* - * Doesn't work yet - */ - return; -#endif - /* Don't bother on UP. */ if (mp_ncpus == 1) return; diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index e0656ef..d75a452 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1579,17 +1579,13 @@ apic_setup_io(void *dummy __unused) * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ -#ifndef XEN intr_register_pic(&lapic_pic); -#endif retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); -#ifdef XEN - return; -#endif + /* * Finish setting up the local APIC on the BSP once we know * how to properly program the LINT pins. In particular, this diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c index 5c895a8..8026b15 100644 --- a/sys/x86/x86/mca.c +++ b/sys/x86/x86/mca.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2009 Advanced Computing Technologies LLC + * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c new file mode 100644 index 0000000..b4c66a5 --- /dev/null +++ b/sys/x86/x86/mp_x86.c @@ -0,0 +1,1120 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * Copyright (c) 2003, by Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifdef __i386__ +#include "opt_apic.h" +#endif +#include "opt_cpu.h" +#include "opt_kstack_pages.h" +#include "opt_pmap.h" +#include "opt_sched.h" +#include "opt_smp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cons.h> /* cngetc() */ +#include <sys/cpuset.h> +#ifdef GPROF +#include <sys/gmon.h> +#endif +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <x86/apicreg.h> +#include <machine/clock.h> +#include <machine/cputypes.h> +#include <x86/mca.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/cpu.h> + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) + +/* lock region used by kernel profiling */ +int mcount_lock; + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ + +extern struct pcpu __pcpu[]; + +/* AP uses this during bootstrap. Do not staticize. */ +char *bootSTK; +int bootAP; + +/* Free these after use */ +void *bootstacks[MAXCPU]; +void *dpcpu; + +struct pcb stoppcbs[MAXCPU]; +struct susppcb **susppcbs; + +#ifdef COUNT_IPIS +/* Interrupt counts. */ +static u_long *ipi_preempt_counts[MAXCPU]; +static u_long *ipi_ast_counts[MAXCPU]; +u_long *ipi_invltlb_counts[MAXCPU]; +u_long *ipi_invlrng_counts[MAXCPU]; +u_long *ipi_invlpg_counts[MAXCPU]; +u_long *ipi_invlcache_counts[MAXCPU]; +u_long *ipi_rendezvous_counts[MAXCPU]; +static u_long *ipi_hardclock_counts[MAXCPU]; +#endif + +/* Default cpu_ops implementation. */ +struct cpu_ops cpu_ops; + +/* + * Local data and functions. + */ + +static volatile cpuset_t ipi_nmi_pending; + +/* used to hold the AP's until we are ready to release them */ +struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. + */ +struct cpu_info cpu_info[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; +int apic_cpuids[MAX_APIC_ID + 1]; + +/* Holds pending bitmap based IPIs per CPU */ +volatile u_int cpu_ipi_pending[MAXCPU]; + +int cpu_logical; /* logical cpus per core */ +int cpu_cores; /* cores per package */ + +static void release_aps(void *dummy); + +static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ +static int hyperthreading_allowed = 1; + +void +mem_range_AP_init(void) +{ + + if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) + mem_range_softc.mr_op->initAP(&mem_range_softc); +} + +static void +topo_probe_amd(void) +{ + int core_id_bits; + int id; + + /* AMD processors do not support HTT. */ + cpu_logical = 1; + + if ((amd_feature2 & AMDID2_CMP) == 0) { + cpu_cores = 1; + return; + } + + core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> + AMDID_COREID_SIZE_SHIFT; + if (core_id_bits == 0) { + cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; + return; + } + + /* Fam 10h and newer should get here. */ + for (id = 0; id <= MAX_APIC_ID; id++) { + /* Check logical CPU availability. */ + if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) + continue; + /* Check if logical CPU has the same package ID. */ + if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) + continue; + cpu_cores++; + } +} + +/* + * Round up to the next power of two, if necessary, and then + * take log2. + * Returns -1 if argument is zero. + */ +static __inline int +mask_width(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} + +static void +topo_probe_0x4(void) +{ + u_int p[4]; + int pkg_id_bits; + int core_id_bits; + int max_cores; + int max_logical; + int id; + + /* Both zero and one here mean one logical processor per package. */ + max_logical = (cpu_feature & CPUID_HTT) != 0 ? + (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; + if (max_logical <= 1) + return; + + /* + * Because of uniformity assumption we examine only + * those logical processors that belong to the same + * package as BSP. Further, we count number of + * logical processors that belong to the same core + * as BSP thus deducing number of threads per core. + */ + if (cpu_high >= 0x4) { + cpuid_count(0x04, 0, p); + max_cores = ((p[0] >> 26) & 0x3f) + 1; + } else + max_cores = 1; + core_id_bits = mask_width(max_logical/max_cores); + if (core_id_bits < 0) + return; + pkg_id_bits = core_id_bits + mask_width(max_cores); + + for (id = 0; id <= MAX_APIC_ID; id++) { + /* Check logical CPU availability. */ + if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) + continue; + /* Check if logical CPU has the same package ID. */ + if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) + continue; + cpu_cores++; + /* Check if logical CPU has the same package and core IDs. */ + if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) + cpu_logical++; + } + + KASSERT(cpu_cores >= 1 && cpu_logical >= 1, + ("topo_probe_0x4 couldn't find BSP")); + + cpu_cores /= cpu_logical; + hyperthreading_cpus = cpu_logical; +} + +static void +topo_probe_0xb(void) +{ + u_int p[4]; + int bits; + int cnt; + int i; + int logical; + int type; + int x; + + /* We only support three levels for now. */ + for (i = 0; i < 3; i++) { + cpuid_count(0x0b, i, p); + + /* Fall back if CPU leaf 11 doesn't really exist. */ + if (i == 0 && p[1] == 0) { + topo_probe_0x4(); + return; + } + + bits = p[0] & 0x1f; + logical = p[1] &= 0xffff; + type = (p[2] >> 8) & 0xff; + if (type == 0 || logical == 0) + break; + /* + * Because of uniformity assumption we examine only + * those logical processors that belong to the same + * package as BSP. + */ + for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { + if (!cpu_info[x].cpu_present || + cpu_info[x].cpu_disabled) + continue; + if (x >> bits == boot_cpu_id >> bits) + cnt++; + } + if (type == CPUID_TYPE_SMT) + cpu_logical = cnt; + else if (type == CPUID_TYPE_CORE) + cpu_cores = cnt; + } + if (cpu_logical == 0) + cpu_logical = 1; + cpu_cores /= cpu_logical; +} + +/* + * Both topology discovery code and code that consumes topology + * information assume top-down uniformity of the topology. + * That is, all physical packages must be identical and each + * core in a package must have the same number of threads. + * Topology information is queried only on BSP, on which this + * code runs and for which it can query CPUID information. + * Then topology is extrapolated on all packages using the + * uniformity assumption. + */ +void +topo_probe(void) +{ + static int cpu_topo_probed = 0; + + if (cpu_topo_probed) + return; + + CPU_ZERO(&logical_cpus_mask); + if (mp_ncpus <= 1) + cpu_cores = cpu_logical = 1; + else if (cpu_vendor_id == CPU_VENDOR_AMD) + topo_probe_amd(); + else if (cpu_vendor_id == CPU_VENDOR_INTEL) { + /* + * See Intel(R) 64 Architecture Processor + * Topology Enumeration article for details. + * + * Note that 0x1 <= cpu_high < 4 case should be + * compatible with topo_probe_0x4() logic when + * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) + * or it should trigger the fallback otherwise. + */ + if (cpu_high >= 0xb) + topo_probe_0xb(); + else if (cpu_high >= 0x1) + topo_probe_0x4(); + } + + /* + * Fallback: assume each logical CPU is in separate + * physical package. That is, no multi-core, no SMT. + */ + if (cpu_cores == 0 || cpu_logical == 0) + cpu_cores = cpu_logical = 1; + cpu_topo_probed = 1; +} + +struct cpu_group * +cpu_topo(void) +{ + int cg_flags; + + /* + * Determine whether any threading flags are + * necessry. + */ + topo_probe(); + if (cpu_logical > 1 && hyperthreading_cpus) + cg_flags = CG_FLAG_HTT; + else if (cpu_logical > 1) + cg_flags = CG_FLAG_SMT; + else + cg_flags = 0; + if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { + printf("WARNING: Non-uniform processors.\n"); + printf("WARNING: Using suboptimal topology.\n"); + return (smp_topo_none()); + } + /* + * No multi-core or hyper-threaded. + */ + if (cpu_logical * cpu_cores == 1) + return (smp_topo_none()); + /* + * Only HTT no multi-core. + */ + if (cpu_logical > 1 && cpu_cores == 1) + return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); + /* + * Only multi-core no HTT. + */ + if (cpu_cores > 1 && cpu_logical == 1) + return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); + /* + * Both HTT and multi-core. + */ + return (smp_topo_2level(CG_SHARE_L2, cpu_cores, + CG_SHARE_L1, cpu_logical, cg_flags)); +} + + +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id > MAX_APIC_ID) { + panic("SMP: APIC ID %d too high", apic_id); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + if (mp_ncpus < MAXCPU) { + mp_ncpus++; + mp_maxid = mp_ncpus - 1; + } + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); +} + +void +cpu_mp_setmaxid(void) +{ + + /* + * mp_maxid should be already set by calls to cpu_add(). + * Just sanity check its value here. + */ + if (mp_ncpus == 0) + KASSERT(mp_maxid == 0, + ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); + else if (mp_ncpus == 1) + mp_maxid = 0; + else + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, + mp_maxid, mp_ncpus)); +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + CPU_SETOF(0, &all_cpus); + if (mp_ncpus == 0) { + /* + * No CPUs were found, so this must be a UP system. Setup + * the variables to represent a system with a single CPU + * with an id of 0. + */ + mp_ncpus = 1; + return (0); + } + + /* At least one CPU was found. */ + if (mp_ncpus == 1) { + /* + * One CPU was found, so this must be a UP system with + * an I/O APIC. + */ + mp_maxid = 0; + return (0); + } + + /* At least two CPUs were found. */ + return (1); +} + +/* + * Print various information about the SMP system hardware and setup. + */ +void +cpu_mp_announce(void) +{ + const char *hyperthread; + int i; + + printf("FreeBSD/SMP: %d package(s) x %d core(s)", + mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); + if (hyperthreading_cpus > 1) + printf(" x %d HTT threads", cpu_logical); + else if (cpu_logical > 1) + printf(" x %d SMT threads", cpu_logical); + printf("\n"); + + /* List active CPUs first. */ + printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); + for (i = 1; i < mp_ncpus; i++) { + if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) + hyperthread = "/HT"; + else + hyperthread = ""; + printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, + cpu_apic_ids[i]); + } + + /* List disabled CPUs last. */ + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) + continue; + if (cpu_info[i].cpu_hyperthread) + hyperthread = "/HT"; + else + hyperthread = ""; + printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, + i); + } +} + +void +init_secondary_tail(void) +{ + u_int cpuid; + + /* + * On real hardware, switch to x2apic mode if possible. Do it + * after aps_ready was signalled, to avoid manipulating the + * mode while BSP might still want to send some IPI to us + * (second startup IPI is ignored on modern hardware etc). + */ + lapic_xapic_mode(); + + /* Initialize the PAT MSR. */ + pmap_init_pat(); + + /* set up CPU registers and state */ + cpu_setregs(); + + /* set up SSE/NX */ + initializecpu(); + + /* set up FPU state on the AP */ +#ifdef __amd64__ + fpuinit(); +#else + npxinit(false); +#endif + + if (cpu_ops.cpu_init) + cpu_ops.cpu_init(); + + /* A quick check from sanity claus */ + cpuid = PCPU_GET(cpuid); + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", cpuid); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + panic("cpuid mismatch! boom!!"); + } + + /* Initialize curthread. */ + KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); + PCPU_SET(curthread, PCPU_GET(idlethread)); + + mca_init(); + + mtx_lock_spin(&ap_boot_mtx); + + /* Init local apic for irq's */ + lapic_setup(1); + + /* Set memory range attributes for this CPU to match the BSP */ + mem_range_AP_init(); + + smp_cpus++; + + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); + printf("SMP: AP CPU #%d Launched!\n", cpuid); + + /* Determine if we are a logical CPU. */ + /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ + if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) + CPU_SET(cpuid, &logical_cpus_mask); + + if (bootverbose) + lapic_dump("AP"); + + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + } + +#ifdef __amd64__ + /* + * Enable global pages TLB extension + * This also implicitly flushes the TLB + */ + load_cr4(rcr4() | CR4_PGE); + if (pmap_pcid_enabled) + load_cr4(rcr4() | CR4_PCIDE); + load_ds(_udatasel); + load_es(_udatasel); + load_fs(_ufssel); +#endif + + mtx_unlock_spin(&ap_boot_mtx); + + /* Wait until all the AP's are up. */ + while (smp_started == 0) + ia32_pause(); + + /* Start per-CPU event timers. */ + cpu_initclocks_ap(); + + sched_throw(NULL); + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * We tell the I/O APIC code about all the CPUs we want to receive + * interrupts. If we don't want certain CPUs to receive IRQs we + * can simply not tell the I/O APIC code about them in this function. + * We also do not tell it about the BSP since it tells itself about + * the BSP internally to work with UP kernels and on UP machines. + */ +void +set_interrupt_apic_ids(void) +{ + u_int i, apic_id; + + for (i = 0; i < MAXCPU; i++) { + apic_id = cpu_apic_ids[i]; + if (apic_id == -1) + continue; + if (cpu_info[apic_id].cpu_bsp) + continue; + if (cpu_info[apic_id].cpu_disabled) + continue; + + /* Don't let hyperthreads service interrupts. */ + if (cpu_logical > 1 && + apic_id % cpu_logical != 0) + continue; + + intr_add_cpu(i); + } +} + +/* + * Assign logical CPU IDs to local APICs. + */ +void +assign_cpu_ids(void) +{ + u_int i; + + TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", + &hyperthreading_allowed); + + /* Check for explicitly disabled CPUs. */ + for (i = 0; i <= MAX_APIC_ID; i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) + continue; + + if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { + cpu_info[i].cpu_hyperthread = 1; + + /* + * Don't use HT CPU if it has been disabled by a + * tunable. + */ + if (hyperthreading_allowed == 0) { + cpu_info[i].cpu_disabled = 1; + continue; + } + } + + /* Don't use this CPU if it has been disabled by a tunable. */ + if (resource_disabled("lapic", i)) { + cpu_info[i].cpu_disabled = 1; + continue; + } + } + + if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { + hyperthreading_cpus = 0; + cpu_logical = 1; + } + + /* + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 is always assigned to the BSP. + * + * To minimize confusion for userland, we attempt to number + * CPUs such that all threads and cores in a package are + * grouped together. For now we assume that the BSP is always + * the first thread in a package and just start adding APs + * starting with the BSP's APIC ID. + */ + mp_ncpus = 1; + cpu_apic_ids[0] = boot_cpu_id; + apic_cpuids[boot_cpu_id] = 0; + for (i = boot_cpu_id + 1; i != boot_cpu_id; + i == MAX_APIC_ID ? i = 0 : i++) { + if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || + cpu_info[i].cpu_disabled) + continue; + + if (mp_ncpus < MAXCPU) { + cpu_apic_ids[mp_ncpus] = i; + apic_cpuids[i] = mp_ncpus; + mp_ncpus++; + } else + cpu_info[i].cpu_disabled = 1; + } + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); +#endif /* COUNT_XINVLTLB_HITS */ + +/* + * Init and startup IPI. + */ +void +ipi_startup(int apic_id, int vector) +{ + + /* + * This attempts to follow the algorithm described in the + * Intel Multiprocessor Specification v1.4 in section B.4. + * For each IPI, we allow the local APIC ~20us to deliver the + * IPI. If that times out, we panic. + */ + + /* + * first we do an INIT IPI: this INIT IPI might be run, resetting + * and running the target CPU. OR this INIT IPI might be latched (P5 + * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be + * ignored. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); + lapic_ipi_wait(100); + + /* Explicitly deassert the INIT IPI. */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, + apic_id); + + DELAY(10000); /* wait ~10mS */ + + /* + * next we do a STARTUP IPI: the previous INIT IPI might still be + * latched, (P5 bug) this 1st STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. OR + * the previous INIT IPI has already run. and this STARTUP IPI will + * run. OR the previous INIT IPI was ignored. and this STARTUP IPI + * will run. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + if (!lapic_ipi_wait(100)) + panic("Failed to deliver first STARTUP IPI to APIC %d", + apic_id); + DELAY(200); /* wait ~200uS */ + + /* + * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF + * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR + * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + if (!lapic_ipi_wait(100)) + panic("Failed to deliver second STARTUP IPI to APIC %d", + apic_id); + + DELAY(200); /* wait ~200uS */ +} + +/* + * Send an IPI to specified CPU handling the bitmap logic. + */ +void +ipi_send_cpu(int cpu, u_int ipi) +{ + u_int bitmap, old_pending, new_pending; + + KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); + + if (IPI_IS_BITMAPED(ipi)) { + bitmap = 1 << ipi; + ipi = IPI_BITMAP_VECTOR; + do { + old_pending = cpu_ipi_pending[cpu]; + new_pending = old_pending | bitmap; + } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], + old_pending, new_pending)); + if (old_pending) + return; + } + lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); +} + +void +ipi_bitmap_handler(struct trapframe frame) +{ + struct trapframe *oldframe; + struct thread *td; + int cpu = PCPU_GET(cpuid); + u_int ipi_bitmap; + + critical_enter(); + td = curthread; + td->td_intr_nesting_level++; + oldframe = td->td_intr_frame; + td->td_intr_frame = &frame; + ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); + if (ipi_bitmap & (1 << IPI_PREEMPT)) { +#ifdef COUNT_IPIS + (*ipi_preempt_counts[cpu])++; +#endif + sched_preempt(td); + } + if (ipi_bitmap & (1 << IPI_AST)) { +#ifdef COUNT_IPIS + (*ipi_ast_counts[cpu])++; +#endif + /* Nothing to do for AST */ + } + if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { +#ifdef COUNT_IPIS + (*ipi_hardclock_counts[cpu])++; +#endif + hardclockintr(); + } + td->td_intr_frame = oldframe; + td->td_intr_nesting_level--; + critical_exit(); +} + +/* + * send an IPI to a set of cpus. + */ +void +ipi_selected(cpuset_t cpus, u_int ipi) +{ + int cpu; + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); + + while ((cpu = CPU_FFS(&cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &cpus); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); + } +} + +/* + * send an IPI to a specific CPU. + */ +void +ipi_cpu(int cpu, u_int ipi) +{ + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); + + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); + ipi_send_cpu(cpu, ipi); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + cpuset_t other_cpus; + + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + if (IPI_IS_BITMAPED(ipi)) { + ipi_selected(other_cpus, ipi); + return; + } + + /* + * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit + * of help in order to understand what is the source. + * Set the mask of receiving CPUs for this purpose. + */ + if (ipi == IPI_STOP_HARD) + CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); +} + +int +ipi_nmi_handler() +{ + u_int cpuid; + + /* + * As long as there is not a simple way to know about a NMI's + * source, if the bitmask for the current CPU is present in + * the global pending bitword an IPI_STOP_HARD has been issued + * and should be handled. + */ + cpuid = PCPU_GET(cpuid); + if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) + return (1); + + CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); + cpustop_handler(); + return (0); +} + +/* + * Handle an IPI_STOP by saving our current context and spinning until we + * are resumed. + */ +void +cpustop_handler(void) +{ + u_int cpu; + + cpu = PCPU_GET(cpuid); + + savectx(&stoppcbs[cpu]); + + /* Indicate that we are stopped */ + CPU_SET_ATOMIC(cpu, &stopped_cpus); + + /* Wait for restart */ + while (!CPU_ISSET(cpu, &started_cpus)) + ia32_pause(); + + CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_CLR_ATOMIC(cpu, &stopped_cpus); + +#if defined(__amd64__) && defined(DDB) + amd64_db_resume_dbreg(); +#endif + + if (cpu == 0 && cpustop_restartfunc != NULL) { + cpustop_restartfunc(); + cpustop_restartfunc = NULL; + } +} + +/* + * Handle an IPI_SUSPEND by saving our current context and spinning until we + * are resumed. + */ +void +cpususpend_handler(void) +{ + u_int cpu; + + mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); + + cpu = PCPU_GET(cpuid); + if (savectx(&susppcbs[cpu]->sp_pcb)) { +#ifdef __amd64__ + fpususpend(susppcbs[cpu]->sp_fpususpend); +#else + npxsuspend(susppcbs[cpu]->sp_fpususpend); +#endif + wbinvd(); + CPU_SET_ATOMIC(cpu, &suspended_cpus); + } else { +#ifdef __amd64__ + fpuresume(susppcbs[cpu]->sp_fpususpend); +#else + npxresume(susppcbs[cpu]->sp_fpususpend); +#endif + pmap_init_pat(); + initializecpu(); + PCPU_SET(switchtime, 0); + PCPU_SET(switchticks, ticks); + + /* Indicate that we are resumed */ + CPU_CLR_ATOMIC(cpu, &suspended_cpus); + } + + /* Wait for resume */ + while (!CPU_ISSET(cpu, &started_cpus)) + ia32_pause(); + + if (cpu_ops.cpu_resume) + cpu_ops.cpu_resume(); +#ifdef __amd64__ + if (vmm_resume_p) + vmm_resume_p(); +#endif + + /* Resume MCA and local APIC */ + lapic_xapic_mode(); + mca_resume(); + lapic_setup(0); + + /* Indicate that we are resumed */ + CPU_CLR_ATOMIC(cpu, &suspended_cpus); + CPU_CLR_ATOMIC(cpu, &started_cpus); +} + + +void +invlcache_handler(void) +{ +#ifdef COUNT_IPIS + (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + wbinvd(); + atomic_add_int(&smp_tlb_wait, 1); +} + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +static void +release_aps(void *dummy __unused) +{ + + if (mp_ncpus == 1) + return; + atomic_store_rel_int(&aps_ready, 1); + while (smp_started == 0) + ia32_pause(); +} +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); + +#ifdef COUNT_IPIS +/* + * Setup interrupt counters for IPI handlers. + */ +static void +mp_ipi_intrcnt(void *dummy) +{ + char buf[64]; + int i; + + CPU_FOREACH(i) { + snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); + intrcnt_add(buf, &ipi_invltlb_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); + intrcnt_add(buf, &ipi_invlrng_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); + intrcnt_add(buf, &ipi_invlpg_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); + intrcnt_add(buf, &ipi_invlcache_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:preempt", i); + intrcnt_add(buf, &ipi_preempt_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:ast", i); + intrcnt_add(buf, &ipi_ast_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); + intrcnt_add(buf, &ipi_rendezvous_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); + intrcnt_add(buf, &ipi_hardclock_counts[i]); + } +} +SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); +#endif diff --git a/sys/x86/xen/xen_intr.c b/sys/x86/xen/xen_intr.c index 64979b1..3bc4b43 100644 --- a/sys/x86/xen/xen_intr.c +++ b/sys/x86/xen/xen_intr.c @@ -1,7 +1,7 @@ /****************************************************************************** * xen_intr.c * - * Xen event and interrupt services for x86 PV and HVM guests. + * Xen event and interrupt services for x86 HVM guests. * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@intel.com> @@ -864,10 +864,8 @@ xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id) u_int to_cpu, vcpu_id; int error, masked; -#ifdef XENHVM if (xen_vector_callback_enabled == 0) return (EOPNOTSUPP); -#endif to_cpu = apic_cpuid(apic_id); vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id; diff --git a/sys/x86/xen/xen_nexus.c b/sys/x86/xen/xen_nexus.c index f25f970..73506fc 100644 --- a/sys/x86/xen/xen_nexus.c +++ b/sys/x86/xen/xen_nexus.c @@ -66,14 +66,11 @@ static int nexus_xen_attach(device_t dev) { int error; -#ifndef XEN device_t acpi_dev = NULL; -#endif nexus_init_resources(); bus_generic_probe(dev); -#ifndef XEN if (xen_initial_domain()) { /* Disable some ACPI devices that are not usable by Dom0 */ acpi_cpu_disabled = true; @@ -84,13 +81,10 @@ nexus_xen_attach(device_t dev) if (acpi_dev == NULL) panic("Unable to add ACPI bus to Xen Dom0"); } -#endif error = bus_generic_attach(dev); -#ifndef XEN if (xen_initial_domain() && (error == 0)) acpi_install_wakeup_handler(device_get_softc(acpi_dev)); -#endif return (error); } |