--- old/usr/src/lib/libc/i386_hwcap3/Makefile Mon Apr 6 14:23:04 2009 +++ new/usr/src/lib/libc/i386_hwcap3/Makefile Mon Apr 6 14:23:03 2009 @@ -19,14 +19,17 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # lib/libc/i386_hwcap3/Makefile # +# +# This libc is only used when running on pre-3.2 versions of Xen (which +# don't support sysenter or syscall for 32-bit processes). +# + LIBCBASE= ../i386 LIBRARY= libc_hwcap3.a --- old/usr/src/pkgdefs/SUNWhea/prototype_i386 Mon Apr 6 14:23:06 2009 +++ new/usr/src/pkgdefs/SUNWhea/prototype_i386 Mon Apr 6 14:23:05 2009 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This required package information file contains a list of package contents. 
@@ -139,7 +139,9 @@ f none usr/platform/i86pc/include/sys/machsystm.h 644 root bin f none usr/platform/i86pc/include/sys/machthread.h 644 root bin f none usr/platform/i86pc/include/sys/memnode.h 644 root bin +f none usr/platform/i86pc/include/sys/platdep.h 644 root bin f none usr/platform/i86pc/include/sys/pc_mmu.h 644 root bin +f none usr/platform/i86pc/include/sys/pc_platdep.h 644 root bin f none usr/platform/i86pc/include/sys/psm.h 644 root bin f none usr/platform/i86pc/include/sys/psm_defs.h 644 root bin f none usr/platform/i86pc/include/sys/psm_modctl.h 644 root bin @@ -163,6 +165,7 @@ f none usr/platform/i86xpv/include/sys/machprivregs.h 644 root bin f none usr/platform/i86xpv/include/sys/xen_mmu.h 644 root bin f none usr/platform/i86xpv/include/sys/xpv_impl.h 644 root bin +f none usr/platform/i86xpv/include/sys/xpv_platdep.h 644 root bin d none usr/platform/i86xpv/include/vm 755 root bin f none usr/platform/i86xpv/include/vm/seg_mf.h 644 root bin d none usr/share/src/uts/i86pc 755 root bin --- old/usr/src/uts/common/xen/io/xdf.c Mon Apr 6 14:23:08 2009 +++ new/usr/src/uts/common/xen/io/xdf.c Mon Apr 6 14:23:07 2009 @@ -3264,7 +3264,7 @@ xdf_hvm_add(dip); /* Report our version to dom0. */ - if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d", + if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d", HVMPV_XDF_VERS)) cmn_err(CE_WARN, "xdf: couldn't write version\n"); --- old/usr/src/uts/common/xen/io/xnf.c Mon Apr 6 14:23:10 2009 +++ new/usr/src/uts/common/xen/io/xnf.c Mon Apr 6 14:23:08 2009 @@ -690,7 +690,7 @@ /* * Report our version to dom0. */ - if (xenbus_printf(XBT_NULL, "hvmpv/xnf", "version", "%d", + if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", HVMPV_XNF_VERS)) cmn_err(CE_WARN, "xnf: couldn't write version\n"); --- old/usr/src/uts/common/xen/io/xpvd.c Mon Apr 6 14:23:11 2009 +++ new/usr/src/uts/common/xen/io/xpvd.c Mon Apr 6 14:23:10 2009 @@ -275,7 +275,7 @@ /* * Report our version to dom0. 
*/ - if (xenbus_printf(XBT_NULL, "hvmpv/xpvd", "version", "%d", + if (xenbus_printf(XBT_NULL, "guest/xpvd", "version", "%d", HVMPV_XPVD_VERS)) cmn_err(CE_WARN, "xpvd: couldn't write version\n"); #endif /* XPV_HVM_DRIVER */ --- old/usr/src/uts/i86pc/Makefile.files Mon Apr 6 14:23:13 2009 +++ new/usr/src/uts/i86pc/Makefile.files Mon Apr 6 14:23:12 2009 @@ -91,6 +91,7 @@ memscrub.o \ mpcore.o \ notes.o \ + pc_platdep.o \ pci_bios.o \ pci_cfgspace.o \ pci_mech1.o \ --- old/usr/src/uts/i86pc/i86hvm/io/xpv/xpv_support.c Mon Apr 6 14:23:15 2009 +++ new/usr/src/uts/i86pc/i86hvm/io/xpv/xpv_support.c Mon Apr 6 14:23:14 2009 @@ -883,7 +883,7 @@ /* * Report our version to dom0. */ - if (xenbus_printf(XBT_NULL, "hvmpv/xpv", "version", "%d", + if (xenbus_printf(XBT_NULL, "guest/xpv", "version", "%d", HVMPV_XPV_VERS)) cmn_err(CE_WARN, "xpv: couldn't write version\n"); --- old/usr/src/uts/i86pc/ml/syscall_asm_amd64.s Mon Apr 6 14:23:17 2009 +++ new/usr/src/uts/i86pc/ml/syscall_asm_amd64.s Mon Apr 6 14:23:16 2009 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -343,12 +343,10 @@ * * In particular, we have a stack structure like that for interrupt * gates, except that the %cs and %ss registers are modified for reasons - * that are not entirely clear. Critically, the %rcx/%r11 values do - * *not* reflect the usage of those registers under a 'real' syscall[1]; - * the stack, therefore, looks like this: + * that are not entirely clear: * - * 0x0(rsp) potentially junk %rcx - * 0x8(rsp) potentially junk %r11 + * 0x0(rsp) potentially junk %rcx [1] + * 0x8(rsp) potentially junk %r11 [1] * 0x10(rsp) user %rip * 0x18(rsp) modified %cs * 0x20(rsp) user %rflags @@ -355,12 +353,12 @@ * 0x28(rsp) user %rsp * 0x30(rsp) modified %ss * - * - * and before continuing on, we must load the %rip into %rcx and the - * %rflags into %r11. 
- * - * [1] They used to, and we relied on it, but this was broken in 3.1.1. - * Sigh. + * [1] In Xen 3.1.1 (and only that release), the %r11 value was + * accidentally corrupted (that is, not the user %rflags): Linux used + * the "formal" Xen ABI locations of %rip and %rflags on the stack, so + * we were accidentally relying on hardware-like behaviour. This /did/ + * become ABI in 3.1.2, but for compatibility's sake, we still use the + * older ABI here. */ #if defined(__xpv) @@ -647,13 +645,16 @@ ALTENTRY(sys_syscall32) SWAPGS /* kernel gsbase */ - -#if defined(__xpv) XPV_TRAP_POP + nopop_sys_syscall32: -#endif + ASSERT_UPCALL_MASK_IS_SET +#if defined(__xpv) + movl 0x18(%rsp), %r10d +#else movl %esp, %r10d +#endif movq %gs:CPU_THREAD, %r15 movq T_STACK(%r15), %rsp movl %eax, %eax @@ -806,7 +807,11 @@ movl REGOFF_RFL(%rsp), %r11d /* %r11 -> eflags */ movl REGOFF_RIP(%rsp), %ecx /* %ecx -> %eip */ +#if defined(__xpv) + addq $REGOFF_RIP, %rsp +#else movl REGOFF_RSP(%rsp), %esp +#endif ASSERT_UPCALL_MASK_IS_SET SWAPGS /* user gsbase */ @@ -896,6 +901,7 @@ SWAPGS /* kernel gsbase */ ALTENTRY(_sys_sysenter_post_swapgs) + XPV_TRAP_POP movq %gs:CPU_THREAD, %r15 movl $U32CS_SEL, REGOFF_CS(%rsp) --- old/usr/src/uts/i86pc/os/cpr_impl.c Mon Apr 6 14:23:19 2009 +++ new/usr/src/uts/i86pc/os/cpr_impl.c Mon Apr 6 14:23:18 2009 @@ -66,6 +66,7 @@ #include #include #include +#include #define AFMT "%lx" @@ -903,7 +904,6 @@ struct cpu *cp = CPU; char *str = "i_cpr_start_cpu"; - extern void init_cpu_syscall(struct cpu *cp); PMD(PMD_SX, ("%s() called\n", str)) --- old/usr/src/uts/i86pc/os/cpuid.c Mon Apr 6 14:23:21 2009 +++ new/usr/src/uts/i86pc/os/cpuid.c Mon Apr 6 14:23:19 2009 @@ -45,12 +45,8 @@ #include #include #include - -#ifdef __xpv -#include -#else +#include #include -#endif /* * Pass 0 of cpuid feature analysis happens in locore. 
It contains special code @@ -303,77 +299,6 @@ extern uint_t _cpuid_vendorstr_to_vendorcode(char *); /* - * Apply up various platform-dependent restrictions where the - * underlying platform restrictions mean the CPU can be marked - * as less capable than its cpuid instruction would imply. - */ -#if defined(__xpv) -static void -platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) -{ - switch (eax) { - case 1: { - uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ? - 0 : CPUID_INTC_EDX_MCA; - cp->cp_edx &= - ~(mcamask | - CPUID_INTC_EDX_PSE | - CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | - CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | - CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | - CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | - CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); - break; - } - - case 0x80000001: - cp->cp_edx &= - ~(CPUID_AMD_EDX_PSE | - CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | - CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | - CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | - CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | - CPUID_AMD_EDX_TSCP); - cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; - break; - default: - break; - } - - switch (vendor) { - case X86_VENDOR_Intel: - switch (eax) { - case 4: - /* - * Zero out the (ncores-per-chip - 1) field - */ - cp->cp_eax &= 0x03fffffff; - break; - default: - break; - } - break; - case X86_VENDOR_AMD: - switch (eax) { - case 0x80000008: - /* - * Zero out the (ncores-per-chip - 1) field - */ - cp->cp_ecx &= 0xffffff00; - break; - default: - break; - } - break; - default: - break; - } -} -#else -#define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ -#endif - -/* * Some undocumented ways of patching the results of the cpuid * instruction to permit running Solaris 10 on future cpus that * we don't currently support. 
Could be set to non-zero values @@ -418,34 +343,6 @@ kmem_free(cpu->cpu_m.mcpu_cpi, sizeof (*cpu->cpu_m.mcpu_cpi)); } -#if !defined(__xpv) - -static void -check_for_hvm() -{ - struct cpuid_regs cp; - char *xen_str; - uint32_t xen_signature[4]; - extern int xpv_is_hvm; - - /* - * In a fully virtualized domain, Xen's pseudo-cpuid function - * 0x40000000 returns a string representing the Xen signature in - * %ebx, %ecx, and %edx. %eax contains the maximum supported cpuid - * function. - */ - cp.cp_eax = 0x40000000; - (void) __cpuid_insn(&cp); - xen_signature[0] = cp.cp_ebx; - xen_signature[1] = cp.cp_ecx; - xen_signature[2] = cp.cp_edx; - xen_signature[3] = 0; - xen_str = (char *)xen_signature; - if (strcmp("XenVMMXenVMM", xen_str) == 0 && cp.cp_eax <= 0x40000002) - xpv_is_hvm = 1; -} -#endif /* __xpv */ - uint_t cpuid_pass1(cpu_t *cpu) { @@ -454,9 +351,7 @@ struct cpuid_info *cpi; struct cpuid_regs *cp; int xcpuid; -#if !defined(__xpv) extern int idle_cpu_prefer_mwait; -#endif /* * Space statically allocated for cpu0, ensure pointer is set @@ -616,7 +511,6 @@ if (cpi->cpi_maxeax < 5) mask_ecx &= ~CPUID_INTC_ECX_MON; -#if !defined(__xpv) /* * Do not use MONITOR/MWAIT to halt in the idle loop on any AMD * processors. AMD does not intend MWAIT to be used in the cpu @@ -625,7 +519,6 @@ * Pre-family-10h Opterons do not have the MWAIT instruction. 
*/ idle_cpu_prefer_mwait = 0; -#endif break; case X86_VENDOR_TM: @@ -696,14 +589,7 @@ break; } -#if defined(__xpv) /* - * Do not support MONITOR/MWAIT under a hypervisor - */ - mask_ecx &= ~CPUID_INTC_ECX_MON; -#endif /* __xpv */ - - /* * Now we've figured out the masks that determine * which bits we choose to believe, apply the masks * to the feature words, then map the kernel's view @@ -717,7 +603,7 @@ * immediately after __cpuid_insn here, because we need the * workarounds applied above first) */ - platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); + plat_mask_cpuid(cpi->cpi_vendor, 1, cp); /* * fold in overrides from the "eeprom" mechanism @@ -778,7 +664,7 @@ } if (cp->cp_edx & CPUID_INTC_EDX_DE) feature |= X86_DE; -#if !defined(__xpv) + if (cp->cp_ecx & CPUID_INTC_ECX_MON) { /* @@ -801,7 +687,6 @@ } } } -#endif /* __xpv */ /* * Only need it first time, rest of the cpus would follow suite. @@ -897,7 +782,7 @@ } } - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); + plat_mask_cpuid(cpi->cpi_vendor, 0x80000001, cp); /* * Compute the additions to the kernel's feature word. 
@@ -970,7 +855,7 @@ cp->cp_eax = 4; cp->cp_ecx = 0; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); + plat_mask_cpuid(cpi->cpi_vendor, 4, cp); } /*FALLTHROUGH*/ case X86_VENDOR_AMD: @@ -979,7 +864,7 @@ cp = &cpi->cpi_extd[8]; cp->cp_eax = 0x80000008; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000008, cp); + plat_mask_cpuid(cpi->cpi_vendor, 0x80000008, cp); /* * Virtual and physical address limits from @@ -1187,9 +1072,8 @@ cpi->cpi_model, cpi->cpi_step); pass1_done: -#if !defined(__xpv) - check_for_hvm(); -#endif + discover_virt_type(); + cpi->cpi_pass = 1; return (feature); } @@ -1245,7 +1129,7 @@ cp->cp_ecx = 0; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, n, cp); + plat_mask_cpuid(cpi->cpi_vendor, n, cp); switch (n) { case 2: /* @@ -1411,7 +1295,7 @@ for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { cp->cp_eax = 0x80000000 + n; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000000 + n, cp); + plat_mask_cpuid(cpi->cpi_vendor, 0x80000000 + n, cp); switch (n) { case 2: case 3: @@ -2298,9 +2182,6 @@ * However, Intel decided to -not- implement the 32-bit variant of the * syscall instruction, so we provide a predicate to allow our caller * to test that subtlety here. - * - * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, - * even in the case where the hardware would in fact support it. */ /*ARGSUSED*/ int @@ -2308,7 +2189,6 @@ { ASSERT(cpuid_checkpass((cpu == NULL ? 
CPU : cpu), 1)); -#if !defined(__xpv) if (cpu == NULL) cpu = CPU; @@ -2321,7 +2201,6 @@ (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) return (1); } -#endif return (0); } @@ -3777,8 +3656,6 @@ return (l2i->l2i_ret); } -#if !defined(__xpv) - uint32_t * cpuid_mwait_alloc(cpu_t *cpu) { @@ -3836,6 +3713,8 @@ cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0; } +#if !defined(__xpv) + void patch_tsc_read(int flag) { @@ -3865,6 +3744,28 @@ } } +#if defined(__amd64) +/* + * Patch in versions of bcopy for high performance Intel Nhm processors + * and later... + */ +void +patch_memops(uint_t vendor) +{ + size_t cnt, i; + caddr_t to, from; + + if ((vendor == X86_VENDOR_Intel) && ((x86_feature & X86_SSE4_2) != 0)) { + cnt = &bcopy_patch_end - &bcopy_patch_start; + to = &bcopy_ck_size; + from = &bcopy_patch_start; + for (i = 0; i < cnt; i++) { + *to++ = *from++; + } + } +} +#endif /* __amd64 */ + int cpuid_deep_cstates_supported(void) { @@ -3924,25 +3825,3 @@ } #endif /* !__xpv */ } - -#if defined(__amd64) && !defined(__xpv) -/* - * Patch in versions of bcopy for high performance Intel Nhm processors - * and later... 
- */ -void -patch_memops(uint_t vendor) -{ - size_t cnt, i; - caddr_t to, from; - - if ((vendor == X86_VENDOR_Intel) && ((x86_feature & X86_SSE4_2) != 0)) { - cnt = &bcopy_patch_end - &bcopy_patch_start; - to = &bcopy_ck_size; - from = &bcopy_patch_start; - for (i = 0; i < cnt; i++) { - *to++ = *from++; - } - } -} -#endif /* __amd64 && !__xpv */ --- old/usr/src/uts/i86pc/os/cpupm/cpu_idle.c Mon Apr 6 14:23:23 2009 +++ new/usr/src/uts/i86pc/os/cpupm/cpu_idle.c Mon Apr 6 14:23:22 2009 @@ -38,6 +38,7 @@ #include #include #include +#include extern void cpu_idle_adaptive(void); --- old/usr/src/uts/i86pc/os/intr.c Mon Apr 6 14:23:25 2009 +++ new/usr/src/uts/i86pc/os/intr.c Mon Apr 6 14:23:24 2009 @@ -52,6 +52,7 @@ #include #include #include +#include #include #if defined(__xpv) #include @@ -931,12 +932,10 @@ ttp->ttr_vector = 0xff; #endif /* TRAPTRACE */ -#if !defined(__xpv) /* * Handle any pending TLB flushing */ tlb_service(); -#endif /* * If it's a softint go do it now. --- old/usr/src/uts/i86pc/os/mp_machdep.c Mon Apr 6 14:23:26 2009 +++ new/usr/src/uts/i86pc/os/mp_machdep.c Mon Apr 6 14:23:25 2009 @@ -54,6 +54,7 @@ #include #include #include +#include #define OFFSETOF(s, m) (size_t)(&(((s *)0)->m)) @@ -176,7 +177,6 @@ */ int idle_cpu_use_hlt = 1; -#ifndef __xpv /* * If non-zero, idle cpus will use mwait if available to halt instead of hlt. 
*/ @@ -202,8 +202,6 @@ */ hpet_t hpet; -#endif /* ifndef __xpv */ - /*ARGSUSED*/ int pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw) @@ -558,7 +556,6 @@ } } -#ifndef __xpv /* * Idle the present CPU until awoken via touching its monitored line */ @@ -727,8 +724,6 @@ MWAIT_WAKEUP(cpu_seq[cpu_found]); /* write to monitored line */ } -#endif - void (*cpu_pause_handler)(volatile char *) = NULL; static int @@ -927,7 +922,6 @@ if (idle_cpu_use_hlt) { idle_cpu = cpu_idle_adaptive; CPU->cpu_m.mcpu_idle_cpu = cpu_idle; -#ifndef __xpv if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); /* @@ -954,7 +948,6 @@ if (idle_cpu_no_deep_c) { idle_cpu = non_deep_idle_cpu; } -#endif } mach_smpinit(); @@ -1036,11 +1029,9 @@ */ if (idle_cpu_use_hlt) { disp_enq_thread = cpu_wakeup; -#ifndef __xpv if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) disp_enq_thread = cpu_wakeup_mwait; non_deep_idle_disp_enq_thread = disp_enq_thread; -#endif } psm_get_ipivect = pops->psm_get_ipivect; --- old/usr/src/uts/i86pc/os/mp_startup.c Mon Apr 6 14:23:28 2009 +++ new/usr/src/uts/i86pc/os/mp_startup.c Mon Apr 6 14:23:27 2009 @@ -68,6 +68,7 @@ #include #endif #include +#include struct cpu cpus[1]; /* CPU data */ struct cpu *cpu[NCPU] = {&cpus[0]}; /* pointers to all CPUs */ @@ -96,9 +97,7 @@ static void mp_startup(void); -static void cpu_sep_enable(void); static void cpu_sep_disable(void); -static void cpu_asysc_enable(void); static void cpu_asysc_disable(void); /* @@ -135,88 +134,6 @@ } /* - * Configure syscall support on this CPU. - */ -/*ARGSUSED*/ -void -init_cpu_syscall(struct cpu *cp) -{ - kpreempt_disable(); - -#if defined(__amd64) - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) { - -#if !defined(__lint) - /* - * The syscall instruction imposes a certain ordering on - * segment selectors, so we double-check that ordering - * here. 
- */ - ASSERT(KDS_SEL == KCS_SEL + 8); - ASSERT(UDS_SEL == U32CS_SEL + 8); - ASSERT(UCS_SEL == U32CS_SEL + 16); -#endif - /* - * Turn syscall/sysret extensions on. - */ - cpu_asysc_enable(); - - /* - * Program the magic registers .. - */ - wrmsr(MSR_AMD_STAR, - ((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32); - wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32); - - /* - * This list of flags is masked off the incoming - * %rfl when we enter the kernel. - */ - wrmsr(MSR_AMD_SFMASK, (uint64_t)(uintptr_t)(PS_IE | PS_T)); - } -#endif - - /* - * On 32-bit kernels, we use sysenter/sysexit because it's too - * hard to use syscall/sysret, and it is more portable anyway. - * - * On 64-bit kernels on Nocona machines, the 32-bit syscall - * variant isn't available to 32-bit applications, but sysenter is. - */ - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) { - -#if !defined(__lint) - /* - * The sysenter instruction imposes a certain ordering on - * segment selectors, so we double-check that ordering - * here. See "sysenter" in Intel document 245471-012, "IA-32 - * Intel Architecture Software Developer's Manual Volume 2: - * Instruction Set Reference" - */ - ASSERT(KDS_SEL == KCS_SEL + 8); - - ASSERT32(UCS_SEL == ((KCS_SEL + 16) | 3)); - ASSERT32(UDS_SEL == UCS_SEL + 8); - - ASSERT64(U32CS_SEL == ((KCS_SEL + 16) | 3)); - ASSERT64(UDS_SEL == U32CS_SEL + 8); -#endif - - cpu_sep_enable(); - - /* - * resume() sets this value to the base of the threads stack - * via a context handler. - */ - wrmsr(MSR_INTC_SEP_ESP, 0); - wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter); - } - - kpreempt_enable(); -} - -/* * Multiprocessor initialization. 
* * Allocate and initialize the cpu structure, TRAPTRACE buffer, and the @@ -229,10 +146,8 @@ kthread_id_t tp; caddr_t sp; proc_t *procp; -#if !defined(__xpv) extern int idle_cpu_prefer_mwait; extern void cpu_idle_mwait(); -#endif extern void idle(); extern void cpu_idle(); @@ -243,12 +158,10 @@ ASSERT(cpun < NCPU && cpu[cpun] == NULL); cp = kmem_zalloc(sizeof (*cp), KM_SLEEP); -#if !defined(__xpv) if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; } else -#endif cp->cpu_m.mcpu_idle_cpu = cpu_idle; procp = curthread->t_procp; @@ -500,10 +413,9 @@ disp_cpu_fini(cp); mutex_exit(&cpu_lock); -#if !defined(__xpv) if (cp->cpu_m.mcpu_mwait != NULL) cpuid_mwait_free(cp); -#endif + kmem_free(cp, sizeof (*cp)); } @@ -1728,80 +1640,4 @@ cmi_hdl_rele(hdl); } #endif -} - -/* - * The following two routines are used as context operators on threads belonging - * to processes with a private LDT (see sysi86). Due to the rarity of such - * processes, these routines are currently written for best code readability and - * organization rather than speed. We could avoid checking x86_feature at every - * context switch by installing different context ops, depending on the - * x86_feature flags, at LDT creation time -- one for each combination of fast - * syscall feature flags. 
- */ - -/*ARGSUSED*/ -void -cpu_fast_syscall_disable(void *arg) -{ - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) - cpu_sep_disable(); - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) - cpu_asysc_disable(); -} - -/*ARGSUSED*/ -void -cpu_fast_syscall_enable(void *arg) -{ - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) - cpu_sep_enable(); - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) - cpu_asysc_enable(); -} - -static void -cpu_sep_enable(void) -{ - ASSERT(x86_feature & X86_SEP); - ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); - - wrmsr(MSR_INTC_SEP_CS, (uint64_t)(uintptr_t)KCS_SEL); -} - -static void -cpu_sep_disable(void) -{ - ASSERT(x86_feature & X86_SEP); - ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); - - /* - * Setting the SYSENTER_CS_MSR register to 0 causes software executing - * the sysenter or sysexit instruction to trigger a #gp fault. - */ - wrmsr(MSR_INTC_SEP_CS, 0); -} - -static void -cpu_asysc_enable(void) -{ - ASSERT(x86_feature & X86_ASYSC); - ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); - - wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) | - (uint64_t)(uintptr_t)AMD_EFER_SCE); -} - -static void -cpu_asysc_disable(void) -{ - ASSERT(x86_feature & X86_ASYSC); - ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); - - /* - * Turn off the SCE (syscall enable) bit in the EFER register. Software - * executing syscall or sysret with this bit off will incur a #ud trap. - */ - wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) & - ~((uint64_t)(uintptr_t)AMD_EFER_SCE)); } --- /dev/null Mon Apr 6 14:23:30 2009 +++ new/usr/src/uts/i86pc/os/pc_platdep.c Mon Apr 6 14:23:29 2009 @@ -0,0 +1,272 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * In a fully virtualized domain, Xen's pseudo-cpuid function + * 0x40000000 returns a string representing the Xen signature in + * %ebx, %ecx, and %edx. %eax contains the maximum supported cpuid + * function. + */ +void +discover_virt_type() +{ + struct cpuid_regs cp; + char *xen_str; + uint32_t xen_signature[4]; + extern int xpv_is_hvm; + + cp.cp_eax = 0x40000000; + (void) __cpuid_insn(&cp); + xen_signature[0] = cp.cp_ebx; + xen_signature[1] = cp.cp_ecx; + xen_signature[2] = cp.cp_edx; + xen_signature[3] = 0; + xen_str = (char *)xen_signature; + if (strcmp("XenVMMXenVMM", xen_str) == 0 && cp.cp_eax <= 0x40000002) + xpv_is_hvm = 1; +} + +/* + * Enable interpositioning on the system call path by rewriting the + * sys{call|enter} MSRs and the syscall-related entries in the IDT to use + * the branded entry points. 
+ */ +void +brand_interpositioning_enable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) + idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc; + +#if defined(__amd64) + if (x86_feature & X86_ASYSC) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32); + } +#endif /* __amd64 */ + + if (x86_feature & X86_SEP) + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter); +} + +/* + * Disable interpositioning on the system call path by rewriting the + * sys{call|enter} MSRs and the syscall-related entries in the IDT to use + * the standard entry points, which bypass the interpositioning hooks. + */ +void +brand_interpositioning_disable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) + idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc; + +#if defined(__amd64) + if (x86_feature & X86_ASYSC) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32); + } +#endif /* __amd64 */ + + if (x86_feature & X86_SEP) + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter); +} + +static void +cpu_sep_enable(void) +{ + ASSERT(x86_feature & X86_SEP); + ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); + + wrmsr(MSR_INTC_SEP_CS, (uint64_t)(uintptr_t)KCS_SEL); +} + +static void +cpu_sep_disable(void) +{ + ASSERT(x86_feature & X86_SEP); + ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); + + /* + * Setting the SYSENTER_CS_MSR register to 0 causes software executing + * the sysenter or sysexit instruction to trigger a #gp fault. 
+ */ + wrmsr(MSR_INTC_SEP_CS, 0); +} + +static void +cpu_asysc_enable(void) +{ + ASSERT(x86_feature & X86_ASYSC); + ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); + + wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) | + (uint64_t)(uintptr_t)AMD_EFER_SCE); +} + +static void +cpu_asysc_disable(void) +{ + ASSERT(x86_feature & X86_ASYSC); + ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); + + /* + * Turn off the SCE (syscall enable) bit in the EFER register. Software + * executing syscall or sysret with this bit off will incur a #ud trap. + */ + wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) & + ~((uint64_t)(uintptr_t)AMD_EFER_SCE)); +} + +/* + * The following two routines are used as context operators on threads belonging + * to processes with a private LDT (see sysi86). Due to the rarity of such + * processes, these routines are currently written for best code readability and + * organization rather than speed. We could avoid checking x86_feature at every + * context switch by installing different context ops, depending on the + * x86_feature flags, at LDT creation time -- one for each combination of fast + * syscall feature flags. + */ + +void +cpu_fast_syscall_disable(void) +{ + if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) + cpu_sep_disable(); + if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) + cpu_asysc_disable(); +} + +/*ARGSUSED*/ +void +cpu_fast_syscall_enable(void) +{ + if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) + cpu_sep_enable(); + if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) + cpu_asysc_enable(); +} + +/* + * Configure syscall support on this CPU. + */ +/*ARGSUSED*/ +void +init_cpu_syscall(struct cpu *cp) +{ + kpreempt_disable(); + +#if defined(__amd64) + if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) { + +#if !defined(__lint) + /* + * The syscall instruction imposes a certain ordering on + * segment selectors, so we double-check that ordering + * here. 
+ */ + ASSERT(KDS_SEL == KCS_SEL + 8); + ASSERT(UDS_SEL == U32CS_SEL + 8); + ASSERT(UCS_SEL == U32CS_SEL + 16); +#endif + /* + * Turn syscall/sysret extensions on. + */ + cpu_asysc_enable(); + + /* + * Program the magic registers .. + */ + wrmsr(MSR_AMD_STAR, + ((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32); + wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32); + + /* + * This list of flags is masked off the incoming + * %rfl when we enter the kernel. + */ + wrmsr(MSR_AMD_SFMASK, (uint64_t)(uintptr_t)(PS_IE | PS_T)); + } +#endif + + /* + * On 32-bit kernels, we use sysenter/sysexit because it's too + * hard to use syscall/sysret, and it is more portable anyway. + * + * On 64-bit kernels on Nocona machines, the 32-bit syscall + * variant isn't available to 32-bit applications, but sysenter is. + */ + if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) { + +#if !defined(__lint) + /* + * The sysenter instruction imposes a certain ordering on + * segment selectors, so we double-check that ordering + * here. See "sysenter" in Intel document 245471-012, "IA-32 + * Intel Architecture Software Developer's Manual Volume 2: + * Instruction Set Reference" + */ + ASSERT(KDS_SEL == KCS_SEL + 8); + + ASSERT32(UCS_SEL == ((KCS_SEL + 16) | 3)); + ASSERT32(UDS_SEL == UCS_SEL + 8); + + ASSERT64(U32CS_SEL == ((KCS_SEL + 16) | 3)); + ASSERT64(UDS_SEL == U32CS_SEL + 8); +#endif + + cpu_sep_enable(); + + /* + * resume() sets this value to the base of the thread's stack + * via a context handler. + */ + wrmsr(MSR_INTC_SEP_ESP, 0); + wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter); + } + + kpreempt_enable(); +} + --- old/usr/src/uts/i86pc/sys/Makefile Mon Apr 6 14:23:32 2009 +++ new/usr/src/uts/i86pc/sys/Makefile Mon Apr 6 14:23:31 2009 @@ -20,7 +20,7 @@ # # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # # uts/i86pc/sys/Makefile @@ -55,6 +55,8 @@ machthread.h \ memnode.h \ pc_mmu.h \ + pc_platdep.h \ + platdep.h \ psm.h \ psm_defs.h \ psm_modctl.h \ --- /dev/null Mon Apr 6 14:23:34 2009 +++ new/usr/src/uts/i86pc/sys/pc_platdep.h Mon Apr 6 14:23:32 2009 @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PC_PLATDEP_H +#define _SYS_PC_PLATDEP_H + +/* + * Stuff specific to the i86pc platform. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define plat_mask_cpuid(vendor, eax, cp) /* nothing */ + +extern void discover_virt_type(void); +extern void i86_monitor(volatile uint32_t *, uint32_t, uint32_t); +extern void i86_mwait(uint32_t, uint32_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PC_PLATDEP_H */ --- /dev/null Mon Apr 6 14:23:35 2009 +++ new/usr/src/uts/i86pc/sys/platdep.h Mon Apr 6 14:23:34 2009 @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PLATDEP_H +#define _SYS_PLATDEP_H + +/* + * Stuff specific to i86pc or i86xpv platforms. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct cpu; + +extern void init_cpu_syscall(struct cpu *); + +void cpu_fast_syscall_disable(void); +void cpu_fast_syscall_enable(void); + +void brand_interpositioning_disable(void); +void brand_interpositioning_enable(void); + +#ifdef __xpv +#include +#else +#include +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PLATDEP_H */ --- old/usr/src/uts/i86xpv/Makefile.files Mon Apr 6 14:23:36 2009 +++ new/usr/src/uts/i86xpv/Makefile.files Mon Apr 6 14:23:35 2009 @@ -137,9 +137,10 @@ xenbus_comms.o \ xenbus_probe.o \ xenbus_xs.o \ - xen_machdep.o \ xen_mmu.o \ xpv_panic.o \ + xpv_platdep.o \ + xpv_suspend.o \ xvdi.o # --- old/usr/src/uts/i86xpv/os/mp_xen.c Mon Apr 6 14:23:38 2009 +++ new/usr/src/uts/i86xpv/os/mp_xen.c Mon Apr 6 14:23:37 2009 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -89,8 +89,6 @@ * dropping into HYPERVISOR_block(). 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -307,12 +305,6 @@ vgc->flags |= VGCF_failsafe_disables_events; #if defined(__amd64) - /* - * XXPV should this be moved to init_cpu_syscall? - */ - vgc->syscall_callback_eip = (uintptr_t)sys_syscall; - vgc->flags |= VGCF_syscall_disables_events; - ASSERT(vgc->user_regs.gs == 0); vgc->gs_base_kernel = (uintptr_t)cp; #endif --- old/usr/src/uts/i86xpv/os/xen_machdep.c Mon Apr 6 14:23:40 2009 +++ /dev/null Mon Apr 6 14:23:40 2009 @@ -1,1392 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* derived from netbsd's xen_machdep.c 1.1.2.1 */ - -/* - * - * Copyright (c) 2004 Christian Limpach. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. This section intentionally left blank. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -/* - * Section 3 of the above license was updated in response to bug 6379571. - */ - -#include - -/* XXX 3.3. TODO remove this include */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf -#else -#define SUSPEND_DEBUG(...) -#endif - -int cpr_debug; -cpuset_t cpu_suspend_lost_set; -static int xen_suspend_debug; - -uint_t xen_phys_ncpus; -xen_mc_logical_cpu_t *xen_phys_cpus; -int xen_physinfo_debug = 0; - -/* - * Determine helpful version information. 
- * - * (And leave copies in the data segment so we can look at them later - * with e.g. kmdb.) - */ - -typedef enum xen_version { - XENVER_BOOT_IDX, - XENVER_CURRENT_IDX -} xen_version_t; - -struct xenver { - ulong_t xv_major; - ulong_t xv_minor; - ulong_t xv_revision; - xen_extraversion_t xv_ver; - ulong_t xv_is_xvm; - xen_changeset_info_t xv_chgset; - xen_compile_info_t xv_build; - xen_capabilities_info_t xv_caps; -} xenver[2]; - -#define XENVER_BOOT(m) (xenver[XENVER_BOOT_IDX].m) -#define XENVER_CURRENT(m) (xenver[XENVER_CURRENT_IDX].m) - -/* - * Update the xenver data. We maintain two copies, boot and - * current. If we are setting the boot, then also set current. - */ -static void -xen_set_version(xen_version_t idx) -{ - ulong_t ver; - - bzero(&xenver[idx], sizeof (xenver[idx])); - - ver = HYPERVISOR_xen_version(XENVER_version, 0); - - xenver[idx].xv_major = BITX(ver, 31, 16); - xenver[idx].xv_minor = BITX(ver, 15, 0); - - (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver); - - /* - * The revision is buried in the extraversion information that is - * maintained by the hypervisor. For our purposes we expect that - * the revision number is: - * - the second character in the extraversion information - * - one character long - * - numeric digit - * If it isn't then we can't extract the revision and we leave it - * set to 0. 
- */ - if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1])) - xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0'; - else - cmn_err(CE_WARN, "Cannot extract revision on this hypervisor " - "version: v%s, unexpected version format", - xenver[idx].xv_ver); - - xenver[idx].xv_is_xvm = 0; - - if (strlen(xenver[idx].xv_ver) >= 4 && - strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4, - "-xvm", 4) == 0) - xenver[idx].xv_is_xvm = 1; - - (void) HYPERVISOR_xen_version(XENVER_changeset, - &xenver[idx].xv_chgset); - - (void) HYPERVISOR_xen_version(XENVER_compile_info, - &xenver[idx].xv_build); - /* - * Capabilities are a set of space separated ascii strings - * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' - */ - (void) HYPERVISOR_xen_version(XENVER_capabilities, - &xenver[idx].xv_caps); - - cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major, - xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset); - - if (idx == XENVER_BOOT_IDX) - bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX], - sizeof (xenver[XENVER_BOOT_IDX])); -} - -typedef enum xen_hypervisor_check { - XEN_RUN_CHECK, - XEN_SUSPEND_CHECK -} xen_hypervisor_check_t; - -/* - * To run the hypervisor must be 3.0.4 or better. To suspend/resume - * we need 3.0.4 or better and if it is 3.0.4. then it must be provided - * by the Solaris xVM project. - * Checking can be disabled for testing purposes by setting the - * xen_suspend_debug variable. 
- */ -static int -xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) -{ - if (xen_suspend_debug == 1) - return (1); - if (XENVER_CURRENT(xv_major) < 3) - return (0); - if (XENVER_CURRENT(xv_major) > 3) - return (1); - if (XENVER_CURRENT(xv_minor) > 0) - return (1); - if (XENVER_CURRENT(xv_revision) < 4) - return (0); - if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 && - !XENVER_CURRENT(xv_is_xvm)) - return (0); - - return (1); -} - -/* - * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the - * workaround. - */ -static void -xen_pte_workaround(void) -{ -#if defined(__amd64) - extern int pt_kern; - - if (XENVER_CURRENT(xv_major) != 3) - return; - if (XENVER_CURRENT(xv_minor) > 1) - return; - if (XENVER_CURRENT(xv_minor) == 1 && - XENVER_CURRENT(xv_revision) > 1) - return; - if (XENVER_CURRENT(xv_is_xvm)) - return; - - pt_kern = PT_USER; -#endif -} - -void -xen_set_callback(void (*func)(void), uint_t type, uint_t flags) -{ - struct callback_register cb; - - bzero(&cb, sizeof (cb)); -#if defined(__amd64) - cb.address = (ulong_t)func; -#elif defined(__i386) - cb.address.cs = KCS_SEL; - cb.address.eip = (ulong_t)func; -#endif - cb.type = type; - cb.flags = flags; - - /* - * XXPV always ignore return value for NMI - */ - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && - type != CALLBACKTYPE_nmi) - panic("HYPERVISOR_callback_op failed"); -} - -void -xen_init_callbacks(void) -{ - /* - * register event (interrupt) handler. - */ - xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); - - /* - * failsafe handler. - */ - xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, - CALLBACKF_mask_events); - - /* - * NMI handler. - */ - xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); - - /* - * system call handler - * XXPV move to init_cpu_syscall? 
- */ -#if defined(__amd64) - xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, - CALLBACKF_mask_events); -#endif /* __amd64 */ -} - - -/* - * cmn_err() followed by a 1/4 second delay; this gives the - * logging service a chance to flush messages and helps avoid - * intermixing output from prom_printf(). - * XXPV: doesn't exactly help us on UP though. - */ -/*PRINTFLIKE2*/ -void -cpr_err(int ce, const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(ce, fmt, adx); - va_end(adx); - drv_usecwait(MICROSEC >> 2); -} - -void -xen_suspend_devices(void) -{ - int rc; - - SUSPEND_DEBUG("xen_suspend_devices\n"); - - if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) - panic("failed to suspend devices: %d", rc); -} - -void -xen_resume_devices(void) -{ - int rc; - - SUSPEND_DEBUG("xen_resume_devices\n"); - - if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) - panic("failed to resume devices: %d", rc); -} - -/* - * The list of mfn pages is out of date. Recompute it. - */ -static void -rebuild_mfn_list(void) -{ - int i = 0; - size_t sz; - size_t off; - pfn_t pfn; - - SUSPEND_DEBUG("rebuild_mfn_list\n"); - - sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; - - for (off = 0; off < sz; off += MMU_PAGESIZE) { - size_t j = mmu_btop(off); - if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { - pfn = hat_getpfnum(kas.a_hat, - (caddr_t)&mfn_list_pages[j]); - mfn_list_pages_page[i++] = pfn_to_mfn(pfn); - } - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); - mfn_list_pages[j] = pfn_to_mfn(pfn); - } - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list - = pfn_to_mfn(pfn); -} - -static void -suspend_cpus(void) -{ - int i; - - SUSPEND_DEBUG("suspend_cpus\n"); - - mp_enter_barrier(); - - for (i = 1; i < ncpus; i++) { - if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { - SUSPEND_DEBUG("xen_vcpu_down %d\n", i); - (void) xen_vcpu_down(i); - } - - 
mach_cpucontext_reset(cpu[i]); - } -} - -static void -resume_cpus(void) -{ - int i; - - for (i = 1; i < ncpus; i++) { - if (cpu[i] == NULL) - continue; - - if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { - SUSPEND_DEBUG("xen_vcpu_up %d\n", i); - mach_cpucontext_restore(cpu[i]); - (void) xen_vcpu_up(i); - } - } - - mp_leave_barrier(); -} - -/* - * Top level routine to direct suspend/resume of a domain. - */ -void -xen_suspend_domain(void) -{ - extern void rtcsync(void); - extern hrtime_t hres_last_tick; - mfn_t start_info_mfn; - ulong_t flags; - pfn_t pfn; - int i; - - /* - * Check that we are happy to suspend on this hypervisor. - */ - if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) { - cpr_err(CE_WARN, "Cannot suspend on this hypervisor " - "version: v%lu.%lu%s, need at least version v3.0.4 or " - "-xvm based hypervisor", XENVER_CURRENT(xv_major), - XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); - return; - } - - /* - * XXPV - Are we definitely OK to suspend by the time we've connected - * the handler? - */ - - cpr_err(CE_NOTE, "Domain suspending for save/migrate"); - - SUSPEND_DEBUG("xen_suspend_domain\n"); - - /* - * suspend interrupts and devices - * XXPV - we use suspend/resume for both save/restore domains (like sun - * cpr) and for migration. Would be nice to know the difference if - * possible. For save/restore where down time may be a long time, we - * may want to do more of the things that cpr does. (i.e. notify user - * processes, shrink memory footprint for faster restore, etc.) - */ - xen_suspend_devices(); - SUSPEND_DEBUG("xenbus_suspend\n"); - xenbus_suspend(); - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); - start_info_mfn = pfn_to_mfn(pfn); - - /* - * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe - * wrt xenbus being suspended here? - */ - mutex_enter(&cpu_lock); - - /* - * Suspend must be done on vcpu 0, as no context for other CPUs is - * saved. - * - * XXPV - add to taskq API ? 
- */ - thread_affinity_set(curthread, 0); - kpreempt_disable(); - - SUSPEND_DEBUG("xen_start_migrate\n"); - xen_start_migrate(); - if (ncpus > 1) - suspend_cpus(); - - /* - * We can grab the ec_lock as it's a spinlock with a high SPL. Hence - * any holder would have dropped it to get through suspend_cpus(). - */ - mutex_enter(&ec_lock); - - /* - * From here on in, we can't take locks. - */ - SUSPEND_DEBUG("ec_suspend\n"); - ec_suspend(); - SUSPEND_DEBUG("gnttab_suspend\n"); - gnttab_suspend(); - - flags = intr_clear(); - - xpv_time_suspend(); - - /* - * Currently, the hypervisor incorrectly fails to bring back - * powered-down VCPUs. Thus we need to record any powered-down VCPUs - * to prevent any attempts to operate on them. But we have to do this - * *after* the very first time we do ec_suspend(). - */ - for (i = 1; i < ncpus; i++) { - if (cpu[i] == NULL) - continue; - - if (cpu_get_state(cpu[i]) == P_POWEROFF) - CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); - } - - /* - * The dom0 save/migrate code doesn't automatically translate - * these into PFNs, but expects them to be, so we do it here. - * We don't use mfn_to_pfn() because so many OS services have - * been disabled at this point. - */ - xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; - xen_info->console.domU.mfn = - mfn_to_pfn_mapping[xen_info->console.domU.mfn]; - - if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { - prom_printf("xen_suspend_domain(): " - "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, - 0, UVMF_INVLPG)) { - prom_printf("xen_suspend_domain(): " - "HYPERVISOR_update_va_mapping() failed\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - SUSPEND_DEBUG("HYPERVISOR_suspend\n"); - - /* - * At this point we suspend and sometime later resume. 
- */ - if (HYPERVISOR_suspend(start_info_mfn)) { - prom_printf("xen_suspend_domain(): " - "HYPERVISOR_suspend() failed\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - /* - * Point HYPERVISOR_shared_info to its new value. - */ - if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, - xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, - UVMF_INVLPG)) - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - - if (xen_info->nr_pages != mfn_count) { - prom_printf("xen_suspend_domain(): number of pages" - " changed, was 0x%lx, now 0x%lx\n", mfn_count, - xen_info->nr_pages); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - xpv_time_resume(); - - cached_max_mfn = 0; - - SUSPEND_DEBUG("gnttab_resume\n"); - gnttab_resume(); - - /* XXPV: add a note that this must be lockless. */ - SUSPEND_DEBUG("ec_resume\n"); - ec_resume(); - - intr_restore(flags); - - if (ncpus > 1) - resume_cpus(); - - mutex_exit(&ec_lock); - xen_end_migrate(); - mutex_exit(&cpu_lock); - - /* - * Now we can take locks again. - */ - - /* - * Force the tick value used for tv_nsec in hres_tick() to be up to - * date. rtcsync() will reset the hrestime value appropriately. - */ - hres_last_tick = xpv_gethrtime(); - - /* - * XXPV: we need to have resumed the CPUs since this takes locks, but - * can remote CPUs see bad state? Presumably yes. Should probably nest - * taking of todlock inside of cpu_lock, or vice versa, then provide an - * unlocked version. Probably need to call clkinitf to reset cpu freq - * and re-calibrate if we migrated to a different speed cpu. Also need - * to make a (re)init_cpu_info call to update processor info structs - * and device tree info. That remains to be written at the moment. 
- */ - rtcsync(); - - rebuild_mfn_list(); - - SUSPEND_DEBUG("xenbus_resume\n"); - xenbus_resume(); - SUSPEND_DEBUG("xenbus_resume_devices\n"); - xen_resume_devices(); - - thread_affinity_clear(curthread); - kpreempt_enable(); - - SUSPEND_DEBUG("finished xen_suspend_domain\n"); - - /* - * We have restarted our suspended domain, update the hypervisor - * details. NB: This must be done at the end of this function, - * since we need the domain to be completely resumed before - * these functions will work correctly. - */ - xen_set_version(XENVER_CURRENT_IDX); - - /* - * We can check and report a warning, but we don't stop the - * process. - */ - if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) - cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " - "but need at least version v3.0.4", - XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), - XENVER_CURRENT(xv_ver)); - - cmn_err(CE_NOTE, "domain restore/migrate completed"); -} - -/*ARGSUSED*/ -int -xen_debug_handler(void *arg) -{ - debug_enter("External debug event received"); - - /* - * If we've not got KMDB loaded, output some stuff difficult to capture - * from a domain core. 
- */ - if (!(boothowto & RB_DEBUG)) { - shared_info_t *si = HYPERVISOR_shared_info; - int i; - - prom_printf("evtchn_pending [ "); - for (i = 0; i < 8; i++) - prom_printf("%lx ", si->evtchn_pending[i]); - prom_printf("]\nevtchn_mask [ "); - for (i = 0; i < 8; i++) - prom_printf("%lx ", si->evtchn_mask[i]); - prom_printf("]\n"); - - for (i = 0; i < ncpus; i++) { - vcpu_info_t *vcpu = &si->vcpu_info[i]; - if (cpu[i] == NULL) - continue; - prom_printf("CPU%d pending %d mask %d sel %lx\n", - i, vcpu->evtchn_upcall_pending, - vcpu->evtchn_upcall_mask, - vcpu->evtchn_pending_sel); - } - } - - return (0); -} - -/*ARGSUSED*/ -static void -xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - xenbus_transaction_t xbt; - char key = '\0'; - int ret; - -retry: - if (xenbus_transaction_start(&xbt)) { - cmn_err(CE_WARN, "failed to start sysrq transaction"); - return; - } - - if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { - /* - * ENOENT happens in response to our own xenbus_rm. - * XXPV - this happens spuriously on boot? - */ - if (ret != ENOENT) - cmn_err(CE_WARN, "failed to read sysrq: %d", ret); - goto out; - } - - if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { - cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); - goto out; - } - - if (xenbus_transaction_end(xbt, 0) == EAGAIN) - goto retry; - - /* - * Somewhat arbitrary - on Linux this means 'reboot'. We could just - * accept any key, but this might increase the risk of sending a - * harmless sysrq to the wrong domain... 
- */ - if (key == 'b') - (void) xen_debug_handler(NULL); - else - cmn_err(CE_WARN, "Ignored sysrq %c", key); - return; - -out: - (void) xenbus_transaction_end(xbt, 1); -} - -taskq_t *xen_shutdown_tq; - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_REBOOT 1 -#define SHUTDOWN_SUSPEND 2 -#define SHUTDOWN_HALT 3 -#define SHUTDOWN_MAX 4 - -#define SHUTDOWN_TIMEOUT_SECS (60 * 5) - -static const char *cmd_strings[SHUTDOWN_MAX] = { - "poweroff", - "reboot", - "suspend", - "halt" -}; - -static void -xen_dirty_shutdown(void *arg) -{ - int cmd = (uintptr_t)arg; - - cmn_err(CE_WARN, "Externally requested shutdown failed or " - "timed out.\nShutting down.\n"); - - switch (cmd) { - case SHUTDOWN_HALT: - case SHUTDOWN_POWEROFF: - (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); - break; - case SHUTDOWN_REBOOT: - (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); - break; - } -} - -static void -xen_shutdown(void *arg) -{ - int cmd = (uintptr_t)arg; - proc_t *initpp; - - ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); - - if (cmd == SHUTDOWN_SUSPEND) { - xen_suspend_domain(); - return; - } - - switch (cmd) { - case SHUTDOWN_POWEROFF: - force_shutdown_method = AD_POWEROFF; - break; - case SHUTDOWN_HALT: - force_shutdown_method = AD_HALT; - break; - case SHUTDOWN_REBOOT: - force_shutdown_method = AD_BOOT; - break; - } - - /* - * If we're still booting and init(1) isn't set up yet, simply halt. 
- */ - mutex_enter(&pidlock); - initpp = prfind(P_INITPID); - mutex_exit(&pidlock); - if (initpp == NULL) { - extern void halt(char *); - halt("Power off the System"); /* just in case */ - } - - /* - * else, graceful shutdown with inittab and all getting involved - */ - psignal(initpp, SIGPWR); - - (void) timeout(xen_dirty_shutdown, arg, - SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); -} - -/*ARGSUSED*/ -static void -xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char *str; - xenbus_transaction_t xbt; - int err, shutdown_code = SHUTDOWN_INVALID; - unsigned int slen; - -again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { - (void) xenbus_transaction_end(xbt, 1); - return; - } - - SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); - - /* - * If this is a watch fired from our write below, check out early to - * avoid an infinite loop. - */ - if (strcmp(str, "") == 0) { - (void) xenbus_transaction_end(xbt, 0); - kmem_free(str, slen); - return; - } else if (strcmp(str, "poweroff") == 0) { - shutdown_code = SHUTDOWN_POWEROFF; - } else if (strcmp(str, "reboot") == 0) { - shutdown_code = SHUTDOWN_REBOOT; - } else if (strcmp(str, "suspend") == 0) { - shutdown_code = SHUTDOWN_SUSPEND; - } else if (strcmp(str, "halt") == 0) { - shutdown_code = SHUTDOWN_HALT; - } else { - printf("Ignoring shutdown request: %s\n", str); - } - - /* - * XXPV Should we check the value of xenbus_write() too, or are all - * errors automatically folded into xenbus_transaction_end() ?? 
- */ - (void) xenbus_write(xbt, "control", "shutdown", ""); - err = xenbus_transaction_end(xbt, 0); - if (err == EAGAIN) { - SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); - kmem_free(str, slen); - goto again; - } - - kmem_free(str, slen); - if (shutdown_code != SHUTDOWN_INVALID) { - (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, - (void *)(intptr_t)shutdown_code, 0); - } -} - -static struct xenbus_watch shutdown_watch; -static struct xenbus_watch sysrq_watch; - -void -xen_late_startup(void) -{ - if (!DOMAIN_IS_INITDOMAIN(xen_info)) { - xen_shutdown_tq = taskq_create("shutdown_taskq", 1, - maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); - shutdown_watch.node = "control/shutdown"; - shutdown_watch.callback = xen_shutdown_handler; - if (register_xenbus_watch(&shutdown_watch)) - cmn_err(CE_WARN, "Failed to set shutdown watcher"); - - sysrq_watch.node = "control/sysrq"; - sysrq_watch.callback = xen_sysrq_handler; - if (register_xenbus_watch(&sysrq_watch)) - cmn_err(CE_WARN, "Failed to set sysrq watcher"); - } - balloon_init(xen_info->nr_pages); -} - -#ifdef DEBUG -#define XEN_PRINTF_BUFSIZE 1024 - -char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; - -/* - * Printf function that calls hypervisor directly. For DomU it only - * works when running on a xen hypervisor built with debug on. Works - * always since no I/O ring interaction is needed. - */ -/*PRINTFLIKE1*/ -void -xen_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); - va_end(ap); - - (void) HYPERVISOR_console_io(CONSOLEIO_write, - strlen(xen_printf_buffer), xen_printf_buffer); -} -#else -void -xen_printf(const char *fmt, ...) 
-{ -} -#endif /* DEBUG */ - -void -startup_xen_version(void) -{ - xen_set_version(XENVER_BOOT_IDX); - if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) - cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " - "but need at least version v3.0.4", - XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), - XENVER_CURRENT(xv_ver)); - xen_pte_workaround(); -} - -int xen_mca_simulate_mc_physinfo_failure = 0; - -void -startup_xen_mca(void) -{ - if (!DOMAIN_IS_INITDOMAIN(xen_info)) - return; - - xen_phys_ncpus = 0; - xen_phys_cpus = NULL; - - if (xen_mca_simulate_mc_physinfo_failure || - xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) { - cmn_err(CE_WARN, - "%sxen_get_mc_physinfo failure during xen MCA startup: " - "there will be no machine check support", - xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : ""); - return; - } - - xen_phys_cpus = kmem_alloc(xen_phys_ncpus * - sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP); - - if (xen_phys_cpus == NULL) { - cmn_err(CE_WARN, - "xen_get_mc_physinfo failure: can't allocate CPU array"); - return; - } - - if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) { - cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no " - "physical CPU info"); - kmem_free(xen_phys_cpus, - xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t)); - xen_phys_ncpus = 0; - xen_phys_cpus = NULL; - } - - if (xen_physinfo_debug) { - xen_mc_logical_cpu_t *xcp; - unsigned i; - - cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n", - xen_phys_ncpus); - for (i = 0; i < xen_phys_ncpus; i++) { - xcp = &xen_phys_cpus[i]; - cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u", - xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid, - xcp->mc_threadid, xcp->mc_apicid); - } - } -} - -/* - * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 
- */ - -void -xen_set_gdt(ulong_t *frame_list, int entries) -{ - int err; - if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { - /* - * X_EINVAL: reserved entry or bad frames - * X_EFAULT: bad address - */ - panic("xen_set_gdt(%p, %d): error %d", - (void *)frame_list, entries, -(int)err); - } -} - -void -xen_set_ldt(user_desc_t *ldt, uint_t nsels) -{ - struct mmuext_op op; - long err; - - op.cmd = MMUEXT_SET_LDT; - op.arg1.linear_addr = (uintptr_t)ldt; - op.arg2.nr_ents = nsels; - - if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { - panic("xen_set_ldt(%p, %d): error %d", - (void *)ldt, nsels, -(int)err); - } -} - -void -xen_stack_switch(ulong_t ss, ulong_t esp) -{ - long err; - - if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { - /* - * X_EPERM: bad selector - */ - panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, - -(int)err); - } -} - -long -xen_set_trap_table(trap_info_t *table) -{ - long err; - - if ((err = HYPERVISOR_set_trap_table(table)) != 0) { - /* - * X_EFAULT: bad address - * X_EPERM: bad selector - */ - panic("xen_set_trap_table(%p): error %d", (void *)table, - -(int)err); - } - return (err); -} - -#if defined(__amd64) -void -xen_set_segment_base(int reg, ulong_t value) -{ - long err; - - if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { - /* - * X_EFAULT: bad address - * X_EINVAL: bad type - */ - panic("xen_set_segment_base(%d, %lx): error %d", - reg, value, -(int)err); - } -} -#endif /* __amd64 */ - -/* - * Translate a hypervisor errcode to a Solaris error code. 
- */ -int -xen_xlate_errcode(int error) -{ - switch (-error) { - - /* - * Translate hypervisor errno's into native errno's - */ - -#define CASE(num) case X_##num: error = num; break - - CASE(EPERM); CASE(ENOENT); CASE(ESRCH); - CASE(EINTR); CASE(EIO); CASE(ENXIO); - CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); - CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); - CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); - CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); - CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); - CASE(ENODATA); CASE(EAGAIN); - -#undef CASE - - default: - panic("xen_xlate_errcode: unknown error %d", error); - } - - return (error); -} - -/* - * Raise PS_IOPL on current vcpu to user level. - * Caller responsible for preventing kernel preemption. - */ -void -xen_enable_user_iopl(void) -{ - physdev_set_iopl_t set_iopl; - set_iopl.iopl = 3; /* user ring 3 */ - (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -/* - * Drop PS_IOPL on current vcpu to kernel level - */ -void -xen_disable_user_iopl(void) -{ - physdev_set_iopl_t set_iopl; - set_iopl.iopl = 1; /* kernel pseudo ring 1 */ - (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -int -xen_gdt_setprot(cpu_t *cp, uint_t prot) -{ - int err; -#if defined(__amd64) - int pt_bits = PT_VALID; - if (prot & PROT_WRITE) - pt_bits |= PT_WRITABLE; -#endif - - if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt, - MMU_PAGESIZE, prot)) != 0) - goto done; - -#if defined(__amd64) - err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits); -#endif - -done: - if (err) { - cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d", - cp->cpu_id, (prot & PROT_WRITE) ? 
"writable" : "read-only", - err); - } - - return (err); -} - -int -xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) -{ - int err; - caddr_t lva = (caddr_t)ldt; -#if defined(__amd64) - int pt_bits = PT_VALID; - pgcnt_t npgs; - if (prot & PROT_WRITE) - pt_bits |= PT_WRITABLE; -#endif /* __amd64 */ - - if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) - goto done; - -#if defined(__amd64) - - ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); - npgs = mmu_btop(lsize); - while (npgs--) { - if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), - pt_bits)) != 0) - break; - lva += PAGESIZE; - } -#endif /* __amd64 */ - -done: - if (err) { - cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", - (void *)lva, - (prot & PROT_WRITE) ? "writable" : "read-only", err); - } - - return (err); -} - -int -xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus) -{ - struct xen_mc_physcpuinfo cpi; - - cpi.ncpus = *ncpus; - /*LINTED: constant in conditional context*/ - set_xen_guest_handle(cpi.info, log_cpus); - - if (HYPERVISOR_mca(XEN_MC_physcpuinfo, (xen_mc_arg_t *)&cpi) != 0) - return (-1); - - *ncpus = cpi.ncpus; - return (0); -} - -void -print_panic(const char *str) -{ - xen_printf(str); -} - -/* - * Interfaces to iterate over real cpu information, but only that info - * which we choose to expose here. These are of interest to dom0 - * only (and the backing hypercall should not work for domu). 
- */ - -xen_mc_lcpu_cookie_t -xen_physcpu_next(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie; - - if (!DOMAIN_IS_INITDOMAIN(xen_info)) - return (NULL); - - if (cookie == NULL) - return ((xen_mc_lcpu_cookie_t)xen_phys_cpus); - - if (xcp == xen_phys_cpus + xen_phys_ncpus - 1) - return (NULL); - else - return ((xen_mc_lcpu_cookie_t)++xcp); -} - -#define COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c)) - -const char * -xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); - - return ((const char *)&xcp->mc_vendorid[0]); -} - -int -xen_physcpu_family(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_family); -} - -int -xen_physcpu_model(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_model); -} - -int -xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_step); -} - -id_t -xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_chipid); -} - -id_t -xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_coreid); -} - -id_t -xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_threadid); -} - -id_t -xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_cpunr); -} - -boolean_t -xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_nthreads > 1); -} - -uint64_t -xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); - - /* - * Need to #define the indices, or search through the array. 
- */ - return (xcp->mc_msrvalues[0].value); -} - -int -xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count, - boolean_t uvaddr) -{ - long rc; - uint_t i; - - ASSERT(cmd == GNTTABOP_map_grant_ref); - -#if !defined(_BOOT) - if (uvaddr == B_FALSE) { - for (i = 0; i < count; ++i) { - mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0); - } - } -#endif - - rc = HYPERVISOR_grant_table_op(cmd, mapop, count); - - return (rc); -} - -static int -xpv_get_physinfo(xen_sysctl_physinfo_t *pi) -{ - xen_sysctl_t op; - struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node; - int ret; - - bzero(&op, sizeof (op)); - op.cmd = XEN_SYSCTL_physinfo; - op.interface_version = XEN_SYSCTL_INTERFACE_VERSION; - /*LINTED: constant in conditional context*/ - set_xen_guest_handle(*sp, NULL); - - ret = HYPERVISOR_sysctl(&op); - - if (ret != 0) - return (xen_xlate_errcode(ret)); - - bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo)); - return (0); -} - -/* - * On dom0, we can determine the number of physical cpus on the machine. - * This number is important when figuring out what workarounds are - * appropriate, so compute it now. 
- */ -uint_t -xpv_nr_phys_cpus(void) -{ - static uint_t nphyscpus = 0; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if (nphyscpus == 0) { - xen_sysctl_physinfo_t pi; - int ret; - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - nphyscpus = pi.nr_cpus; - } - return (nphyscpus); -} - -pgcnt_t -xpv_nr_phys_pages(void) -{ - xen_sysctl_physinfo_t pi; - int ret; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - - return ((pgcnt_t)pi.total_pages); -} - -uint64_t -xpv_cpu_khz(void) -{ - xen_sysctl_physinfo_t pi; - int ret; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - return ((uint64_t)pi.cpu_khz); -} --- /dev/null Mon Apr 6 14:23:40 2009 +++ new/usr/src/uts/i86xpv/os/xpv_platdep.c Mon Apr 6 14:23:39 2009 @@ -0,0 +1,1217 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* derived from netbsd's xen_machdep.c 1.1.2.1 */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. This section intentionally left blank. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Section 3 of the above license was updated in response to bug 6379571. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +/* XXX 3.3. 
TODO remove this include */ +#include + +uint_t xen_phys_ncpus; +xen_mc_logical_cpu_t *xen_phys_cpus; +int xen_physinfo_debug = 0; + +/* + * Determine helpful version information. + * + * (And leave copies in the data segment so we can look at them later + * with e.g. kmdb.) + */ + +typedef enum xen_version { + XENVER_BOOT_IDX, + XENVER_CURRENT_IDX +} xen_version_t; + +struct xenver { + ulong_t xv_major; + ulong_t xv_minor; + ulong_t xv_revision; + xen_extraversion_t xv_ver; + ulong_t xv_is_xvm; + xen_changeset_info_t xv_chgset; + xen_compile_info_t xv_build; + xen_capabilities_info_t xv_caps; +} xenver[2]; + +#define XENVER_BOOT(m) (xenver[XENVER_BOOT_IDX].m) +#define XENVER_CURRENT(m) (xenver[XENVER_CURRENT_IDX].m) + +/* + * Update the xenver data. We maintain two copies, boot and + * current. If we are setting the boot, then also set current. + */ +static void +xen_set_version(xen_version_t idx) +{ + ulong_t ver; + + bzero(&xenver[idx], sizeof (xenver[idx])); + + ver = HYPERVISOR_xen_version(XENVER_version, 0); + + xenver[idx].xv_major = BITX(ver, 31, 16); + xenver[idx].xv_minor = BITX(ver, 15, 0); + + (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver); + + /* + * The revision is buried in the extraversion information that is + * maintained by the hypervisor. For our purposes we expect that + * the revision number is: + * - the second character in the extraversion information + * - one character long + * - numeric digit + * If it isn't then we can't extract the revision and we leave it + * set to 0. 
+ */ + if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1])) + xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0'; + else + cmn_err(CE_WARN, "Cannot extract revision on this hypervisor " + "version: v%s, unexpected version format", + xenver[idx].xv_ver); + + xenver[idx].xv_is_xvm = 0; + + if (strlen(xenver[idx].xv_ver) >= 4 && + strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4, + "-xvm", 4) == 0) + xenver[idx].xv_is_xvm = 1; + + (void) HYPERVISOR_xen_version(XENVER_changeset, + &xenver[idx].xv_chgset); + + (void) HYPERVISOR_xen_version(XENVER_compile_info, + &xenver[idx].xv_build); + /* + * Capabilities are a set of space separated ascii strings + * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' + */ + (void) HYPERVISOR_xen_version(XENVER_capabilities, + &xenver[idx].xv_caps); + + cmn_err(CE_CONT, "!v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major, + xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset); + + if (idx == XENVER_BOOT_IDX) + bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX], + sizeof (xenver[XENVER_BOOT_IDX])); +} + +void +xen_reset_version(void) +{ + xen_set_version(XENVER_CURRENT_IDX); +} + +typedef enum xen_hypervisor_check { + XEN_RUN_CHECK, + XEN_SUSPEND_CHECK +} xen_hypervisor_check_t; + +/* + * To run the hypervisor must be 3.0.4 or better. To suspend/resume + * we need 3.0.4 or better and if it is 3.0.4. then it must be provided + * by the Solaris xVM project. + * Checking can be disabled for testing purposes by setting the + * xen_suspend_debug variable. 
+ */ +static int +xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) +{ + if (XENVER_CURRENT(xv_major) < 3) + return (0); + if (XENVER_CURRENT(xv_major) > 3) + return (1); + if (XENVER_CURRENT(xv_minor) > 0) + return (1); + if (XENVER_CURRENT(xv_revision) < 4) + return (0); + if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 && + !XENVER_CURRENT(xv_is_xvm)) + return (0); + + return (1); +} + +/* + * Check that we are happy to suspend on this hypervisor. + */ +int +xen_hypervisor_supports_suspend(void) +{ + if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK)) + return (1); + + cmn_err(CE_WARN, "Cannot suspend on this hypervisor " + "version: v%lu.%lu%s, need at least version v3.0.4 or " + "-xvm based hypervisor", XENVER_CURRENT(xv_major), + XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); + return (0); +} + +/* + * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the + * workaround. + */ +static void +xen_pte_workaround(void) +{ +#if defined(__amd64) + extern int pt_kern; + + if (XENVER_CURRENT(xv_major) != 3) + return; + if (XENVER_CURRENT(xv_minor) > 1) + return; + if (XENVER_CURRENT(xv_minor) == 1 && + XENVER_CURRENT(xv_revision) > 1) + return; + if (XENVER_CURRENT(xv_is_xvm)) + return; + + pt_kern = PT_USER; +#endif +} + +/* + * 3.2 upwards support 32-bit syscall/sysenter support. + */ +static int +syscall32_support(void) +{ +#if defined(__amd64) + /* Too early to use version code above. 
*/
+	ulong_t ver = HYPERVISOR_xen_version(XENVER_version, 0);
+
+	if (BITX(ver, 31, 16) < 3)
+		return (0);
+	return (BITX(ver, 15, 0) >= 2);
+#else
+	return (0);
+#endif
+}
+
+/*
+ * Clear selected feature bits in the CPUID register image *cp.
+ *
+ *	vendor	X86_VENDOR_* value; selects the vendor-specific leaf below
+ *	eax	the CPUID leaf (function number) that *cp was read from
+ *	cp	register image, modified in place
+ *
+ * Leaf 1: EDX loses PSE, VME, DE, MTRR, PGE, PAT, PSE36 and HTT, plus
+ * MCA when we are not the initial domain and SYSC/SEP when
+ * syscall32_support() returns 0; ECX loses MON.
+ * Leaf 0x80000001: EDX loses PSE, VME, DE, MTRR, PGE, PAT, PSE36 and
+ * TSCP, plus SYSC/SEP under the same condition; ECX loses CMP_LGCY.
+ * Intel leaf 4 and AMD leaf 0x80000008 additionally have their
+ * (ncores-per-chip - 1) field zeroed.
+ * Every other leaf is left untouched.
+ */
+void
+plat_mask_cpuid(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
+{
+	uint32_t syscall32_mask = 0;
+	uint32_t mca_mask = 0;
+
+	if (!DOMAIN_IS_INITDOMAIN(xen_info))
+		mca_mask = CPUID_INTC_EDX_MCA;
+
+	if (!syscall32_support())
+		syscall32_mask = CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP;
+
+	switch (eax) {
+	case 1:
+		cp->cp_edx &=
+		    ~(mca_mask | syscall32_mask |
+		    CPUID_INTC_EDX_PSE | CPUID_INTC_EDX_VME |
+		    CPUID_INTC_EDX_DE | CPUID_INTC_EDX_MTRR |
+		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
+		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
+
+		cp->cp_ecx &= ~CPUID_INTC_ECX_MON;
+		break;
+
+	case 0x80000001:
+		cp->cp_edx &=
+		    ~(syscall32_mask | CPUID_AMD_EDX_PSE |
+		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
+		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
+		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
+		    CPUID_AMD_EDX_TSCP);
+
+		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
+		break;
+	default:
+		break;
+	}
+
+	switch (vendor) {
+	case X86_VENDOR_Intel:
+		switch (eax) {
+		case 4:
+			/*
+			 * Zero out the (ncores-per-chip - 1) field
+			 * XXPV: why?
+			 *
+			 * The field is EAX[31:26], so keep bits 25:0
+			 * only (eight hex digits: 0x03ffffff).
+			 */
+			cp->cp_eax &= 0x03ffffff;
+			break;
+		default:
+			break;
+		}
+		break;
+	case X86_VENDOR_AMD:
+		switch (eax) {
+		case 0x80000008:
+			/*
+			 * Zero out the (ncores-per-chip - 1) field
+			 * XXPV: why?
+			 *
+			 * The field is ECX[7:0].
+			 */
+			cp->cp_ecx &= 0xffffff00;
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*ARGSUSED*/
+int
+xen_debug_handler(void *arg)
+{
+	debug_enter("External debug event received");
+
+	/*
+	 * If we've not got KMDB loaded, output some stuff difficult to capture
+	 * from a domain core.
+ */ + if (!(boothowto & RB_DEBUG)) { + shared_info_t *si = HYPERVISOR_shared_info; + int i; + + prom_printf("evtchn_pending [ "); + for (i = 0; i < 8; i++) + prom_printf("%lx ", si->evtchn_pending[i]); + prom_printf("]\nevtchn_mask [ "); + for (i = 0; i < 8; i++) + prom_printf("%lx ", si->evtchn_mask[i]); + prom_printf("]\n"); + + for (i = 0; i < ncpus; i++) { + vcpu_info_t *vcpu = &si->vcpu_info[i]; + if (cpu[i] == NULL) + continue; + prom_printf("CPU%d pending %d mask %d sel %lx\n", + i, vcpu->evtchn_upcall_pending, + vcpu->evtchn_upcall_mask, + vcpu->evtchn_pending_sel); + } + } + + return (0); +} + +/*ARGSUSED*/ +static void +xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + xenbus_transaction_t xbt; + char key = '\0'; + int ret; + +retry: + if (xenbus_transaction_start(&xbt)) { + cmn_err(CE_WARN, "failed to start sysrq transaction"); + return; + } + + if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { + /* + * ENOENT happens in response to our own xenbus_rm. + * XXPV - this happens spuriously on boot? + */ + if (ret != ENOENT) + cmn_err(CE_WARN, "failed to read sysrq: %d", ret); + goto out; + } + + if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { + cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); + goto out; + } + + if (xenbus_transaction_end(xbt, 0) == EAGAIN) + goto retry; + + /* + * Somewhat arbitrary - on Linux this means 'reboot'. We could just + * accept any key, but this might increase the risk of sending a + * harmless sysrq to the wrong domain... 
+ */ + if (key == 'b') + (void) xen_debug_handler(NULL); + else + cmn_err(CE_WARN, "Ignored sysrq %c", key); + return; + +out: + (void) xenbus_transaction_end(xbt, 1); +} + +taskq_t *xen_shutdown_tq; + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_REBOOT 1 +#define SHUTDOWN_SUSPEND 2 +#define SHUTDOWN_HALT 3 +#define SHUTDOWN_MAX 4 + +#define SHUTDOWN_TIMEOUT_SECS (60 * 5) + +static const char *cmd_strings[SHUTDOWN_MAX] = { + "poweroff", + "reboot", + "suspend", + "halt" +}; + +static void +xen_dirty_shutdown(void *arg) +{ + int cmd = (uintptr_t)arg; + + cmn_err(CE_WARN, "Externally requested shutdown failed or " + "timed out.\nShutting down.\n"); + + switch (cmd) { + case SHUTDOWN_HALT: + case SHUTDOWN_POWEROFF: + (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); + break; + case SHUTDOWN_REBOOT: + (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); + break; + } +} + +static void +xen_shutdown(void *arg) +{ + int cmd = (uintptr_t)arg; + proc_t *initpp; + + ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); + + if (cmd == SHUTDOWN_SUSPEND) { + extern void xen_suspend_domain(); + xen_suspend_domain(); + return; + } + + switch (cmd) { + case SHUTDOWN_POWEROFF: + force_shutdown_method = AD_POWEROFF; + break; + case SHUTDOWN_HALT: + force_shutdown_method = AD_HALT; + break; + case SHUTDOWN_REBOOT: + force_shutdown_method = AD_BOOT; + break; + } + + /* + * If we're still booting and init(1) isn't set up yet, simply halt. 
+ */ + mutex_enter(&pidlock); + initpp = prfind(P_INITPID); + mutex_exit(&pidlock); + if (initpp == NULL) { + extern void halt(char *); + halt("Power off the System"); /* just in case */ + } + + /* + * else, graceful shutdown with inittab and all getting involved + */ + psignal(initpp, SIGPWR); + + (void) timeout(xen_dirty_shutdown, arg, + SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); +} + +/*ARGSUSED*/ +static void +xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char *str; + xenbus_transaction_t xbt; + int err, shutdown_code = SHUTDOWN_INVALID; + unsigned int slen; + +again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { + (void) xenbus_transaction_end(xbt, 1); + return; + } + + /* + * If this is a watch fired from our write below, check out early to + * avoid an infinite loop. + */ + if (strcmp(str, "") == 0) { + (void) xenbus_transaction_end(xbt, 0); + kmem_free(str, slen); + return; + } else if (strcmp(str, "poweroff") == 0) { + shutdown_code = SHUTDOWN_POWEROFF; + } else if (strcmp(str, "reboot") == 0) { + shutdown_code = SHUTDOWN_REBOOT; + } else if (strcmp(str, "suspend") == 0) { + shutdown_code = SHUTDOWN_SUSPEND; + } else if (strcmp(str, "halt") == 0) { + shutdown_code = SHUTDOWN_HALT; + } else { + printf("Ignoring shutdown request: %s\n", str); + } + + /* + * XXPV Should we check the value of xenbus_write() too, or are all + * errors automatically folded into xenbus_transaction_end() ?? 
+ */ + (void) xenbus_write(xbt, "control", "shutdown", ""); + err = xenbus_transaction_end(xbt, 0); + if (err == EAGAIN) { + kmem_free(str, slen); + goto again; + } + + kmem_free(str, slen); + if (shutdown_code != SHUTDOWN_INVALID) { + (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, + (void *)(intptr_t)shutdown_code, 0); + } +} + +static struct xenbus_watch shutdown_watch; +static struct xenbus_watch sysrq_watch; + +void +xen_late_startup(void) +{ + if (!DOMAIN_IS_INITDOMAIN(xen_info)) { + xen_shutdown_tq = taskq_create("shutdown_taskq", 1, + maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); + shutdown_watch.node = "control/shutdown"; + shutdown_watch.callback = xen_shutdown_handler; + if (register_xenbus_watch(&shutdown_watch)) + cmn_err(CE_WARN, "Failed to set shutdown watcher"); + + sysrq_watch.node = "control/sysrq"; + sysrq_watch.callback = xen_sysrq_handler; + if (register_xenbus_watch(&sysrq_watch)) + cmn_err(CE_WARN, "Failed to set sysrq watcher"); + } + balloon_init(xen_info->nr_pages); +} + +#ifdef DEBUG +#define XEN_PRINTF_BUFSIZE 1024 + +char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; + +/* + * Printf function that calls hypervisor directly. For DomU it only + * works when running on a xen hypervisor built with debug on. Works + * always since no I/O ring interaction is needed. + */ +/*PRINTFLIKE1*/ +void +xen_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); + va_end(ap); + + (void) HYPERVISOR_console_io(CONSOLEIO_write, + strlen(xen_printf_buffer), xen_printf_buffer); +} +#else +void +xen_printf(const char *fmt, ...) 
+{ +} +#endif /* DEBUG */ + +void +startup_xen_version(void) +{ + xen_set_version(XENVER_BOOT_IDX); + if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) + cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " + "but need at least version v3.0.4", + XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), + XENVER_CURRENT(xv_ver)); + xen_pte_workaround(); +} + +int xen_mca_simulate_mc_physinfo_failure = 0; + +void +startup_xen_mca(void) +{ + if (!DOMAIN_IS_INITDOMAIN(xen_info)) + return; + + xen_phys_ncpus = 0; + xen_phys_cpus = NULL; + + if (xen_mca_simulate_mc_physinfo_failure || + xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) { + cmn_err(CE_WARN, + "%sxen_get_mc_physinfo failure during xen MCA startup: " + "there will be no machine check support", + xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : ""); + return; + } + + xen_phys_cpus = kmem_alloc(xen_phys_ncpus * + sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP); + + if (xen_phys_cpus == NULL) { + cmn_err(CE_WARN, + "xen_get_mc_physinfo failure: can't allocate CPU array"); + return; + } + + if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) { + cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no " + "physical CPU info"); + kmem_free(xen_phys_cpus, + xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t)); + xen_phys_ncpus = 0; + xen_phys_cpus = NULL; + } + + if (xen_physinfo_debug) { + xen_mc_logical_cpu_t *xcp; + unsigned i; + + cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n", + xen_phys_ncpus); + for (i = 0; i < xen_phys_ncpus; i++) { + xcp = &xen_phys_cpus[i]; + cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u", + xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid, + xcp->mc_threadid, xcp->mc_apicid); + } + } +} + +/* + * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 
+ */
+
+/*
+ * Issue HYPERVISOR_set_gdt() for frame_list/entries; panics, reporting
+ * the negated hypervisor error, if the hypercall fails.
+ */
+void
+xen_set_gdt(ulong_t *frame_list, int entries)
+{
+	int err;
+
+	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
+		/*
+		 * X_EINVAL: reserved entry or bad frames
+		 * X_EFAULT: bad address
+		 */
+		panic("xen_set_gdt(%p, %d): error %d",
+		    (void *)frame_list, entries, -(int)err);
+	}
+}
+
+/*
+ * Issue an MMUEXT_SET_LDT mmuext op on DOMID_SELF with ldt as the linear
+ * address and nsels as the entry count; panics if the hypercall fails.
+ */
+void
+xen_set_ldt(user_desc_t *ldt, uint_t nsels)
+{
+	struct mmuext_op op;
+	long err;
+
+	op.cmd = MMUEXT_SET_LDT;
+	op.arg1.linear_addr = (uintptr_t)ldt;
+	op.arg2.nr_ents = nsels;
+
+	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
+		/* nsels is unsigned, so it is printed with %u */
+		panic("xen_set_ldt(%p, %u): error %d",
+		    (void *)ldt, nsels, -(int)err);
+	}
+}
+
+/*
+ * Issue HYPERVISOR_stack_switch(ss, esp); panics if the hypercall fails.
+ */
+void
+xen_stack_switch(ulong_t ss, ulong_t esp)
+{
+	long err;
+
+	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
+		/*
+		 * X_EPERM: bad selector
+		 */
+		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
+		    -(int)err);
+	}
+}
+
+/*
+ * Issue HYPERVISOR_set_trap_table(table); panics if the hypercall fails.
+ * The value returned is the hypercall result, which is 0 on every path
+ * that does not panic.
+ */
+long
+xen_set_trap_table(trap_info_t *table)
+{
+	long err;
+
+	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
+		/*
+		 * X_EFAULT: bad address
+		 * X_EPERM: bad selector
+		 */
+		panic("xen_set_trap_table(%p): error %d", (void *)table,
+		    -(int)err);
+	}
+	return (err);
+}
+
+#if defined(__amd64)
+/*
+ * Issue HYPERVISOR_set_segment_base(reg, value); panics if the hypercall
+ * fails.  Only built for amd64.
+ */
+void
+xen_set_segment_base(int reg, ulong_t value)
+{
+	long err;
+
+	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
+		/*
+		 * X_EFAULT: bad address
+		 * X_EINVAL: bad type
+		 */
+		panic("xen_set_segment_base(%d, %lx): error %d",
+		    reg, value, -(int)err);
+	}
+}
+#endif	/* __amd64 */
+
+/*
+ * Translate a hypervisor errcode to a Solaris error code.
+ */
+int
+xen_xlate_errcode(int error)
+{
+	switch (-error) {
+
+	/*
+	 * Translate hypervisor errno's into native errno's
+	 */
+
+#define	CASE(num)	case X_##num: error = num; break
+
+	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
+	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
+	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
+	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
+	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
+	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
+	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
+	CASE(ENODATA);	CASE(EAGAIN);
+
+#undef CASE
+
+	default:
+		panic("xen_xlate_errcode: unknown error %d", error);
+	}
+
+	return (error);
+}
+
+/*
+ * Raise PS_IOPL on current vcpu to user level.
+ * Caller responsible for preventing kernel preemption.
+ */
+void
+xen_enable_user_iopl(void)
+{
+	physdev_set_iopl_t set_iopl;
+	set_iopl.iopl = 3;		/* user ring 3 */
+	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+/*
+ * Drop PS_IOPL on current vcpu to kernel level
+ */
+void
+xen_disable_user_iopl(void)
+{
+	physdev_set_iopl_t set_iopl;
+	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
+	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+/*
+ * Apply protection prot to the MMU_PAGESIZE region at cp->cpu_gdt through
+ * as_setprot() on kas.  On amd64 the page frame derived from
+ * cp->cpu_m.mcpu_gdtpa is also handed to xen_kpm_page() with PT_VALID,
+ * plus PT_WRITABLE when prot includes PROT_WRITE.
+ *
+ * Returns 0 on success; otherwise logs a warning and returns the first
+ * non-zero error.
+ */
+int
+xen_gdt_setprot(cpu_t *cp, uint_t prot)
+{
+	int err;
+#if defined(__amd64)
+	int pt_bits = PT_VALID;
+	if (prot & PROT_WRITE)
+		pt_bits |= PT_WRITABLE;
+#endif
+
+	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
+	    MMU_PAGESIZE, prot)) != 0)
+		goto done;
+
+#if defined(__amd64)
+	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
+#endif
+
+done:
+	if (err) {
+		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
+		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
+		    err);
+	}
+
+	return (err);
+}
+
+/*
+ * Apply protection prot to the lsize bytes at ldt through as_setprot() on
+ * kas.  On amd64, lsize must be a multiple of PAGESIZE (asserted) and
+ * each page's frame, looked up with hat_getpfnum(), is also handed to
+ * xen_kpm_page() with PT_VALID, plus PT_WRITABLE when prot includes
+ * PROT_WRITE; the walk stops at the first failure.
+ *
+ * Returns 0 on success; otherwise logs a warning and returns the error.
+ * The address in the warning is lva, which on amd64 has advanced to the
+ * page whose xen_kpm_page() call failed.
+ */
+int
+xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
+{
+	int err;
+	caddr_t	lva = (caddr_t)ldt;
+#if defined(__amd64)
+	int pt_bits = PT_VALID;
+	pgcnt_t npgs;
+	if (prot & PROT_WRITE)
+		pt_bits |= PT_WRITABLE;
+#endif	/* __amd64 */
+
+	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
+		goto done;
+
+#if defined(__amd64)
+
+	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
+	npgs = mmu_btop(lsize);
+	while (npgs--) {
+		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
+		    pt_bits)) != 0)
+			break;
+		lva += PAGESIZE;
+	}
+#endif	/* __amd64 */
+
+done:
+	if (err) {
+		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
+		    (void *)lva,
+		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
+	}
+
+	return (err);
+}
+
+/*
+ * Issue the XEN_MC_physcpuinfo MCA hypercall.
+ *
+ *	log_cpus	buffer handed to the hypervisor as cpi.info
+ *	ncpus		in: copied into cpi.ncpus before the call;
+ *			out: overwritten with cpi.ncpus on success
+ *
+ * Returns 0 on success and -1 if the hypercall returns non-zero, in
+ * which case *ncpus is left unchanged.
+ */
+int
+xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus)
+{
+	struct xen_mc_physcpuinfo cpi;
+
+	cpi.ncpus = *ncpus;
+	/*LINTED: constant in conditional context*/
+	set_xen_guest_handle(cpi.info, log_cpus);
+
+	if (HYPERVISOR_mca(XEN_MC_physcpuinfo, (xen_mc_arg_t *)&cpi) != 0)
+		return (-1);
+
+	*ncpus = cpi.ncpus;
+	return (0);
+}
+
+/*
+ * Emit str through xen_printf().
+ *
+ * str is supplied as the argument of a "%s" format rather than as the
+ * format itself: this function takes no further arguments, so any '%'
+ * conversion inside str would make the formatter read arguments that
+ * were never passed.
+ */
+void
+print_panic(const char *str)
+{
+	xen_printf("%s", str);
+}
+
+/*
+ * Interfaces to iterate over real cpu information, but only that info
+ * which we choose to expose here. These are of interest to dom0
+ * only (and the backing hypercall should not work for domu).
+ */ + +xen_mc_lcpu_cookie_t +xen_physcpu_next(xen_mc_lcpu_cookie_t cookie) +{ + xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie; + + if (!DOMAIN_IS_INITDOMAIN(xen_info)) + return (NULL); + + if (cookie == NULL) + return ((xen_mc_lcpu_cookie_t)xen_phys_cpus); + + if (xcp == xen_phys_cpus + xen_phys_ncpus - 1) + return (NULL); + else + return ((xen_mc_lcpu_cookie_t)++xcp); +} + +#define COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c)) + +const char * +xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie) +{ + xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); + + return ((const char *)&xcp->mc_vendorid[0]); +} + +int +xen_physcpu_family(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_family); +} + +int +xen_physcpu_model(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_model); +} + +int +xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_step); +} + +id_t +xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_chipid); +} + +id_t +xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_coreid); +} + +id_t +xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_threadid); +} + +id_t +xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_cpunr); +} + +boolean_t +xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie) +{ + return (COOKIE2XCP(cookie)->mc_nthreads > 1); +} + +uint64_t +xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie) +{ + xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); + + /* + * Need to #define the indices, or search through the array. 
+ */ + return (xcp->mc_msrvalues[0].value); +} + +int +xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count, + boolean_t uvaddr) +{ + long rc; + uint_t i; + + ASSERT(cmd == GNTTABOP_map_grant_ref); + +#if !defined(_BOOT) + if (uvaddr == B_FALSE) { + for (i = 0; i < count; ++i) { + mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0); + } + } +#endif + + rc = HYPERVISOR_grant_table_op(cmd, mapop, count); + + return (rc); +} + +static int +xpv_get_physinfo(xen_sysctl_physinfo_t *pi) +{ + xen_sysctl_t op; + struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node; + int ret; + + bzero(&op, sizeof (op)); + op.cmd = XEN_SYSCTL_physinfo; + op.interface_version = XEN_SYSCTL_INTERFACE_VERSION; + /*LINTED: constant in conditional context*/ + set_xen_guest_handle(*sp, NULL); + + ret = HYPERVISOR_sysctl(&op); + + if (ret != 0) + return (xen_xlate_errcode(ret)); + + bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo)); + return (0); +} + +/* + * On dom0, we can determine the number of physical cpus on the machine. + * This number is important when figuring out what workarounds are + * appropriate, so compute it now. 
+ */ +uint_t +xpv_nr_phys_cpus(void) +{ + static uint_t nphyscpus = 0; + + ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); + + if (nphyscpus == 0) { + xen_sysctl_physinfo_t pi; + int ret; + + if ((ret = xpv_get_physinfo(&pi)) != 0) + panic("xpv_get_physinfo() failed: %d\n", ret); + nphyscpus = pi.nr_cpus; + } + return (nphyscpus); +} + +pgcnt_t +xpv_nr_phys_pages(void) +{ + xen_sysctl_physinfo_t pi; + int ret; + + ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); + + if ((ret = xpv_get_physinfo(&pi)) != 0) + panic("xpv_get_physinfo() failed: %d\n", ret); + + return ((pgcnt_t)pi.total_pages); +} + +uint64_t +xpv_cpu_khz(void) +{ + xen_sysctl_physinfo_t pi; + int ret; + + ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); + + if ((ret = xpv_get_physinfo(&pi)) != 0) + panic("xpv_get_physinfo() failed: %d\n", ret); + return ((uint64_t)pi.cpu_khz); +} + +static void +xen_set_callback(void (*func)(void), uint_t type, uint_t flags) +{ + struct callback_register cb; + + bzero(&cb, sizeof (cb)); +#if defined(__amd64) + cb.address = (ulong_t)func; +#elif defined(__i386) + cb.address.cs = KCS_SEL; + cb.address.eip = (ulong_t)func; +#endif + cb.type = type; + cb.flags = flags; + + /* + * NMI callback doesn't exist on earlier Xen versions, so + * silently ignore failure. + */ + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && + type != CALLBACKTYPE_nmi) + panic("HYPERVISOR_callback_op failed"); +} + +void +xen_init_callbacks(void) +{ + /* + * register event (interrupt) handler. + */ + xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); + + /* + * failsafe handler. 
+ */ + xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, + CALLBACKF_mask_events); +} + +#if defined(__amd64) +/*ARGSUSED*/ +static void +set_syscall_handlers(void (*syscall)(void), void (*syscall32)(void), + void (*sysenter)(void)) +{ + if (x86_feature & X86_ASYSC) { + xen_set_callback(syscall, CALLBACKTYPE_syscall, + CALLBACKF_mask_events); + if (syscall32_support()) { + xen_set_callback(syscall32, CALLBACKTYPE_syscall32, + CALLBACKF_mask_events); + } + } + + if (syscall32_support() && (x86_feature & X86_SEP)) { + xen_set_callback(sysenter, CALLBACKTYPE_sysenter, + CALLBACKF_mask_events); + } +} +#else +#define set_syscall_handlers(sc, sc32, se) /* nothing */ +#endif /* __amd64 */ + +/*ARGSUSED*/ +void +init_cpu_syscall(struct cpu *cp) +{ + set_syscall_handlers(sys_syscall, sys_syscall32, sys_sysenter); + + /* + * Setup NMI handler here, for want of a better place: it's + * per-CPU, but we can't set it in the initializing context. + */ + xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); +} + +void +brand_interpositioning_enable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) { + idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc; + xen_idt_write(&idt[brand_tbl[i].ih_inum], + brand_tbl[i].ih_inum); + } + + set_syscall_handlers(brand_sys_syscall, brand_sys_syscall32, + brand_sys_sysenter); +} + +/* + * Disable interpositioning on the system call path by rewriting the + * syscall entry points to use the standard entry points. 
+ */ +void +brand_interpositioning_disable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) { + xen_idt_write(&idt[brand_tbl[i].ih_inum], + brand_tbl[i].ih_inum); + } + + set_syscall_handlers(sys_syscall, sys_syscall32, sys_sysenter); +} + +void cpu_fast_syscall_disable(void) +{ + /* FIXME */ +} + +void cpu_fast_syscall_enable(void) +{ + /* FIXME: if we need to do this, we'll have to point to a stub + * syscall handler that delivers a #gp or whatever. See + * ldt_rewrite_syscall(). + */ +} --- old/usr/src/uts/i86xpv/os/xen_machdep.c Mon Apr 6 14:23:42 2009 +++ /dev/null Mon Apr 6 14:23:42 2009 @@ -1,1392 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* derived from netbsd's xen_machdep.c 1.1.2.1 */ - -/* - * - * Copyright (c) 2004 Christian Limpach. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. This section intentionally left blank. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -/* - * Section 3 of the above license was updated in response to bug 6379571. - */ - -#include - -/* XXX 3.3. TODO remove this include */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf -#else -#define SUSPEND_DEBUG(...) 
-#endif - -int cpr_debug; -cpuset_t cpu_suspend_lost_set; -static int xen_suspend_debug; - -uint_t xen_phys_ncpus; -xen_mc_logical_cpu_t *xen_phys_cpus; -int xen_physinfo_debug = 0; - -/* - * Determine helpful version information. - * - * (And leave copies in the data segment so we can look at them later - * with e.g. kmdb.) - */ - -typedef enum xen_version { - XENVER_BOOT_IDX, - XENVER_CURRENT_IDX -} xen_version_t; - -struct xenver { - ulong_t xv_major; - ulong_t xv_minor; - ulong_t xv_revision; - xen_extraversion_t xv_ver; - ulong_t xv_is_xvm; - xen_changeset_info_t xv_chgset; - xen_compile_info_t xv_build; - xen_capabilities_info_t xv_caps; -} xenver[2]; - -#define XENVER_BOOT(m) (xenver[XENVER_BOOT_IDX].m) -#define XENVER_CURRENT(m) (xenver[XENVER_CURRENT_IDX].m) - -/* - * Update the xenver data. We maintain two copies, boot and - * current. If we are setting the boot, then also set current. - */ -static void -xen_set_version(xen_version_t idx) -{ - ulong_t ver; - - bzero(&xenver[idx], sizeof (xenver[idx])); - - ver = HYPERVISOR_xen_version(XENVER_version, 0); - - xenver[idx].xv_major = BITX(ver, 31, 16); - xenver[idx].xv_minor = BITX(ver, 15, 0); - - (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver); - - /* - * The revision is buried in the extraversion information that is - * maintained by the hypervisor. For our purposes we expect that - * the revision number is: - * - the second character in the extraversion information - * - one character long - * - numeric digit - * If it isn't then we can't extract the revision and we leave it - * set to 0. 
- */ - if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1])) - xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0'; - else - cmn_err(CE_WARN, "Cannot extract revision on this hypervisor " - "version: v%s, unexpected version format", - xenver[idx].xv_ver); - - xenver[idx].xv_is_xvm = 0; - - if (strlen(xenver[idx].xv_ver) >= 4 && - strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4, - "-xvm", 4) == 0) - xenver[idx].xv_is_xvm = 1; - - (void) HYPERVISOR_xen_version(XENVER_changeset, - &xenver[idx].xv_chgset); - - (void) HYPERVISOR_xen_version(XENVER_compile_info, - &xenver[idx].xv_build); - /* - * Capabilities are a set of space separated ascii strings - * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' - */ - (void) HYPERVISOR_xen_version(XENVER_capabilities, - &xenver[idx].xv_caps); - - cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major, - xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset); - - if (idx == XENVER_BOOT_IDX) - bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX], - sizeof (xenver[XENVER_BOOT_IDX])); -} - -typedef enum xen_hypervisor_check { - XEN_RUN_CHECK, - XEN_SUSPEND_CHECK -} xen_hypervisor_check_t; - -/* - * To run the hypervisor must be 3.0.4 or better. To suspend/resume - * we need 3.0.4 or better and if it is 3.0.4. then it must be provided - * by the Solaris xVM project. - * Checking can be disabled for testing purposes by setting the - * xen_suspend_debug variable. 
- */ -static int -xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) -{ - if (xen_suspend_debug == 1) - return (1); - if (XENVER_CURRENT(xv_major) < 3) - return (0); - if (XENVER_CURRENT(xv_major) > 3) - return (1); - if (XENVER_CURRENT(xv_minor) > 0) - return (1); - if (XENVER_CURRENT(xv_revision) < 4) - return (0); - if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 && - !XENVER_CURRENT(xv_is_xvm)) - return (0); - - return (1); -} - -/* - * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the - * workaround. - */ -static void -xen_pte_workaround(void) -{ -#if defined(__amd64) - extern int pt_kern; - - if (XENVER_CURRENT(xv_major) != 3) - return; - if (XENVER_CURRENT(xv_minor) > 1) - return; - if (XENVER_CURRENT(xv_minor) == 1 && - XENVER_CURRENT(xv_revision) > 1) - return; - if (XENVER_CURRENT(xv_is_xvm)) - return; - - pt_kern = PT_USER; -#endif -} - -void -xen_set_callback(void (*func)(void), uint_t type, uint_t flags) -{ - struct callback_register cb; - - bzero(&cb, sizeof (cb)); -#if defined(__amd64) - cb.address = (ulong_t)func; -#elif defined(__i386) - cb.address.cs = KCS_SEL; - cb.address.eip = (ulong_t)func; -#endif - cb.type = type; - cb.flags = flags; - - /* - * XXPV always ignore return value for NMI - */ - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && - type != CALLBACKTYPE_nmi) - panic("HYPERVISOR_callback_op failed"); -} - -void -xen_init_callbacks(void) -{ - /* - * register event (interrupt) handler. - */ - xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); - - /* - * failsafe handler. - */ - xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, - CALLBACKF_mask_events); - - /* - * NMI handler. - */ - xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); - - /* - * system call handler - * XXPV move to init_cpu_syscall? 
- */ -#if defined(__amd64) - xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, - CALLBACKF_mask_events); -#endif /* __amd64 */ -} - - -/* - * cmn_err() followed by a 1/4 second delay; this gives the - * logging service a chance to flush messages and helps avoid - * intermixing output from prom_printf(). - * XXPV: doesn't exactly help us on UP though. - */ -/*PRINTFLIKE2*/ -void -cpr_err(int ce, const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(ce, fmt, adx); - va_end(adx); - drv_usecwait(MICROSEC >> 2); -} - -void -xen_suspend_devices(void) -{ - int rc; - - SUSPEND_DEBUG("xen_suspend_devices\n"); - - if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) - panic("failed to suspend devices: %d", rc); -} - -void -xen_resume_devices(void) -{ - int rc; - - SUSPEND_DEBUG("xen_resume_devices\n"); - - if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) - panic("failed to resume devices: %d", rc); -} - -/* - * The list of mfn pages is out of date. Recompute it. - */ -static void -rebuild_mfn_list(void) -{ - int i = 0; - size_t sz; - size_t off; - pfn_t pfn; - - SUSPEND_DEBUG("rebuild_mfn_list\n"); - - sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; - - for (off = 0; off < sz; off += MMU_PAGESIZE) { - size_t j = mmu_btop(off); - if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { - pfn = hat_getpfnum(kas.a_hat, - (caddr_t)&mfn_list_pages[j]); - mfn_list_pages_page[i++] = pfn_to_mfn(pfn); - } - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); - mfn_list_pages[j] = pfn_to_mfn(pfn); - } - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list - = pfn_to_mfn(pfn); -} - -static void -suspend_cpus(void) -{ - int i; - - SUSPEND_DEBUG("suspend_cpus\n"); - - mp_enter_barrier(); - - for (i = 1; i < ncpus; i++) { - if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { - SUSPEND_DEBUG("xen_vcpu_down %d\n", i); - (void) xen_vcpu_down(i); - } - - 
mach_cpucontext_reset(cpu[i]); - } -} - -static void -resume_cpus(void) -{ - int i; - - for (i = 1; i < ncpus; i++) { - if (cpu[i] == NULL) - continue; - - if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { - SUSPEND_DEBUG("xen_vcpu_up %d\n", i); - mach_cpucontext_restore(cpu[i]); - (void) xen_vcpu_up(i); - } - } - - mp_leave_barrier(); -} - -/* - * Top level routine to direct suspend/resume of a domain. - */ -void -xen_suspend_domain(void) -{ - extern void rtcsync(void); - extern hrtime_t hres_last_tick; - mfn_t start_info_mfn; - ulong_t flags; - pfn_t pfn; - int i; - - /* - * Check that we are happy to suspend on this hypervisor. - */ - if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) { - cpr_err(CE_WARN, "Cannot suspend on this hypervisor " - "version: v%lu.%lu%s, need at least version v3.0.4 or " - "-xvm based hypervisor", XENVER_CURRENT(xv_major), - XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); - return; - } - - /* - * XXPV - Are we definitely OK to suspend by the time we've connected - * the handler? - */ - - cpr_err(CE_NOTE, "Domain suspending for save/migrate"); - - SUSPEND_DEBUG("xen_suspend_domain\n"); - - /* - * suspend interrupts and devices - * XXPV - we use suspend/resume for both save/restore domains (like sun - * cpr) and for migration. Would be nice to know the difference if - * possible. For save/restore where down time may be a long time, we - * may want to do more of the things that cpr does. (i.e. notify user - * processes, shrink memory footprint for faster restore, etc.) - */ - xen_suspend_devices(); - SUSPEND_DEBUG("xenbus_suspend\n"); - xenbus_suspend(); - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); - start_info_mfn = pfn_to_mfn(pfn); - - /* - * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe - * wrt xenbus being suspended here? - */ - mutex_enter(&cpu_lock); - - /* - * Suspend must be done on vcpu 0, as no context for other CPUs is - * saved. - * - * XXPV - add to taskq API ? 
- */ - thread_affinity_set(curthread, 0); - kpreempt_disable(); - - SUSPEND_DEBUG("xen_start_migrate\n"); - xen_start_migrate(); - if (ncpus > 1) - suspend_cpus(); - - /* - * We can grab the ec_lock as it's a spinlock with a high SPL. Hence - * any holder would have dropped it to get through suspend_cpus(). - */ - mutex_enter(&ec_lock); - - /* - * From here on in, we can't take locks. - */ - SUSPEND_DEBUG("ec_suspend\n"); - ec_suspend(); - SUSPEND_DEBUG("gnttab_suspend\n"); - gnttab_suspend(); - - flags = intr_clear(); - - xpv_time_suspend(); - - /* - * Currently, the hypervisor incorrectly fails to bring back - * powered-down VCPUs. Thus we need to record any powered-down VCPUs - * to prevent any attempts to operate on them. But we have to do this - * *after* the very first time we do ec_suspend(). - */ - for (i = 1; i < ncpus; i++) { - if (cpu[i] == NULL) - continue; - - if (cpu_get_state(cpu[i]) == P_POWEROFF) - CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); - } - - /* - * The dom0 save/migrate code doesn't automatically translate - * these into PFNs, but expects them to be, so we do it here. - * We don't use mfn_to_pfn() because so many OS services have - * been disabled at this point. - */ - xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; - xen_info->console.domU.mfn = - mfn_to_pfn_mapping[xen_info->console.domU.mfn]; - - if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { - prom_printf("xen_suspend_domain(): " - "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, - 0, UVMF_INVLPG)) { - prom_printf("xen_suspend_domain(): " - "HYPERVISOR_update_va_mapping() failed\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - SUSPEND_DEBUG("HYPERVISOR_suspend\n"); - - /* - * At this point we suspend and sometime later resume. 
- */ - if (HYPERVISOR_suspend(start_info_mfn)) { - prom_printf("xen_suspend_domain(): " - "HYPERVISOR_suspend() failed\n"); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - /* - * Point HYPERVISOR_shared_info to its new value. - */ - if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, - xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, - UVMF_INVLPG)) - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - - if (xen_info->nr_pages != mfn_count) { - prom_printf("xen_suspend_domain(): number of pages" - " changed, was 0x%lx, now 0x%lx\n", mfn_count, - xen_info->nr_pages); - (void) HYPERVISOR_shutdown(SHUTDOWN_crash); - } - - xpv_time_resume(); - - cached_max_mfn = 0; - - SUSPEND_DEBUG("gnttab_resume\n"); - gnttab_resume(); - - /* XXPV: add a note that this must be lockless. */ - SUSPEND_DEBUG("ec_resume\n"); - ec_resume(); - - intr_restore(flags); - - if (ncpus > 1) - resume_cpus(); - - mutex_exit(&ec_lock); - xen_end_migrate(); - mutex_exit(&cpu_lock); - - /* - * Now we can take locks again. - */ - - /* - * Force the tick value used for tv_nsec in hres_tick() to be up to - * date. rtcsync() will reset the hrestime value appropriately. - */ - hres_last_tick = xpv_gethrtime(); - - /* - * XXPV: we need to have resumed the CPUs since this takes locks, but - * can remote CPUs see bad state? Presumably yes. Should probably nest - * taking of todlock inside of cpu_lock, or vice versa, then provide an - * unlocked version. Probably need to call clkinitf to reset cpu freq - * and re-calibrate if we migrated to a different speed cpu. Also need - * to make a (re)init_cpu_info call to update processor info structs - * and device tree info. That remains to be written at the moment. 
- */ - rtcsync(); - - rebuild_mfn_list(); - - SUSPEND_DEBUG("xenbus_resume\n"); - xenbus_resume(); - SUSPEND_DEBUG("xenbus_resume_devices\n"); - xen_resume_devices(); - - thread_affinity_clear(curthread); - kpreempt_enable(); - - SUSPEND_DEBUG("finished xen_suspend_domain\n"); - - /* - * We have restarted our suspended domain, update the hypervisor - * details. NB: This must be done at the end of this function, - * since we need the domain to be completely resumed before - * these functions will work correctly. - */ - xen_set_version(XENVER_CURRENT_IDX); - - /* - * We can check and report a warning, but we don't stop the - * process. - */ - if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) - cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " - "but need at least version v3.0.4", - XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), - XENVER_CURRENT(xv_ver)); - - cmn_err(CE_NOTE, "domain restore/migrate completed"); -} - -/*ARGSUSED*/ -int -xen_debug_handler(void *arg) -{ - debug_enter("External debug event received"); - - /* - * If we've not got KMDB loaded, output some stuff difficult to capture - * from a domain core. 
- */ - if (!(boothowto & RB_DEBUG)) { - shared_info_t *si = HYPERVISOR_shared_info; - int i; - - prom_printf("evtchn_pending [ "); - for (i = 0; i < 8; i++) - prom_printf("%lx ", si->evtchn_pending[i]); - prom_printf("]\nevtchn_mask [ "); - for (i = 0; i < 8; i++) - prom_printf("%lx ", si->evtchn_mask[i]); - prom_printf("]\n"); - - for (i = 0; i < ncpus; i++) { - vcpu_info_t *vcpu = &si->vcpu_info[i]; - if (cpu[i] == NULL) - continue; - prom_printf("CPU%d pending %d mask %d sel %lx\n", - i, vcpu->evtchn_upcall_pending, - vcpu->evtchn_upcall_mask, - vcpu->evtchn_pending_sel); - } - } - - return (0); -} - -/*ARGSUSED*/ -static void -xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - xenbus_transaction_t xbt; - char key = '\0'; - int ret; - -retry: - if (xenbus_transaction_start(&xbt)) { - cmn_err(CE_WARN, "failed to start sysrq transaction"); - return; - } - - if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { - /* - * ENOENT happens in response to our own xenbus_rm. - * XXPV - this happens spuriously on boot? - */ - if (ret != ENOENT) - cmn_err(CE_WARN, "failed to read sysrq: %d", ret); - goto out; - } - - if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { - cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); - goto out; - } - - if (xenbus_transaction_end(xbt, 0) == EAGAIN) - goto retry; - - /* - * Somewhat arbitrary - on Linux this means 'reboot'. We could just - * accept any key, but this might increase the risk of sending a - * harmless sysrq to the wrong domain... 
- */ - if (key == 'b') - (void) xen_debug_handler(NULL); - else - cmn_err(CE_WARN, "Ignored sysrq %c", key); - return; - -out: - (void) xenbus_transaction_end(xbt, 1); -} - -taskq_t *xen_shutdown_tq; - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_REBOOT 1 -#define SHUTDOWN_SUSPEND 2 -#define SHUTDOWN_HALT 3 -#define SHUTDOWN_MAX 4 - -#define SHUTDOWN_TIMEOUT_SECS (60 * 5) - -static const char *cmd_strings[SHUTDOWN_MAX] = { - "poweroff", - "reboot", - "suspend", - "halt" -}; - -static void -xen_dirty_shutdown(void *arg) -{ - int cmd = (uintptr_t)arg; - - cmn_err(CE_WARN, "Externally requested shutdown failed or " - "timed out.\nShutting down.\n"); - - switch (cmd) { - case SHUTDOWN_HALT: - case SHUTDOWN_POWEROFF: - (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); - break; - case SHUTDOWN_REBOOT: - (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); - break; - } -} - -static void -xen_shutdown(void *arg) -{ - int cmd = (uintptr_t)arg; - proc_t *initpp; - - ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); - - if (cmd == SHUTDOWN_SUSPEND) { - xen_suspend_domain(); - return; - } - - switch (cmd) { - case SHUTDOWN_POWEROFF: - force_shutdown_method = AD_POWEROFF; - break; - case SHUTDOWN_HALT: - force_shutdown_method = AD_HALT; - break; - case SHUTDOWN_REBOOT: - force_shutdown_method = AD_BOOT; - break; - } - - /* - * If we're still booting and init(1) isn't set up yet, simply halt. 
- */ - mutex_enter(&pidlock); - initpp = prfind(P_INITPID); - mutex_exit(&pidlock); - if (initpp == NULL) { - extern void halt(char *); - halt("Power off the System"); /* just in case */ - } - - /* - * else, graceful shutdown with inittab and all getting involved - */ - psignal(initpp, SIGPWR); - - (void) timeout(xen_dirty_shutdown, arg, - SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); -} - -/*ARGSUSED*/ -static void -xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char *str; - xenbus_transaction_t xbt; - int err, shutdown_code = SHUTDOWN_INVALID; - unsigned int slen; - -again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { - (void) xenbus_transaction_end(xbt, 1); - return; - } - - SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); - - /* - * If this is a watch fired from our write below, check out early to - * avoid an infinite loop. - */ - if (strcmp(str, "") == 0) { - (void) xenbus_transaction_end(xbt, 0); - kmem_free(str, slen); - return; - } else if (strcmp(str, "poweroff") == 0) { - shutdown_code = SHUTDOWN_POWEROFF; - } else if (strcmp(str, "reboot") == 0) { - shutdown_code = SHUTDOWN_REBOOT; - } else if (strcmp(str, "suspend") == 0) { - shutdown_code = SHUTDOWN_SUSPEND; - } else if (strcmp(str, "halt") == 0) { - shutdown_code = SHUTDOWN_HALT; - } else { - printf("Ignoring shutdown request: %s\n", str); - } - - /* - * XXPV Should we check the value of xenbus_write() too, or are all - * errors automatically folded into xenbus_transaction_end() ?? 
- */ - (void) xenbus_write(xbt, "control", "shutdown", ""); - err = xenbus_transaction_end(xbt, 0); - if (err == EAGAIN) { - SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); - kmem_free(str, slen); - goto again; - } - - kmem_free(str, slen); - if (shutdown_code != SHUTDOWN_INVALID) { - (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, - (void *)(intptr_t)shutdown_code, 0); - } -} - -static struct xenbus_watch shutdown_watch; -static struct xenbus_watch sysrq_watch; - -void -xen_late_startup(void) -{ - if (!DOMAIN_IS_INITDOMAIN(xen_info)) { - xen_shutdown_tq = taskq_create("shutdown_taskq", 1, - maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); - shutdown_watch.node = "control/shutdown"; - shutdown_watch.callback = xen_shutdown_handler; - if (register_xenbus_watch(&shutdown_watch)) - cmn_err(CE_WARN, "Failed to set shutdown watcher"); - - sysrq_watch.node = "control/sysrq"; - sysrq_watch.callback = xen_sysrq_handler; - if (register_xenbus_watch(&sysrq_watch)) - cmn_err(CE_WARN, "Failed to set sysrq watcher"); - } - balloon_init(xen_info->nr_pages); -} - -#ifdef DEBUG -#define XEN_PRINTF_BUFSIZE 1024 - -char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; - -/* - * Printf function that calls hypervisor directly. For DomU it only - * works when running on a xen hypervisor built with debug on. Works - * always since no I/O ring interaction is needed. - */ -/*PRINTFLIKE1*/ -void -xen_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); - va_end(ap); - - (void) HYPERVISOR_console_io(CONSOLEIO_write, - strlen(xen_printf_buffer), xen_printf_buffer); -} -#else -void -xen_printf(const char *fmt, ...) 
-{ -} -#endif /* DEBUG */ - -void -startup_xen_version(void) -{ - xen_set_version(XENVER_BOOT_IDX); - if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) - cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " - "but need at least version v3.0.4", - XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), - XENVER_CURRENT(xv_ver)); - xen_pte_workaround(); -} - -int xen_mca_simulate_mc_physinfo_failure = 0; - -void -startup_xen_mca(void) -{ - if (!DOMAIN_IS_INITDOMAIN(xen_info)) - return; - - xen_phys_ncpus = 0; - xen_phys_cpus = NULL; - - if (xen_mca_simulate_mc_physinfo_failure || - xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) { - cmn_err(CE_WARN, - "%sxen_get_mc_physinfo failure during xen MCA startup: " - "there will be no machine check support", - xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : ""); - return; - } - - xen_phys_cpus = kmem_alloc(xen_phys_ncpus * - sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP); - - if (xen_phys_cpus == NULL) { - cmn_err(CE_WARN, - "xen_get_mc_physinfo failure: can't allocate CPU array"); - return; - } - - if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) { - cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no " - "physical CPU info"); - kmem_free(xen_phys_cpus, - xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t)); - xen_phys_ncpus = 0; - xen_phys_cpus = NULL; - } - - if (xen_physinfo_debug) { - xen_mc_logical_cpu_t *xcp; - unsigned i; - - cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n", - xen_phys_ncpus); - for (i = 0; i < xen_phys_ncpus; i++) { - xcp = &xen_phys_cpus[i]; - cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u", - xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid, - xcp->mc_threadid, xcp->mc_apicid); - } - } -} - -/* - * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 
- */ - -void -xen_set_gdt(ulong_t *frame_list, int entries) -{ - int err; - if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { - /* - * X_EINVAL: reserved entry or bad frames - * X_EFAULT: bad address - */ - panic("xen_set_gdt(%p, %d): error %d", - (void *)frame_list, entries, -(int)err); - } -} - -void -xen_set_ldt(user_desc_t *ldt, uint_t nsels) -{ - struct mmuext_op op; - long err; - - op.cmd = MMUEXT_SET_LDT; - op.arg1.linear_addr = (uintptr_t)ldt; - op.arg2.nr_ents = nsels; - - if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { - panic("xen_set_ldt(%p, %d): error %d", - (void *)ldt, nsels, -(int)err); - } -} - -void -xen_stack_switch(ulong_t ss, ulong_t esp) -{ - long err; - - if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { - /* - * X_EPERM: bad selector - */ - panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, - -(int)err); - } -} - -long -xen_set_trap_table(trap_info_t *table) -{ - long err; - - if ((err = HYPERVISOR_set_trap_table(table)) != 0) { - /* - * X_EFAULT: bad address - * X_EPERM: bad selector - */ - panic("xen_set_trap_table(%p): error %d", (void *)table, - -(int)err); - } - return (err); -} - -#if defined(__amd64) -void -xen_set_segment_base(int reg, ulong_t value) -{ - long err; - - if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { - /* - * X_EFAULT: bad address - * X_EINVAL: bad type - */ - panic("xen_set_segment_base(%d, %lx): error %d", - reg, value, -(int)err); - } -} -#endif /* __amd64 */ - -/* - * Translate a hypervisor errcode to a Solaris error code. 
- */ -int -xen_xlate_errcode(int error) -{ - switch (-error) { - - /* - * Translate hypervisor errno's into native errno's - */ - -#define CASE(num) case X_##num: error = num; break - - CASE(EPERM); CASE(ENOENT); CASE(ESRCH); - CASE(EINTR); CASE(EIO); CASE(ENXIO); - CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); - CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); - CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); - CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); - CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); - CASE(ENODATA); CASE(EAGAIN); - -#undef CASE - - default: - panic("xen_xlate_errcode: unknown error %d", error); - } - - return (error); -} - -/* - * Raise PS_IOPL on current vcpu to user level. - * Caller responsible for preventing kernel preemption. - */ -void -xen_enable_user_iopl(void) -{ - physdev_set_iopl_t set_iopl; - set_iopl.iopl = 3; /* user ring 3 */ - (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -/* - * Drop PS_IOPL on current vcpu to kernel level - */ -void -xen_disable_user_iopl(void) -{ - physdev_set_iopl_t set_iopl; - set_iopl.iopl = 1; /* kernel pseudo ring 1 */ - (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -int -xen_gdt_setprot(cpu_t *cp, uint_t prot) -{ - int err; -#if defined(__amd64) - int pt_bits = PT_VALID; - if (prot & PROT_WRITE) - pt_bits |= PT_WRITABLE; -#endif - - if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt, - MMU_PAGESIZE, prot)) != 0) - goto done; - -#if defined(__amd64) - err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits); -#endif - -done: - if (err) { - cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d", - cp->cpu_id, (prot & PROT_WRITE) ? 
"writable" : "read-only", - err); - } - - return (err); -} - -int -xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) -{ - int err; - caddr_t lva = (caddr_t)ldt; -#if defined(__amd64) - int pt_bits = PT_VALID; - pgcnt_t npgs; - if (prot & PROT_WRITE) - pt_bits |= PT_WRITABLE; -#endif /* __amd64 */ - - if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) - goto done; - -#if defined(__amd64) - - ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); - npgs = mmu_btop(lsize); - while (npgs--) { - if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), - pt_bits)) != 0) - break; - lva += PAGESIZE; - } -#endif /* __amd64 */ - -done: - if (err) { - cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", - (void *)lva, - (prot & PROT_WRITE) ? "writable" : "read-only", err); - } - - return (err); -} - -int -xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus) -{ - struct xen_mc_physcpuinfo cpi; - - cpi.ncpus = *ncpus; - /*LINTED: constant in conditional context*/ - set_xen_guest_handle(cpi.info, log_cpus); - - if (HYPERVISOR_mca(XEN_MC_physcpuinfo, (xen_mc_arg_t *)&cpi) != 0) - return (-1); - - *ncpus = cpi.ncpus; - return (0); -} - -void -print_panic(const char *str) -{ - xen_printf(str); -} - -/* - * Interfaces to iterate over real cpu information, but only that info - * which we choose to expose here. These are of interest to dom0 - * only (and the backing hypercall should not work for domu). 
- */ - -xen_mc_lcpu_cookie_t -xen_physcpu_next(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie; - - if (!DOMAIN_IS_INITDOMAIN(xen_info)) - return (NULL); - - if (cookie == NULL) - return ((xen_mc_lcpu_cookie_t)xen_phys_cpus); - - if (xcp == xen_phys_cpus + xen_phys_ncpus - 1) - return (NULL); - else - return ((xen_mc_lcpu_cookie_t)++xcp); -} - -#define COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c)) - -const char * -xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); - - return ((const char *)&xcp->mc_vendorid[0]); -} - -int -xen_physcpu_family(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_family); -} - -int -xen_physcpu_model(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_model); -} - -int -xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_step); -} - -id_t -xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_chipid); -} - -id_t -xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_coreid); -} - -id_t -xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_threadid); -} - -id_t -xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_cpunr); -} - -boolean_t -xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie) -{ - return (COOKIE2XCP(cookie)->mc_nthreads > 1); -} - -uint64_t -xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie) -{ - xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); - - /* - * Need to #define the indices, or search through the array. 
- */ - return (xcp->mc_msrvalues[0].value); -} - -int -xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count, - boolean_t uvaddr) -{ - long rc; - uint_t i; - - ASSERT(cmd == GNTTABOP_map_grant_ref); - -#if !defined(_BOOT) - if (uvaddr == B_FALSE) { - for (i = 0; i < count; ++i) { - mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0); - } - } -#endif - - rc = HYPERVISOR_grant_table_op(cmd, mapop, count); - - return (rc); -} - -static int -xpv_get_physinfo(xen_sysctl_physinfo_t *pi) -{ - xen_sysctl_t op; - struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node; - int ret; - - bzero(&op, sizeof (op)); - op.cmd = XEN_SYSCTL_physinfo; - op.interface_version = XEN_SYSCTL_INTERFACE_VERSION; - /*LINTED: constant in conditional context*/ - set_xen_guest_handle(*sp, NULL); - - ret = HYPERVISOR_sysctl(&op); - - if (ret != 0) - return (xen_xlate_errcode(ret)); - - bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo)); - return (0); -} - -/* - * On dom0, we can determine the number of physical cpus on the machine. - * This number is important when figuring out what workarounds are - * appropriate, so compute it now. 
- */ -uint_t -xpv_nr_phys_cpus(void) -{ - static uint_t nphyscpus = 0; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if (nphyscpus == 0) { - xen_sysctl_physinfo_t pi; - int ret; - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - nphyscpus = pi.nr_cpus; - } - return (nphyscpus); -} - -pgcnt_t -xpv_nr_phys_pages(void) -{ - xen_sysctl_physinfo_t pi; - int ret; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - - return ((pgcnt_t)pi.total_pages); -} - -uint64_t -xpv_cpu_khz(void) -{ - xen_sysctl_physinfo_t pi; - int ret; - - ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); - - if ((ret = xpv_get_physinfo(&pi)) != 0) - panic("xpv_get_physinfo() failed: %d\n", ret); - return ((uint64_t)pi.cpu_khz); -} --- /dev/null Mon Apr 6 14:23:42 2009 +++ new/usr/src/uts/i86xpv/os/xpv_suspend.c Mon Apr 6 14:23:41 2009 @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* derived from netbsd's xen_machdep.c 1.1.2.1 */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. This section intentionally left blank. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + * Section 3 of the above license was updated in response to bug 6379571. + */ + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#ifdef DEBUG +#define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf /* NOTE(review): expands to a bare if, so an else written after a SUSPEND_DEBUG() call would pair with this if; keep uses as standalone statements */ +#else +#define SUSPEND_DEBUG(...) 
+#endif + +int cpr_debug; +cpuset_t cpu_suspend_lost_set; +static int xen_suspend_debug; + +/* + * cmn_err() followed by a 1/4 second delay; this gives the + * logging service a chance to flush messages and helps avoid + * intermixing output from prom_printf(). + * XXPV: doesn't exactly help us on UP though. + */ +/*PRINTFLIKE2*/ +void +cpr_err(int ce, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(ce, fmt, adx); + va_end(adx); + drv_usecwait(MICROSEC >> 2); +} + +void +xen_suspend_devices(void) +{ + int rc; + + SUSPEND_DEBUG("xen_suspend_devices\n"); + + if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) + panic("failed to suspend devices: %d", rc); +} + +void +xen_resume_devices(void) +{ + int rc; + + SUSPEND_DEBUG("xen_resume_devices\n"); + + if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) + panic("failed to resume devices: %d", rc); +} + +/* + * The list of mfn pages is out of date. Recompute it. + */ +static void +rebuild_mfn_list(void) +{ + int i = 0; + size_t sz; + size_t off; + pfn_t pfn; + + SUSPEND_DEBUG("rebuild_mfn_list\n"); + + sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; + + for (off = 0; off < sz; off += MMU_PAGESIZE) { + size_t j = mmu_btop(off); + if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { + pfn = hat_getpfnum(kas.a_hat, + (caddr_t)&mfn_list_pages[j]); + mfn_list_pages_page[i++] = pfn_to_mfn(pfn); + } + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); + mfn_list_pages[j] = pfn_to_mfn(pfn); + } + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list + = pfn_to_mfn(pfn); +} + +static void +suspend_cpus(void) +{ + int i; + + SUSPEND_DEBUG("suspend_cpus\n"); + + mp_enter_barrier(); + + for (i = 1; i < ncpus; i++) { + if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { + SUSPEND_DEBUG("xen_vcpu_down %d\n", i); + (void) xen_vcpu_down(i); + } + + mach_cpucontext_reset(cpu[i]); + } +} + +static void +resume_cpus(void) 
+{ + int i; + + for (i = 1; i < ncpus; i++) { + if (cpu[i] == NULL) + continue; + + if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { + SUSPEND_DEBUG("xen_vcpu_up %d\n", i); + mach_cpucontext_restore(cpu[i]); + (void) xen_vcpu_up(i); + } + } + + mp_leave_barrier(); +} + +/* + * Top level routine to direct suspend/resume of a domain. + */ +void +xen_suspend_domain(void) +{ + extern void rtcsync(void); + extern hrtime_t hres_last_tick; + mfn_t start_info_mfn; + ulong_t flags; + pfn_t pfn; + int i; + + if (!xen_hypervisor_supports_suspend()) + return; + + /* + * XXPV - Are we definitely OK to suspend by the time we've connected + * the handler? + */ + + cpr_err(CE_NOTE, "Domain suspending for save/migrate"); + + SUSPEND_DEBUG("xen_suspend_domain\n"); + + /* + * suspend interrupts and devices + * XXPV - we use suspend/resume for both save/restore domains (like sun + * cpr) and for migration. Would be nice to know the difference if + * possible. For save/restore where down time may be a long time, we + * may want to do more of the things that cpr does. (i.e. notify user + * processes, shrink memory footprint for faster restore, etc.) + */ + xen_suspend_devices(); + SUSPEND_DEBUG("xenbus_suspend\n"); + xenbus_suspend(); + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); + start_info_mfn = pfn_to_mfn(pfn); + + /* + * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe + * wrt xenbus being suspended here? + */ + mutex_enter(&cpu_lock); + + /* + * Suspend must be done on vcpu 0, as no context for other CPUs is + * saved. + * + * XXPV - add to taskq API ? + */ + thread_affinity_set(curthread, 0); + kpreempt_disable(); + + SUSPEND_DEBUG("xen_start_migrate\n"); + xen_start_migrate(); + if (ncpus > 1) + suspend_cpus(); + + /* + * We can grab the ec_lock as it's a spinlock with a high SPL. Hence + * any holder would have dropped it to get through suspend_cpus(). + */ + mutex_enter(&ec_lock); + + /* + * From here on in, we can't take locks. 
+ */ + SUSPEND_DEBUG("ec_suspend\n"); + ec_suspend(); + SUSPEND_DEBUG("gnttab_suspend\n"); + gnttab_suspend(); + + flags = intr_clear(); + + xpv_time_suspend(); + + /* + * Currently, the hypervisor incorrectly fails to bring back + * powered-down VCPUs. Thus we need to record any powered-down VCPUs + * to prevent any attempts to operate on them. But we have to do this + * *after* the very first time we do ec_suspend(). + */ + for (i = 1; i < ncpus; i++) { + if (cpu[i] == NULL) + continue; + + if (cpu_get_state(cpu[i]) == P_POWEROFF) + CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); + } + + /* + * The dom0 save/migrate code doesn't automatically translate + * these into PFNs, but expects them to be, so we do it here. + * We don't use mfn_to_pfn() because so many OS services have + * been disabled at this point. + */ + xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; + xen_info->console.domU.mfn = + mfn_to_pfn_mapping[xen_info->console.domU.mfn]; + + if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { + prom_printf("xen_suspend_domain(): " + "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + } + + if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, + 0, UVMF_INVLPG)) { + prom_printf("xen_suspend_domain(): " + "HYPERVISOR_update_va_mapping() failed\n"); + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + } + + SUSPEND_DEBUG("HYPERVISOR_suspend\n"); + + /* + * At this point we suspend and sometime later resume. + */ + if (HYPERVISOR_suspend(start_info_mfn)) { + prom_printf("xen_suspend_domain(): " + "HYPERVISOR_suspend() failed\n"); + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + } + + /* + * Point HYPERVISOR_shared_info to its new value. 
+ */ + if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, + xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, + UVMF_INVLPG)) + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + + if (xen_info->nr_pages != mfn_count) { + prom_printf("xen_suspend_domain(): number of pages" + " changed, was 0x%lx, now 0x%lx\n", mfn_count, + xen_info->nr_pages); + (void) HYPERVISOR_shutdown(SHUTDOWN_crash); + } + + xpv_time_resume(); + + cached_max_mfn = 0; + + SUSPEND_DEBUG("gnttab_resume\n"); + gnttab_resume(); + + /* XXPV: add a note that this must be lockless. */ + SUSPEND_DEBUG("ec_resume\n"); + ec_resume(); + + intr_restore(flags); + + if (ncpus > 1) + resume_cpus(); + + mutex_exit(&ec_lock); + xen_end_migrate(); + mutex_exit(&cpu_lock); + + /* + * Now we can take locks again. + */ + + /* + * Force the tick value used for tv_nsec in hres_tick() to be up to + * date. rtcsync() will reset the hrestime value appropriately. + */ + hres_last_tick = xpv_gethrtime(); + + /* + * XXPV: we need to have resumed the CPUs since this takes locks, but + * can remote CPUs see bad state? Presumably yes. Should probably nest + * taking of todlock inside of cpu_lock, or vice versa, then provide an + * unlocked version. Probably need to call clkinitf to reset cpu freq + * and re-calibrate if we migrated to a different speed cpu. Also need + * to make a (re)init_cpu_info call to update processor info structs + * and device tree info. That remains to be written at the moment. + */ + rtcsync(); + + rebuild_mfn_list(); + + SUSPEND_DEBUG("xenbus_resume\n"); + xenbus_resume(); + SUSPEND_DEBUG("xenbus_resume_devices\n"); + xen_resume_devices(); + + thread_affinity_clear(curthread); + kpreempt_enable(); + + SUSPEND_DEBUG("finished xen_suspend_domain\n"); + + /* + * We have restarted our suspended domain, update the hypervisor + * details. 
NB: This must be done at the end of this function, + * since we need the domain to be completely resumed before + * this function will work correctly. + */ + xen_reset_version(); + + cmn_err(CE_NOTE, "domain restore/migrate completed"); +} --- old/usr/src/uts/i86xpv/os/xpv_timestamp.c Mon Apr 6 14:23:44 2009 +++ new/usr/src/uts/i86xpv/os/xpv_timestamp.c Mon Apr 6 14:23:43 2009 @@ -68,8 +68,8 @@ * a hypervisor interface, as hrtime_addend may well be non-zero. */ +int hrtime_fake_mt = 0; static volatile hrtime_t hrtime_last; -static int hrtime_fake_mt = 1; static hrtime_t hrtime_suspend_time; static hrtime_t hrtime_addend; --- old/usr/src/uts/i86xpv/sys/Makefile Mon Apr 6 14:23:46 2009 +++ new/usr/src/uts/i86xpv/sys/Makefile Mon Apr 6 14:23:45 2009 @@ -20,11 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# UTSBASE = ../.. # @@ -40,7 +38,8 @@ HDRS= \ balloon.h \ machprivregs.h \ - xen_mmu.h \ + xen_mmu.h \ + xpv_platdep.h \ xpv_impl.h ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%) --- old/usr/src/uts/i86xpv/sys/machprivregs.h Mon Apr 6 14:23:47 2009 +++ new/usr/src/uts/i86xpv/sys/machprivregs.h Mon Apr 6 14:23:46 2009 @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -27,8 +27,6 @@ #ifndef _SYS_MACHPRIVREGS_H #define _SYS_MACHPRIVREGS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include /* @@ -375,7 +373,7 @@ #define IRET HYPERVISOR_IRET(0) #define SYSRETQ HYPERVISOR_IRET(VGCF_IN_SYSCALL) -#define SYSRETL ud2 /* 32-bit syscall/sysret not supported */ +#define SYSRETL HYPERVISOR_IRET(VGCF_IN_SYSCALL) #define SWAPGS /* empty - handled in hypervisor */ #elif defined(__i386) --- /dev/null Mon Apr 6 14:23:49 2009 +++ new/usr/src/uts/i86xpv/sys/xpv_platdep.h Mon Apr 6 14:23:48 2009 @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_XPV_PLATDEP_H +#define _SYS_XPV_PLATDEP_H + +/* + * Stuff specific to the i86xpv platform. 
+ */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct cpuid_regs; + +extern int xen_hypervisor_supports_suspend(void); +extern void xen_reset_version(void); + +extern void plat_mask_cpuid(uint_t, uint32_t, struct cpuid_regs *); + +#define discover_virt_type() /* we know we're XPV */ + +/* for mwait (which we don't support) */ +#define tlb_going_idle() +#define tlb_service() +#define i86_monitor(addr, ext, hints) +#define i86_mwait(data, ext) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_XPV_PLATDEP_H */ --- old/usr/src/uts/intel/ia32/os/desctbls.c Mon Apr 6 14:23:51 2009 +++ new/usr/src/uts/intel/ia32/os/desctbls.c Mon Apr 6 14:23:50 2009 @@ -80,6 +80,7 @@ #include #include #include +#include #ifdef __xpv #include @@ -145,20 +146,10 @@ }; /* - * Structure containing pre-computed descriptors to allow us to temporarily - * interpose on a standard handler. - */ -struct interposing_handler { - int ih_inum; - gate_desc_t ih_interp_desc; - gate_desc_t ih_default_desc; -}; - -/* * The brand infrastructure interposes on two handlers, and we use one as a * NULL signpost. */ -static struct interposing_handler brand_tbl[3]; +struct interposing_handler brand_tbl[3]; /* * software prototypes for default local descriptor table @@ -1309,94 +1300,4 @@ set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES, SDP_OP32); #endif /* __i386 */ -} - -/* - * Enable interpositioning on the system call path by rewriting the - * sys{call|enter} MSRs and the syscall-related entries in the IDT to use - * the branded entry points. 
- */ -void -brand_interpositioning_enable(void) -{ - gate_desc_t *idt = CPU->cpu_idt; - int i; - - ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); - - for (i = 0; brand_tbl[i].ih_inum; i++) { - idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc; -#if defined(__xpv) - xen_idt_write(&idt[brand_tbl[i].ih_inum], - brand_tbl[i].ih_inum); -#endif - } - -#if defined(__amd64) -#if defined(__xpv) - - /* - * Currently the hypervisor only supports 64-bit syscalls via - * syscall instruction. The 32-bit syscalls are handled by - * interrupt gate above. - */ - xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall, - CALLBACKF_mask_events); - -#else - - if (x86_feature & X86_ASYSC) { - wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32); - } - -#endif -#endif /* __amd64 */ - - if (x86_feature & X86_SEP) - wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter); -} - -/* - * Disable interpositioning on the system call path by rewriting the - * sys{call|enter} MSRs and the syscall-related entries in the IDT to use - * the standard entry points, which bypass the interpositioning hooks. - */ -void -brand_interpositioning_disable(void) -{ - gate_desc_t *idt = CPU->cpu_idt; - int i; - - ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); - - for (i = 0; brand_tbl[i].ih_inum; i++) { - idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc; -#if defined(__xpv) - xen_idt_write(&idt[brand_tbl[i].ih_inum], - brand_tbl[i].ih_inum); -#endif - } - -#if defined(__amd64) -#if defined(__xpv) - - /* - * See comment above in brand_interpositioning_enable. 
- */ - xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, - CALLBACKF_mask_events); - -#else - - if (x86_feature & X86_ASYSC) { - wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32); - } - -#endif -#endif /* __amd64 */ - - if (x86_feature & X86_SEP) - wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter); } --- old/usr/src/uts/intel/ia32/os/sysi86.c Mon Apr 6 14:23:53 2009 +++ new/usr/src/uts/intel/ia32/os/sysi86.c Mon Apr 6 14:23:52 2009 @@ -62,6 +62,7 @@ #include #include #include +#include #include #if defined(__xpv) #include @@ -398,7 +399,7 @@ #endif ldt_unload(); - cpu_fast_syscall_enable(NULL); + cpu_fast_syscall_enable(); } static void @@ -408,7 +409,7 @@ ASSERT(p == curproc); ldt_load(); - cpu_fast_syscall_disable(NULL); + cpu_fast_syscall_disable(); } /* @@ -423,7 +424,7 @@ if (isexec) { kpreempt_disable(); - cpu_fast_syscall_enable(NULL); + cpu_fast_syscall_enable(); kpreempt_enable(); } @@ -527,7 +528,7 @@ */ kpreempt_disable(); ldt_installctx(pp, NULL); - cpu_fast_syscall_disable(NULL); + cpu_fast_syscall_disable(); ASSERT(curthread->t_post_sys != 0); kpreempt_enable(); --- old/usr/src/uts/intel/sys/archsystm.h Mon Apr 6 14:23:55 2009 +++ new/usr/src/uts/intel/sys/archsystm.h Mon Apr 6 14:23:53 2009 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -181,7 +181,6 @@ #if defined(__xpv) extern void xen_init_callbacks(void); -extern void xen_set_callback(void (*)(void), uint_t, uint_t); extern void xen_printf(const char *, ...); #define cpr_dprintf xen_printf extern int xpv_panicking; --- old/usr/src/uts/intel/sys/cpu.h Mon Apr 6 14:23:56 2009 +++ new/usr/src/uts/intel/sys/cpu.h Mon Apr 6 14:23:55 2009 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -26,8 +26,6 @@ #ifndef _SYS_CPU_H #define _SYS_CPU_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * WARNING: * This header file is Obsolete and may be deleted in a @@ -58,9 +56,6 @@ extern void cli(void); extern void sti(void); extern void i86_halt(void); -extern void i86_monitor(volatile uint32_t *addr, uint32_t extensions, - uint32_t hints); -extern void i86_mwait(uint32_t data, uint32_t extensions); /* * Used to insert cpu-dependent instructions into spin loops --- old/usr/src/uts/intel/sys/machbrand.h Mon Apr 6 14:23:58 2009 +++ new/usr/src/uts/intel/sys/machbrand.h Mon Apr 6 14:23:57 2009 @@ -45,6 +45,14 @@ greg_t (*b_fixsegreg)(greg_t, model_t); }; +struct interposing_handler { + int ih_inum; + gate_desc_t ih_interp_desc; + gate_desc_t ih_default_desc; +}; + +extern struct interposing_handler brand_tbl[3]; + #endif /* _ASM */ #define BRAND_CB_SYSENTER 0 --- old/usr/src/uts/intel/sys/x86_archext.h Mon Apr 6 14:24:00 2009 +++ new/usr/src/uts/intel/sys/x86_archext.h Mon Apr 6 14:23:59 2009 @@ -568,9 +568,6 @@ extern void mtrr_sync(void); -extern void cpu_fast_syscall_enable(void *); -extern void cpu_fast_syscall_disable(void *); - struct cpu; extern int cpuid_checkpass(struct cpu *, int); @@ -616,11 +613,9 @@ extern void cpuid_get_addrsize(struct cpu *, uint_t *, uint_t *); extern uint_t cpuid_get_dtlb_nent(struct cpu *, size_t); -#if !defined(__xpv) extern uint32_t *cpuid_mwait_alloc(struct cpu *); extern void cpuid_mwait_free(struct cpu *); extern int cpuid_deep_cstates_supported(void); -#endif struct cpu_ucode_info;