Cx MWAIT
Konstantin Belousov
kostikbel at gmail.com
Wed Apr 22 12:55:34 UTC 2015
Below is the patch to start using mwait instead of the 'legacy' port read
to enter the higher Cx states when idle. This is Intel's recommended
way of entering Cx, using hints provided by the vendor-specific
Fixed Function Hardware (FFH) GAS encoding; see the "Intel(R) Processor
Vendor-Specific ACPI Interface Specification", revision 007. The patch
was written after I became interested in why my Haswell desktop test box
does not report any C-states besides C1. That turned out to be due to a
combination of a BIOS misconfiguration and the FreeBSD code lacking
mwait support.
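For reference, the Intel FFH GAS overloads the generic register
descriptor fields: BitWidth carries the vendor code, BitOffset the class
(C1 I/O redirection vs. native mwait), Address the mwait hint, and
AccessWidth the coordination/BM-avoidance flags. A minimal decode sketch
(the helper name is hypothetical; acpi_PkgFFH_IntelCpu() in the patch
does the actual extraction):

	/* Sketch: pull the Intel-specific fields out of an FFH GAS. */
	static int
	intel_ffh_decode(const ACPI_GENERIC_ADDRESS *gas, int *vendor,
	    int *class, uint32_t *hint, int *flags)
	{

		if (gas->SpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE)
			return (EINVAL);
		*vendor = gas->BitWidth;	/* 1 == Intel */
		*class = gas->BitOffset;	/* 1 == C1 I/O, 2 == mwait */
		*hint = gas->Address & 0xffffffff; /* EAX hint for mwait */
		*flags = gas->AccessWidth;	/* HW coord., BM avoidance */
		return (0);
	}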
An enhanced C1 entry sequence, "I/O then halt", which coordinates C1
entry with the PCH, is also supported. Uses of the "sti; hlt" sequence
were consolidated into calls to acpi_cpu_c1().
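In the idle path, "I/O then halt" is literal: the C1 branch of
acpi_cpu_idle() in the patch below first reads the _CST-supplied trigger
port, if one was allocated, and only then enters C1. Condensed:

	/* C1 entry: optional chipset I/O read, then mwait or sti;hlt. */
	if (cx_next->p_lvlx != NULL)
		CPU_GET_REG(cx_next->p_lvlx, 1); /* handshake with the PCH */
	if (cx_next->do_mwait)
		acpi_cpu_idle_mwait(cx_next->mwait_hint);
	else
		acpi_cpu_c1();			/* sti; hlt */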
Intel hardware automatically handles the per-core and per-package states
aggregated from the thread-local C-states; this is known as
"hardware-coordinated" C-state entry. It is theoretically possible that
the OS must handle software-coordinated package C-state entry, but I am
not aware of real processors which need this mode. Intel is
hardware-coordinated, and it seems that AMD does not advertise an mwait
sequence for C-states at all.
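For context, the hint delivered through the FFH Address field is the EAX
argument to mwait: bits 3:0 select the sub C-state and bits 7:4 the
target C-state minus one (0x00 is C1, 0x10 is C2, 0x20 is C3). A
hypothetical helper, assuming that documented layout:

	/* Sketch: compose an mwait EAX hint for a C-state/sub-state. */
	static inline uint32_t
	acpi_mwait_hint(int cstate, int substate)
	{

		return ((uint32_t)((((cstate - 1) & 0xf) << 4) |
		    (substate & 0xf)));
	}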
I know that BIOS _CST tables are believed to be buggy. In particular,
for Linux, Intel wrote a driver which has hard-coded per-model tables
encoding the supported C-states, their latencies, and their
cache/busmastering behaviour. I agree with avg that we cannot support
this approach.
I tried to keep dev/acpica/acpi_cpu.c as MI as possible. At least,
all mwait-specific code is placed under #ifdef x86. The
acpi_PkgFFH_IntelCpu() helper that parses the Intel FFH GAS is MI, but
only usable on x86; I believe this is fine. Note that ACPI is currently
only used on x86: we lost ia64, but it might be used on arm shortly.
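As a usage sketch, the x86 consumer tries the FFH parse first and falls
back to acpi_PkgGas() for a conventional register; this is the shape of
the C2/C3 handling in the patch below, with the #ifdef wrapping omitted:

	int vendor, class, accsize;
	uint64_t address;

	if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address,
	    &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL &&
	    class == CST_FFH_INTEL_CL_MWAIT)
		acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize);
	else
		acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type,
		    &cx_ptr->res_rid, &cx_ptr->p_lvlx, RF_SHAREABLE);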
diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c
index 049b51bb4e..8f88a00 100644
--- a/sys/amd64/acpica/acpi_machdep.c
+++ b/sys/amd64/acpica/acpi_machdep.c
@@ -87,13 +87,6 @@ acpi_machdep_quirks(int *quirks)
return (0);
}
-void
-acpi_cpu_c1()
-{
-
- __asm __volatile("sti; hlt");
-}
-
/*
* Support for mapping ACPI tables during early boot. Currently this
* uses the crashdump map to map each table. However, the crashdump
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index 9083421..0813e5f 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -91,6 +91,7 @@ struct dumperinfo;
void *alloc_fpusave(int flags);
void amd64_syscall(struct thread *td, int traced);
void busdma_swi(void);
+bool cpu_mwait_usable(void);
void cpu_probe_amdc1e(void);
void cpu_setregs(void);
void doreti_iret(void) __asm(__STRING(doreti_iret));
diff --git a/sys/dev/acpica/acpi_cpu.c b/sys/dev/acpica/acpi_cpu.c
index 8df2782..3fb21a6 100644
--- a/sys/dev/acpica/acpi_cpu.c
+++ b/sys/dev/acpica/acpi_cpu.c
@@ -47,6 +47,8 @@ __FBSDID("$FreeBSD$");
#include <machine/bus.h>
#if defined(__amd64__) || defined(__i386__)
#include <machine/clock.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
#endif
#include <sys/rman.h>
@@ -70,6 +72,10 @@ struct acpi_cx {
uint32_t power; /* Power consumed (mW). */
int res_type; /* Resource type for p_lvlx. */
int res_rid; /* Resource ID for p_lvlx. */
+ bool do_mwait;
+ uint32_t mwait_hint;
+ bool mwait_hw_coord;
+ bool mwait_bm_avoidance;
};
#define MAX_CX_STATES 8
@@ -128,6 +134,12 @@ struct acpi_cpu_device {
#define PIIX4_STOP_BREAK_MASK (PIIX4_BRLD_EN_IRQ0 | PIIX4_BRLD_EN_IRQ | PIIX4_BRLD_EN_IRQ8)
#define PIIX4_PCNTRL_BST_EN (1<<10)
+#define CST_FFH_VENDOR_INTEL 1
+#define CST_FFH_INTEL_CL_C1IO 1
+#define CST_FFH_INTEL_CL_MWAIT 2
+#define CST_FFH_MWAIT_HW_COORD 0x0001
+#define CST_FFH_MWAIT_BM_AVOID 0x0002
+
/* Allow users to ignore processor orders in MADT. */
static int cpu_unordered;
SYSCTL_INT(_debug_acpi, OID_AUTO, cpu_unordered, CTLFLAG_RDTUN,
@@ -348,7 +360,17 @@ acpi_cpu_attach(device_t dev)
* so advertise this ourselves. Note this is not the same as independent
* SMP control where each CPU can have different settings.
*/
- sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3;
+ sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3 |
+ ACPI_CAP_C1_IO_HALT;
+
+#if defined(__i386__) || defined(__amd64__)
+ /*
+ * Ask for MWAIT modes if interrupts work reasonably with MWAIT.
+ */
+ if (cpu_mwait_usable())
+ sc->cpu_features |= ACPI_CAP_SMP_C1_NATIVE | ACPI_CAP_SMP_C3_NATIVE;
+#endif
+
if (devclass_get_drivers(acpi_cpu_devclass, &drivers, &drv_count) == 0) {
for (i = 0; i < drv_count; i++) {
if (ACPI_GET_FEATURES(drivers[i], &features) == 0)
@@ -720,6 +742,27 @@ acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc)
}
}
+static void
+acpi_cpu_cx_cst_mwait(struct acpi_cx *cx_ptr, uint64_t address, int accsize)
+{
+
+ cx_ptr->do_mwait = true;
+ cx_ptr->mwait_hint = address & 0xffffffff;
+ cx_ptr->mwait_hw_coord = (accsize & CST_FFH_MWAIT_HW_COORD) != 0;
+ cx_ptr->mwait_bm_avoidance = (accsize & CST_FFH_MWAIT_BM_AVOID) != 0;
+}
+
+static void
+acpi_cpu_cx_cst_free_plvlx(device_t cpu_dev, struct acpi_cx *cx_ptr)
+{
+
+ if (cx_ptr->p_lvlx == NULL)
+ return;
+ bus_release_resource(cpu_dev, cx_ptr->res_type, cx_ptr->res_rid,
+ cx_ptr->p_lvlx);
+ cx_ptr->p_lvlx = NULL;
+}
+
/*
* Parse a _CST package and set up its Cx states. Since the _CST object
* can change dynamically, our notify handler may call this function
@@ -734,7 +777,8 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
ACPI_OBJECT *top;
ACPI_OBJECT *pkg;
uint32_t count;
- int i;
+ uint64_t address;
+ int i, vendor, class, accsize;
ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
@@ -790,6 +834,30 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
/* Validate the state to see if we should use it. */
switch (cx_ptr->type) {
case ACPI_STATE_C1:
+ acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr);
+#if defined(__i386__) || defined(__amd64__)
+ if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address,
+ &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL) {
+ if (class == CST_FFH_INTEL_CL_C1IO) {
+ /* C1 I/O then Halt */
+ cx_ptr->res_rid = sc->cpu_cx_count;
+ bus_set_resource(sc->cpu_dev, SYS_RES_IOPORT,
+ cx_ptr->res_rid, address, 1);
+ cx_ptr->p_lvlx = bus_alloc_resource_any(sc->cpu_dev,
+ SYS_RES_IOPORT, &cx_ptr->res_rid, RF_ACTIVE |
+ RF_SHAREABLE);
+ if (cx_ptr->p_lvlx == NULL) {
+ bus_delete_resource(sc->cpu_dev, SYS_RES_IOPORT,
+ cx_ptr->res_rid);
+ device_printf(sc->cpu_dev,
+ "C1 I/O failed to allocate port %d, "
+ "degrading to C1 Halt", (int)address);
+ }
+ } else if (class == CST_FFH_INTEL_CL_MWAIT) {
+ acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize);
+ }
+ }
+#endif
if (sc->cpu_cx_states[0].type == ACPI_STATE_C0) {
/* This is the first C1 state. Use the reserved slot. */
sc->cpu_cx_states[0] = *cx_ptr;
@@ -818,23 +886,34 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
}
/* Free up any previous register. */
- if (cx_ptr->p_lvlx != NULL) {
- bus_release_resource(sc->cpu_dev, cx_ptr->res_type, cx_ptr->res_rid,
- cx_ptr->p_lvlx);
- cx_ptr->p_lvlx = NULL;
- }
+ acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr);
/* Allocate the control register for C2 or C3. */
- cx_ptr->res_rid = sc->cpu_cx_count;
- acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type, &cx_ptr->res_rid,
- &cx_ptr->p_lvlx, RF_SHAREABLE);
- if (cx_ptr->p_lvlx) {
+#if defined(__i386__) || defined(__amd64__)
+ if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address,
+ &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL &&
+ class == CST_FFH_INTEL_CL_MWAIT) {
+ /* Native C State Instruction use (mwait) */
+ acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize);
ACPI_DEBUG_PRINT((ACPI_DB_INFO,
- "acpi_cpu%d: Got C%d - %d latency\n",
- device_get_unit(sc->cpu_dev), cx_ptr->type,
- cx_ptr->trans_lat));
+ "acpi_cpu%d: Got C%d/mwait - %d latency\n",
+ device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat));
cx_ptr++;
sc->cpu_cx_count++;
+ } else
+#endif
+ {
+ cx_ptr->res_rid = sc->cpu_cx_count;
+ acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type,
+ &cx_ptr->res_rid, &cx_ptr->p_lvlx, RF_SHAREABLE);
+ if (cx_ptr->p_lvlx) {
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "acpi_cpu%d: Got C%d - %d latency\n",
+ device_get_unit(sc->cpu_dev), cx_ptr->type,
+ cx_ptr->trans_lat));
+ cx_ptr++;
+ sc->cpu_cx_count++;
+ }
}
}
AcpiOsFree(buf.Pointer);
@@ -1043,7 +1122,14 @@ acpi_cpu_idle(sbintime_t sbt)
*/
if (cx_next->type == ACPI_STATE_C1) {
cputicks = cpu_ticks();
- acpi_cpu_c1();
+ if (cx_next->p_lvlx != NULL) {
+ /* C1 I/O then Halt */
+ CPU_GET_REG(cx_next->p_lvlx, 1);
+ }
+ if (cx_next->do_mwait)
+ acpi_cpu_idle_mwait(cx_next->mwait_hint);
+ else
+ acpi_cpu_c1();
end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate();
if (curthread->td_critnest == 0)
end_time = min(end_time, 500000 / hz);
@@ -1055,7 +1141,7 @@ acpi_cpu_idle(sbintime_t sbt)
* For C3, disable bus master arbitration and enable bus master wake
* if BM control is available, otherwise flush the CPU cache.
*/
- if (cx_next->type == ACPI_STATE_C3) {
+ if (cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) {
if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 1);
AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1);
@@ -1076,7 +1162,10 @@ acpi_cpu_idle(sbintime_t sbt)
start_time = 0;
cputicks = cpu_ticks();
}
- CPU_GET_REG(cx_next->p_lvlx, 1);
+ if (cx_next->do_mwait)
+ acpi_cpu_idle_mwait(cx_next->mwait_hint);
+ else
+ CPU_GET_REG(cx_next->p_lvlx, 1);
/*
* Read the end time twice. Since it may take an arbitrary time
@@ -1092,8 +1181,8 @@ acpi_cpu_idle(sbintime_t sbt)
end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate();
/* Enable bus master arbitration and disable bus master wakeup. */
- if (cx_next->type == ACPI_STATE_C3 &&
- (cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
+ if ((cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) &&
+ (cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 0);
AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0);
}
diff --git a/sys/dev/acpica/acpi_package.c b/sys/dev/acpica/acpi_package.c
index e38fea5..c1070cb 100644
--- a/sys/dev/acpica/acpi_package.c
+++ b/sys/dev/acpica/acpi_package.c
@@ -120,6 +120,28 @@ acpi_PkgGas(device_t dev, ACPI_OBJECT *res, int idx, int *type, int *rid,
return (acpi_bus_alloc_gas(dev, type, rid, &gas, dst, flags));
}
+int
+acpi_PkgFFH_IntelCpu(ACPI_OBJECT *res, int idx, int *vendor, int *class,
+ uint64_t *address, int *accsize)
+{
+ ACPI_GENERIC_ADDRESS gas;
+ ACPI_OBJECT *obj;
+
+ obj = &res->Package.Elements[idx];
+ if (obj == NULL || obj->Type != ACPI_TYPE_BUFFER ||
+ obj->Buffer.Length < sizeof(ACPI_GENERIC_ADDRESS) + 3)
+ return (EINVAL);
+
+ memcpy(&gas, obj->Buffer.Pointer + 3, sizeof(gas));
+ if (gas.SpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE)
+ return (ERESTART);
+ *vendor = gas.BitWidth;
+ *class = gas.BitOffset;
+ *address = gas.Address;
+ *accsize = gas.AccessWidth;
+ return (0);
+}
+
ACPI_HANDLE
acpi_GetReference(ACPI_HANDLE scope, ACPI_OBJECT *obj)
{
diff --git a/sys/dev/acpica/acpivar.h b/sys/dev/acpica/acpivar.h
index 2e2b96d..cbd4bd9 100644
--- a/sys/dev/acpica/acpivar.h
+++ b/sys/dev/acpica/acpivar.h
@@ -467,6 +467,8 @@ int acpi_PkgInt32(ACPI_OBJECT *res, int idx, uint32_t *dst);
int acpi_PkgStr(ACPI_OBJECT *res, int idx, void *dst, size_t size);
int acpi_PkgGas(device_t dev, ACPI_OBJECT *res, int idx, int *type,
int *rid, struct resource **dst, u_int flags);
+int acpi_PkgFFH_IntelCpu(ACPI_OBJECT *res, int idx, int *vendor,
+ int *class, uint64_t *address, int *accsize);
ACPI_HANDLE acpi_GetReference(ACPI_HANDLE scope, ACPI_OBJECT *obj);
/*
diff --git a/sys/i386/acpica/acpi_machdep.c b/sys/i386/acpica/acpi_machdep.c
index 049354b..4c79691 100644
--- a/sys/i386/acpica/acpi_machdep.c
+++ b/sys/i386/acpica/acpi_machdep.c
@@ -106,13 +106,6 @@ acpi_machdep_quirks(int *quirks)
return (0);
}
-void
-acpi_cpu_c1()
-{
-
- __asm __volatile("sti; hlt");
-}
-
/*
* Support for mapping ACPI tables during early boot. This abuses the
* crashdump map because the kernel cannot allocate KVA in
diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h
index bffdd57..b5bd35e 100644
--- a/sys/i386/include/md_var.h
+++ b/sys/i386/include/md_var.h
@@ -97,6 +97,7 @@ struct dumperinfo;
void *alloc_fpusave(int flags);
void bcopyb(const void *from, void *to, size_t len);
void busdma_swi(void);
+bool cpu_mwait_usable(void);
void cpu_probe_amdc1e(void);
void cpu_setregs(void);
void cpu_switch_load_gs(void) __asm(__STRING(cpu_switch_load_gs));
diff --git a/sys/x86/include/acpica_machdep.h b/sys/x86/include/acpica_machdep.h
index 46080c0..136285c 100644
--- a/sys/x86/include/acpica_machdep.h
+++ b/sys/x86/include/acpica_machdep.h
@@ -74,6 +74,7 @@ enum intr_polarity;
void acpi_SetDefaultIntrModel(int model);
void acpi_cpu_c1(void);
+void acpi_cpu_idle_mwait(uint32_t mwait_hint);
void *acpi_map_table(vm_paddr_t pa, const char *sig);
void acpi_unmap_table(void *table);
vm_paddr_t acpi_find_table(const char *sig);
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index 846a123..d1d49f4 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#ifdef SMP
#include <machine/smp.h>
#endif
+#include <x86/acpica_machdep.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -130,6 +131,27 @@ cpu_flush_dcache(void *ptr, size_t len)
/* Not applicable */
}
+void
+acpi_cpu_c1(void)
+{
+
+ __asm __volatile("sti; hlt");
+}
+
+void
+acpi_cpu_idle_mwait(uint32_t mwait_hint)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ /*
+ * XXXKIB. Software coordination mode should be supported,
+ * but all Intel CPUs provide hardware coordination.
+ */
+ cpu_monitor(state, 0, 0);
+ cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+}
+
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
@@ -232,6 +254,15 @@ cpu_halt(void)
#endif
+bool
+cpu_mwait_usable(void)
+{
+
+ return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
+ (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
+ (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
+}
+
void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
@@ -258,7 +289,7 @@ cpu_idle_acpi(sbintime_t sbt)
else if (cpu_idle_hook)
cpu_idle_hook(sbt);
else
- __asm __volatile("sti; hlt");
+ acpi_cpu_c1();
*state = STATE_RUNNING;
}
#endif /* !PC98 */
@@ -292,7 +323,7 @@ cpu_idle_hlt(sbintime_t sbt)
if (sched_runnable())
enable_intr();
else
- __asm __volatile("sti; hlt");
+ acpi_cpu_c1();
*state = STATE_RUNNING;
}
#endif