diff --git a/sbin/sysctl/sysctl.c b/sbin/sysctl/sysctl.c
index 5e9e562d308b..25a66b88c825 100644
--- a/sbin/sysctl/sysctl.c
+++ b/sbin/sysctl/sysctl.c
@@ -222,6 +222,7 @@ int sysctl_chipset(char *, char **, int *, int, int *);
 int sysctl_audio(char *, char **, int *, int, int *);
 int sysctl_video(char *, char **, int *, int, int *);
 int sysctl_witness(char *, char **, int *, int, int *);
+int sysctl_hwp(char *, char **, int *, int, int *);
 void vfsinit(void);
 
 char *equ = "=";
@@ -763,6 +764,14 @@ parse(char *string, int flags)
 				return;
 			break;
 		}
+#endif
+#ifdef CPU_HWP
+		if (mib[1] == CPU_HWP) {
+			len = sysctl_hwp(string, &bufp, mib, flags, &type);
+			if (len < 0)
+				return;
+			break;
+		}
 #endif
 		break;
 
@@ -2862,6 +2871,30 @@ sysctl_witness(char *string, char **bufpp, int mib[], int flags, int *typep)
 	return (3);
 }
 
+#ifdef CPU_HWP
+/*
+ * handle machdep.hwp requests
+ */
+struct ctlname hwpname[] = CTL_HWP_NAMES;
+struct list hwplist = { hwpname, HWP_MAXID };
+
+int
+sysctl_hwp(char *string, char **bufpp, int mib[], int flags, int *typep)
+{
+	int indx;
+
+	if (*bufpp == NULL) {
+		listall(string, &hwplist);
+		return (-1);
+	}
+	if ((indx = findname(string, "third", bufpp, &hwplist)) == -1)
+		return (-1);
+	mib[2] = indx;
+	*typep = hwplist.list[indx].ctl_type;
+	return (3);
+}
+#endif
+
 /*
  * Scan a list of names searching for a particular name.
  */
diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index 0ed9962bbfdb..2ab360bbc36a 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -213,7 +213,11 @@ const struct {
 	{ SEFF0EDX_SSBD,	"SSBD" },
 }, cpu_tpm_eaxfeatures[] = {
 	{ TPM_SENSOR,		"SENSOR" },
+	{ TPM_TURBO,		"TURBO" },
 	{ TPM_ARAT,		"ARAT" },
+	{ TPM_HWP,		"HWP" },
+}, cpu_tpm_ecxfeatures[] = {
+	{ TPM_EPB,		"EPB" },
 }, cpu_cpuid_perf_eax[] = {
 	{ CPUIDEAX_VERID,	"PERF" },
 }, cpu_cpuid_apmi_edx[] = {
@@ -545,6 +549,8 @@ identifycpu(struct cpu_info *ci)
 		/* Check if it's an invariant TSC */
 		if (cpu_apmi_edx & CPUIDEDX_ITSC)
 			ci->ci_flags |= CPUF_INVAR_TSC;
 	}
 
 	freq = cpu_freq(ci);
+
+	tsc_timecounter_init(ci, freq);
@@ -605,14 +611,19 @@ identifycpu(struct cpu_info *ci)
 	}
 
 	if (!strcmp(cpu_vendor, "GenuineIntel") && cpuid_level >= 0x06) {
-		CPUID(0x06, ci->ci_feature_tpmflags, dummy, dummy, dummy);
+		CPUID(0x06, ci->ci_feature_tpmflags_eax, dummy,
+		    ci->ci_feature_tpmflags_ecx, dummy);
 		for (i = 0; i < nitems(cpu_tpm_eaxfeatures); i++)
-			if (ci->ci_feature_tpmflags &
+			if (ci->ci_feature_tpmflags_eax &
 			    cpu_tpm_eaxfeatures[i].bit)
 				printf(",%s", cpu_tpm_eaxfeatures[i].str);
+		for (i = 0; i < nitems(cpu_tpm_ecxfeatures); i++)
+			if (ci->ci_feature_tpmflags_ecx &
+			    cpu_tpm_ecxfeatures[i].bit)
+				printf(",%s", cpu_tpm_ecxfeatures[i].str);
 	} else if (!strcmp(cpu_vendor, "AuthenticAMD")) {
 		if (ci->ci_family >= 0x12)
-			ci->ci_feature_tpmflags |= TPM_ARAT;
+			ci->ci_feature_tpmflags_eax |= TPM_ARAT;
 	}
 
 	/* AMD speculation control features */
@@ -699,7 +710,9 @@ identifycpu(struct cpu_info *ci)
 			setperf_setup = k1x_init;
 	}
 
-	if (cpu_ecxfeature & CPUIDECX_EST)
+	if (ci->ci_feature_tpmflags_eax & TPM_HWP)
+		setperf_setup = pstate_init;
+	else if (cpu_ecxfeature & CPUIDECX_EST)
 		setperf_setup = est_init;
 #endif
 
@@ -722,7 +735,7 @@ identifycpu(struct cpu_info *ci)
 	}
 
 #ifndef SMALL_KERNEL
-	if (CPU_IS_PRIMARY(ci) && (ci->ci_feature_tpmflags & TPM_SENSOR)) {
+	if (CPU_IS_PRIMARY(ci) && (ci->ci_feature_tpmflags_eax & TPM_SENSOR)) {
 		strlcpy(ci->ci_sensordev.xname, ci->ci_dev->dv_xname,
 		    sizeof(ci->ci_sensordev.xname));
 		ci->ci_sensor.type = SENSOR_TEMP;
@@ -757,8 +770,6 @@ identifycpu(struct cpu_info *ci)
 #endif
 	}
 
-	tsc_timecounter_init(ci, freq);
-
 	cpu_topology(ci);
 #if NVMM > 0
 	cpu_check_vmm_cap(ci);
diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
index 1a49c56376ed..98863cbaafd1 100644
--- a/sys/arch/amd64/amd64/machdep.c
+++ b/sys/arch/amd64/amd64/machdep.c
@@ -548,6 +548,11 @@ cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 #endif
 	case CPU_TSCFREQ:
 		return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
+#ifndef SMALL_KERNEL
+	case CPU_HWP:
+		return (pstate_hwp_sysctl(name + 1, namelen - 1, oldp, oldlenp,
+		    newp, newlen, p));
+#endif
 	default:
 		return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
 		    name, namelen, oldp, oldlenp, newp, newlen));
diff --git a/sys/arch/amd64/amd64/pstate.c b/sys/arch/amd64/amd64/pstate.c
new file mode 100644
index 000000000000..cf211825d04e
--- /dev/null
+++ b/sys/arch/amd64/amd64/pstate.c
@@ -0,0 +1,303 @@
+/*	$OpenBSD$	*/
+/*
+ * Copyright (c) 2020 joshua stein
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * "The default HWP control field values are expected to be suitable for many
+ * applications. The OS can enable autonomous HWP for these common cases by:
+ *
+ * Setting IA32_HWP_REQUEST.Desired Performance = 0
+ * (hardware autonomous selection determines the performance target).
+ *
+ * Set IA32_HWP_REQUEST.Activity Window = 0 (enable HW dynamic selection of
+ * window size).
+ *
+ * To maximize HWP benefit for the common cases, the OS should set:
+ * IA32_HWP_REQUEST.Minimum_Performance =
+ * IA32_HWP_CAPABILITIES.Lowest_Performance and
+ * IA32_HWP_REQUEST.Maximum_Performance =
+ * IA32_HWP_CAPABILITIES.Highest_Performance."
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
+extern int setperf_prio;
+extern int perflevel;
+#ifdef MULTIPROCESSOR
+void mp_setperf(int);
+#endif
+
+static int pstate_hwp = 0;
+static int pstate_hwp_bias_style = -1;
+enum {
+	PSTATE_HWP_BIAS_EPP,
+	PSTATE_HWP_BIAS_EPB,
+};
+
+/* IA32_HWP_REQUEST */
+union hwp_request {
+	uint64_t msr;
+	struct {
+		uint8_t min_perf;
+		uint8_t max_perf;
+		uint8_t desired_perf;
+		uint8_t epp;
+		uint16_t act_win : 10;
+		uint8_t package : 1;
+		uint32_t reserved : 21;
+	} __packed fields;
+} pstate_hwp_req;
+
+uint64_t pstate_epb;
+
+/* IA32_HWP_CAPABILITIES */
+union hwp_capabilities {
+	uint64_t msr;
+	struct {
+		uint8_t highest_perf;
+		uint8_t guaranteed_perf;
+		uint8_t most_efficient;
+		uint8_t lowest_perf;
+		uint32_t reserved;
+	} __packed fields;
+} pstate_hwp_cap;
+
+static struct {
+	int epb_min;
+	int epb_max;
+	int epp;
+	char *label;
+} pstate_epp_labels[] = {
+	{ 0x00, 0x03, 0x00, "performance" },
+	{ 0x04, 0x07, 0x80, "balance_performance" },
+	{ 0x08, 0x0b, 0xc0, "balance_powersave" },
+	{ 0x0c, 0x0f, 0xff, "powersave" },
+};
+
+const char *pstate_hwp_bias_label(int);
+void pstate_commit(void);
+
+void
+pstate_init(struct cpu_info *ci)
+{
+	const char *cpu_device = ci->ci_dev->dv_xname;
+	uint64_t msr;
+	int16_t eppepb;
+
+	if (rdmsr_safe(MSR_PLATFORM_INFO, &msr) != 0)
+		return;
+
+	/* power management must be enabled before reading capabilities */
+	wrmsr(IA32_PM_ENABLE, 1);
+	if (rdmsr(IA32_PM_ENABLE) != 1) {
+		printf("%s: enabling HWP failed\n", cpu_device);
+		return;
+	}
+
+	if (rdmsr_safe(IA32_HWP_CAPABILITIES, &pstate_hwp_cap.msr) != 0) {
+		printf("%s: no HWP capabilities\n", cpu_device);
+		/* XXX: what are we supposed to do now? */
+		return;
+	}
+
+	if (ci->ci_feature_tpmflags_eax & TPM_HWP_EPP) {
+		pstate_hwp_bias_style = PSTATE_HWP_BIAS_EPP;
+		pstate_hwp_req.msr = rdmsr(IA32_HWP_REQUEST);
+		eppepb = pstate_hwp_req.fields.epp;
+	} else if (ci->ci_feature_tpmflags_ecx & TPM_EPB) {
+		pstate_hwp_bias_style = PSTATE_HWP_BIAS_EPB;
+		eppepb = pstate_epb = rdmsr(IA32_ENERGY_PERF_BIAS) & 0x0f;
+	} else {
+		printf("%s: no energy bias control\n", cpu_device);
+		return;
+	}
+
+	/* XXX: should we force epb to performance by default? */
+
+	pstate_hwp = 1;
+	setperf_prio = 1;
+	cpu_setperf = pstate_setperf;
+
+	printf("%s: HWP enabled, bias %s, highest perf %d MHz, "
+	    "guaranteed %d MHz, most efficient %d MHz, lowest perf %d MHz\n",
+	    cpu_device, pstate_hwp_bias_label(eppepb),
+	    pstate_hwp_cap.fields.highest_perf * 100,
+	    pstate_hwp_cap.fields.guaranteed_perf * 100,
+	    pstate_hwp_cap.fields.most_efficient * 100,
+	    pstate_hwp_cap.fields.lowest_perf * 100);
+}
+
+const char *
+pstate_hwp_bias_label(int val)
+{
+	int i;
+
+	for (i = 0; i < nitems(pstate_epp_labels); i++) {
+		if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPP) {
+			if (val == pstate_epp_labels[i].epp)
+				return pstate_epp_labels[i].label;
+		} else if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPB) {
+			if (val >= pstate_epp_labels[i].epb_min &&
+			    val <= pstate_epp_labels[i].epb_max)
+				return pstate_epp_labels[i].label;
+		}
+	}
+
+	return "unknown";
+}
+
+void
+pstate_setperf(int level)
+{
+	printf("%s: %s(%d)\n", curcpu()->ci_dev->dv_xname, __func__, level);
+
+	if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPP)
+		wrmsr(IA32_HWP_REQUEST, pstate_hwp_req.msr);
+	else if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPB)
+		wrmsr(IA32_ENERGY_PERF_BIAS, pstate_epb);
+}
+
+void
+pstate_commit(void)
+{
+#ifdef MULTIPROCESSOR
+	/*
+	 * This broadcasts X86_IPI_SETPERF so that pstate_setperf runs,
+	 * and the MSRs get written, on every CPU.
+	 */
+	mp_setperf(perflevel);
+#else
+	setperf(perflevel);
+#endif
+}
+
+/* TODO: update cpuspeed in response to hwp notifications */
+
+int
+pstate_hwp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen, struct proc *p)
+{
+	uint64_t epb;
+	const char *bias;
+	char newbias[64];
+	int newval, err, i, found = 0;
+
+	if (namelen != 1)
+		return ENOTDIR;
+
+	if (!pstate_hwp)
+		return EOPNOTSUPP;
+
+	if (name[0] < 1 || name[0] >= HWP_MAXID)
+		return EOPNOTSUPP;
+
+	pstate_hwp_req.msr = rdmsr(IA32_HWP_REQUEST);
+
+	switch (name[0]) {
+	case HWP_MIN_PERF:
+	case HWP_MAX_PERF:
+	case HWP_DESIRED_PERF:
+		switch (name[0]) {
+		case HWP_MIN_PERF:
+			newval = pstate_hwp_req.fields.min_perf;
+			break;
+		case HWP_MAX_PERF:
+			newval = pstate_hwp_req.fields.max_perf;
+			break;
+		case HWP_DESIRED_PERF:
+			newval = pstate_hwp_req.fields.desired_perf;
+			break;
+		}
+
+		if (newlen == 0)
+			return sysctl_rdint(oldp, oldlenp, newp, newval);
+
+		err = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
+		if (err)
+			return err;
+
+		if (newval < 0 || newval > 0xff)
+			return EINVAL;
+
+		switch (name[0]) {
+		case HWP_MIN_PERF:
+			pstate_hwp_req.fields.min_perf = newval;
+			break;
+		case HWP_MAX_PERF:
+			pstate_hwp_req.fields.max_perf = newval;
+			break;
+		case HWP_DESIRED_PERF:
+			pstate_hwp_req.fields.desired_perf = newval;
+			break;
+		}
+
+		printf("%s: name[0] %d, newval %d [%zu], writing hwpreq 0x%llx\n",
+		    curcpu()->ci_dev->dv_xname, name[0], newval, newlen,
+		    pstate_hwp_req.msr);
+
+		pstate_commit();
+		return 0;
+
+	case HWP_EPP:
+		if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPP)
+			epb = pstate_hwp_req.fields.epp;
+		else if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPB)
+			pstate_epb = epb = rdmsr(IA32_ENERGY_PERF_BIAS) & 0x0f;
+
+		bias = pstate_hwp_bias_label(epb);
+
+		if (newlen == 0)
+			return sysctl_rdstring(oldp, oldlenp, newp, bias);
+
+		strlcpy(newbias, bias, sizeof(newbias));
+		err = sysctl_string(oldp, oldlenp, newp, newlen, newbias,
+		    sizeof(newbias));
+		if (err)
+			return err;
+
+		for (i = 0; i < nitems(pstate_epp_labels); i++) {
+			if (strcmp(pstate_epp_labels[i].label, newbias) != 0)
+				continue;
+
+			if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPP)
+				pstate_hwp_req.fields.epp =
+				    pstate_epp_labels[i].epp;
+			else if (pstate_hwp_bias_style == PSTATE_HWP_BIAS_EPB)
+				pstate_epb = pstate_epp_labels[i].epb_max;
+
+			found = 1;
+		}
+
+		if (!found)
+			return EINVAL;
+
+		printf("%s: changing epp/epb bias to \"%s\" (0x%llx)\n",
+		    curcpu()->ci_dev->dv_xname, newbias, pstate_hwp_req.msr);
+
+		pstate_commit();
+		return 0;
+	}
+
+	return EOPNOTSUPP;
+}
diff --git a/sys/arch/amd64/conf/files.amd64 b/sys/arch/amd64/conf/files.amd64
index 7a5d40bf4cd6..b91b50804980 100644
--- a/sys/arch/amd64/conf/files.amd64
+++ b/sys/arch/amd64/conf/files.amd64
@@ -64,6 +64,7 @@ file	arch/amd64/isa/clock.c
 file	arch/amd64/amd64/powernow-k8.c	!small_kernel
 file	arch/amd64/amd64/est.c		!small_kernel
 file	arch/amd64/amd64/k1x-pstate.c	!small_kernel
+file	arch/amd64/amd64/pstate.c	!small_kernel
 
 include "dev/rasops/files.rasops"
 include "dev/wsfont/files.wsfont"
diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
index 5a76eb21e3cd..05ef1897dcbb 100644
--- a/sys/arch/amd64/include/cpu.h
+++ b/sys/arch/amd64/include/cpu.h
@@ -144,7 +144,8 @@ struct cpu_info {
 	u_int32_t	ci_feature_sefflags_ecx;
 	u_int32_t	ci_feature_sefflags_edx;
 	u_int32_t	ci_feature_amdspec_ebx;
-	u_int32_t	ci_feature_tpmflags;
+	u_int32_t	ci_feature_tpmflags_eax;
+	u_int32_t	ci_feature_tpmflags_ecx;
 	u_int32_t	ci_pnfeatset;
 	u_int32_t	ci_efeature_eax;
 	u_int32_t	ci_efeature_ecx;
@@ -430,9 +431,16 @@ void k8_powernow_setperf(int);
 void k1x_init(struct cpu_info *);
 void k1x_setperf(int);
 
+/* est.c */
 void est_init(struct cpu_info *);
 void est_setperf(int);
 
+/* pstate.c */
+void pstate_init(struct cpu_info *);
+void pstate_setperf(int);
+int pstate_hwp_sysctl(int *, u_int, void *, size_t *, void *, size_t,
+    struct proc *);
+
 #ifdef MULTIPROCESSOR
 /* mp_setperf.c */
 void mp_setperf_init(void);
@@ -458,7 +466,8 @@ void mp_setperf_init(void);
 #define CPU_TSCFREQ		16	/* TSC frequency */
 #define CPU_INVARIANTTSC	17	/* has invariant TSC */
 #define CPU_PWRACTION		18	/* action caused by power button */
-#define CPU_MAXID		19	/* number of valid machdep ids */
+#define CPU_HWP			19	/* hardware p-state knobs */
+#define CPU_MAXID		20	/* number of valid machdep ids */
 
 #define CTL_MACHDEP_NAMES { \
 	{ 0, 0 }, \
@@ -480,6 +489,28 @@ void mp_setperf_init(void);
 	{ "tscfreq", CTLTYPE_QUAD }, \
 	{ "invarianttsc", CTLTYPE_INT }, \
 	{ "pwraction", CTLTYPE_INT }, \
+	{ "hwp", CTLTYPE_NODE }, \
+}
+
+/*
+ * CTL_HWP definitions.
+ */
+#define HWP_MIN_PERF		1
+#define HWP_MIN_PERF_NAME	"min_perf"
+#define HWP_MAX_PERF		2
+#define HWP_MAX_PERF_NAME	"max_perf"
+#define HWP_DESIRED_PERF	3
+#define HWP_DESIRED_PERF_NAME	"desired_perf"
+#define HWP_EPP			4
+#define HWP_EPP_NAME		"epp_bias"
+#define HWP_MAXID		5
+
+#define CTL_HWP_NAMES { \
+	{ 0, 0 }, \
+	{ HWP_MIN_PERF_NAME, CTLTYPE_INT }, \
+	{ HWP_MAX_PERF_NAME, CTLTYPE_INT }, \
+	{ HWP_DESIRED_PERF_NAME, CTLTYPE_INT }, \
+	{ HWP_EPP_NAME, CTLTYPE_STRING }, \
 }
 
 #endif /* !_MACHINE_CPU_H_ */
diff --git a/sys/arch/amd64/include/specialreg.h b/sys/arch/amd64/include/specialreg.h
index e0232887ff69..ab2b30b50a45 100644
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -234,7 +234,17 @@
  * Thermal and Power Management (CPUID function 0x6) EAX bits
  */
 #define	TPM_SENSOR	0x00000001	/* Digital temp sensor */
+#define	TPM_TURBO	0x00000002	/* Turbo Boost available */
 #define	TPM_ARAT	0x00000004	/* APIC Timer Always Running */
+#define	TPM_HWP		0x00000080	/* Hardware P-States supported */
+#define	TPM_HWP_NOTIFY	0x00000100	/* HWP Notification */
+#define	TPM_HWP_ACT_WIN	0x00000200	/* HWP Activity Window */
+#define	TPM_HWP_EPP	0x00000400	/* HWP Energy Perf. Preference */
+
+/*
+ * Thermal and Power Management (CPUID function 0x6) ECX bits
+ */
+#define	TPM_EPB		0x00000008	/* IA32_ENERGY_PERF_BIAS supported */
 
 /*
  * "Architectural Performance Monitoring" bits (CPUID function 0x0a):
@@ -368,6 +378,7 @@
 #define MSR_PERFCTR0		0x0c1
 #define MSR_PERFCTR1		0x0c2
 #define MSR_FSB_FREQ		0x0cd	/* Core Duo/Solo only */
+#define MSR_PLATFORM_INFO	0x0ce
 #define MSR_MTRRcap		0x0fe
 #define MTRRcap_FIXED		0x100	/* bit 8 - fixed MTRRs supported */
 #define MTRRcap_WC		0x400	/* bit 10 - WC type supported */
@@ -925,12 +936,29 @@
 #define C3_CRYPT_CWLO_KEY192	0x0000040c	/* 192bit, 12 rds */
 #define C3_CRYPT_CWLO_KEY256	0x0000080e	/* 256bit, 15 rds */
 
+/* Hardware-Controlled Performance States (HWP) */
+#define IA32_ENERGY_PERF_BIAS		0x1b0
+#define IA32_PM_ENABLE			0x770
+#define IA32_HWP_CAPABILITIES		0x771
+#define IA32_HWP_REQUEST_PKG		0x772
+#define IA32_HWP_INTERRUPT		0x773
+#define IA32_HWP_REQUEST		0x774
+#define IA32_HWP_STATUS			0x777
+#define MSR_PPERF			0x64e
+
 /* Intel Silicon Debug */
 #define IA32_DEBUG_INTERFACE		0xc80
 #define IA32_DEBUG_INTERFACE_ENABLE	0x00000001
 #define IA32_DEBUG_INTERFACE_LOCK	0x40000000
 #define IA32_DEBUG_INTERFACE_MASK	0x80000000
 
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL		0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1		0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2		0x0000064A
+#define MSR_CONFIG_TDP_CONTROL		0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO	0x0000064C
+
 /*
  * VMX
  */
diff --git a/sys/dev/acpi/acpicpu.c b/sys/dev/acpi/acpicpu.c
index 9e112d75872d..591094ee8d04 100644
--- a/sys/dev/acpi/acpicpu.c
+++ b/sys/dev/acpi/acpicpu.c
@@ -539,7 +539,7 @@ acpicpu_getcst(struct acpicpu_softc *sc)
 	use_nonmwait = 0;
 	while ((next_cx = SLIST_NEXT(cx, link)) != NULL) {
 		if (cx->state > 1 &&
-		    (sc->sc_ci->ci_feature_tpmflags & TPM_ARAT) == 0)
+		    (sc->sc_ci->ci_feature_tpmflags_eax & TPM_ARAT) == 0)
 			cx->flags |= CST_FLAG_SKIP;
 		else if (cx->method != CST_METH_MWAIT)
 			use_nonmwait = 1;
@@ -573,7 +573,7 @@ acpicpu_getcst_from_fadt(struct acpicpu_softc *sc)
 		return;
 
 	/* skip these C2 and C3 states if the CPU doesn't have ARAT */
-	flags = (sc->sc_ci->ci_feature_tpmflags & TPM_ARAT)
+	flags = (sc->sc_ci->ci_feature_tpmflags_eax & TPM_ARAT)
 	    ? 0 : CST_FLAG_SKIP;
 
 	/* Some systems don't export a full PBLK; reduce functionality */
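
With the patched kernel and headers installed, the new knobs can be driven
with sysctl(8), e.g. "sysctl machdep.hwp" to list them or
"sysctl machdep.hwp.epp_bias=powersave" to change the bias.  The program
below is not part of the diff; it is only a sketch of the MIB layout the
patch exposes (CTL_MACHDEP, CPU_HWP, HWP_EPP) as seen from userland via
sysctl(2), and it assumes the patched <machine/cpu.h> is the one installed
under /usr/include.

/*
 * Example only, not part of the diff: read machdep.hwp.epp_bias.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>

#include <stdio.h>

int
main(void)
{
	int mib[3] = { CTL_MACHDEP, CPU_HWP, HWP_EPP };
	char bias[64];
	size_t len = sizeof(bias);

	/* mib[2] selects the third-level name handled by pstate_hwp_sysctl() */
	if (sysctl(mib, 3, bias, &len, NULL, 0) == -1) {
		perror("sysctl machdep.hwp.epp_bias");
		return 1;
	}
	printf("machdep.hwp.epp_bias=%s\n", bias);
	return 0;
}

Passing a label string via newp/newlen instead of NULL/0 is what sysctl(8)
does for machdep.hwp.epp_bias=... and ends up in the HWP_EPP case of
pstate_hwp_sysctl() above.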