Skip to content

Commit

Permalink
hw: calibrate TSC via ACPI timer
Browse files Browse the repository at this point in the history
To get the Time Stamp Counter's frequency, hw relied on a complex and
incomplete algorithm.

Since this is a one-time initialization issue, move TSC calibration to
bootstrap and implement it using the ACPI timer.

Issue genodelabs#5215
  • Loading branch information
atopia committed Jan 15, 2025
1 parent c564013 commit 272e77a
Show file tree
Hide file tree
Showing 10 changed files with 109 additions and 169 deletions.
1 change: 1 addition & 0 deletions repos/base-hw/src/bootstrap/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace Bootstrap {

using Genode::addr_t;
using Genode::size_t;
using Genode::uint32_t;
using Boot_info = Hw::Boot_info<::Board::Boot_info>;
using Hw::Mmio_space;
using Hw::Mapping;
Expand Down
31 changes: 31 additions & 0 deletions repos/base-hw/src/bootstrap/spec/x86_64/platform.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <hw/memory_consts.h>
#include <hw/spec/x86_64/acpi.h>
#include <hw/spec/x86_64/apic.h>
#include <hw/spec/x86_64/x86_64.h>

using namespace Genode;

Expand Down Expand Up @@ -66,6 +67,34 @@ static Hw::Acpi_rsdp search_rsdp(addr_t area, addr_t area_size)
}


static uint32_t calibrate_tsc_frequency(addr_t fadt_addr)
{
const unsigned Tsc_fixed_value = 2400;
uint32_t default_freq = Tsc_fixed_value * 1000;

uint32_t const sleep_ms = 10;

if (!fadt_addr) {
Genode::error("FADT not found, returning default TSC value");

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

For more clarity, I suggest to change the error message to a warning like follows. (Maybe move before line 75?)

if (!fadt_addr) {
  warning("FADT not found, returning fixed TSC frequency of ", ..., " kHz");
  return default_freq;
}
return default_freq;
}

Hw::Acpi_fadt fadt(reinterpret_cast<Hw::Acpi_generic *>(fadt_addr));

uint32_t val = fadt.calibrate_freq_khz(sleep_ms, []() {
return Hw::Tsc::rdtsc();
});

if (!val)
{

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

Please consider this as warning with the TSC ref value too.

Also, { follows the if statement on the same line.

Genode::error("Unable to calibrate TSC, returning default value");
return default_freq;
}

return val;
}


Bootstrap::Platform::Board::Board()
:
core_mmio(Memory_region { 0, 0x1000 },
Expand Down Expand Up @@ -250,6 +279,8 @@ Bootstrap::Platform::Board::Board()
cpus = !cpus ? 1 : max_cpus;
}

info.tsc_frequency = calibrate_tsc_frequency(info.acpi_fadt);

/* copy 16 bit boot code for AP CPUs and for ACPI resume */
addr_t ap_code_size = (addr_t)&_start - (addr_t)&_ap;
memcpy((void *)AP_BOOT_CODE_PAGE, &_ap, ap_code_size);
Expand Down
4 changes: 2 additions & 2 deletions repos/base-hw/src/core/spec/x86_64/platform_support.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ void Platform::_init_additional_platform_info(Xml_generator &xml)
xml.attribute("vmx", Hw::Virtualization_support::has_vmx());
});
xml.node("tsc", [&] {
xml.attribute("invariant", Hw::Lapic::invariant_tsc());
xml.attribute("freq_khz", Hw::Lapic::tsc_freq());
xml.attribute("invariant", Hw::Tsc::invariant_tsc());
xml.attribute("freq_khz", _boot_info().plat_info.tsc_frequency);
});
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ void Vmcb::write_vcpu_state(Vcpu_state &state)
/* Guest activity state (actv) not used by SVM */
state.actv_state.set_charged();

state.tsc.charge(Hw::Lapic::rdtsc());
state.tsc.charge(Hw::Tsc::rdtsc());
state.tsc_offset.charge(v.read<Vmcb_buf::Tsc_offset>());

state.efer.charge(v.read<Vmcb_buf::Efer>());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ void Board::Vcpu_context::write_vcpu_state(Vcpu_state &state)
state.r14.charge(regs->r14);
state.r15.charge(regs->r15);

state.tsc.charge(Hw::Lapic::rdtsc());
state.tsc.charge(Hw::Tsc::rdtsc());

tsc_aux_guest = Cpu::Ia32_tsc_aux::read();
state.tsc_aux.charge(tsc_aux_guest);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ void Vmcs::write_vcpu_state(Genode::Vcpu_state &state)
state.actv_state.charge(
static_cast<uint32_t>(read(E_GUEST_ACTIVITY_STATE)));

state.tsc.charge(Hw::Lapic::rdtsc());
state.tsc.charge(Hw::Tsc::rdtsc());
state.tsc_offset.charge(read(E_TSC_OFFSET));

state.efer.charge(read(E_GUEST_IA32_EFER));
Expand Down
55 changes: 55 additions & 0 deletions repos/base-hw/src/include/hw/spec/x86_64/acpi.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ struct Hw::Acpi_fadt : Genode::Mmio<276>
struct Smi_cmd : Register<0x30, 32> { };
struct Acpi_enable : Register<0x34, 8> { };

struct Pm_tmr_len : Register< 91, 8> { };

struct Pm1a_cnt_blk : Register < 64, 32> {
struct Slp_typ : Bitfield < 10, 3> { };
struct Slp_ena : Bitfield < 13, 1> { };
Expand Down Expand Up @@ -123,6 +125,13 @@ struct Hw::Acpi_fadt : Genode::Mmio<276>
};
struct Pm1b_cnt_blk_ext_addr : Register < 184 + 4, 64> { };

struct X_pm_tmr_blk : Register < 208, 32> {
struct Addressspace : Bitfield < 0, 8> { };
struct Width : Bitfield < 8, 8> { };
};

struct X_pm_tmr_blk_addr : Register < 208 + 4, 64> { };

struct Gpe0_blk_ext : Register < 220, 32> {
struct Addressspace : Bitfield < 0, 8> { };
struct Width : Bitfield < 8, 8> { };
Expand Down Expand Up @@ -232,6 +241,52 @@ struct Hw::Acpi_fadt : Genode::Mmio<276>
return pm1_a | pm1_b;
}

/* see ACPI spec version 6.5 4.8.3.3. Power Management Timer (PM_TMR) */
uint32_t read_pm_tmr()
{
if (read<Pm_tmr_len>() != 4)
return 0;

addr_t const tmr_addr = read<X_pm_tmr_blk_addr>();

if (!tmr_addr)
return 0;

uint8_t const tmr_addr_type =
read<Hw::Acpi_fadt::X_pm_tmr_blk::Addressspace>();

/* I/O port address, most likely */
if (tmr_addr_type == 1) {
return inl((uint16_t)tmr_addr);
}

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

if one-liner here too and below.


/* System Memory space address */
if (tmr_addr_type == 0) {
return *(uint32_t *)tmr_addr;
}

return 0;
}

uint32_t calibrate_freq_khz(uint32_t sleep_ms, auto get_value_fn)
{
unsigned const acpi_timer_freq = 3579545;

uint32_t initial = read_pm_tmr();

if (!initial) {

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

The warning is already logged in the calling function...

Genode::error("Unable to read ACPI timer");
return 0;
}

uint64_t t1 = get_value_fn();
while ((read_pm_tmr() - initial) < (acpi_timer_freq * sleep_ms / 1000))
asm volatile ("pause":::"memory");
uint64_t t2 = get_value_fn();

return (uint32_t)((t2 - t1) / sleep_ms);
}

void write_cnt_blk(unsigned value_a, unsigned value_b)
{
_write<Pm1_cnt_len, Pm1a_cnt_blk, Pm1a_cnt_blk_ext::Width,
Expand Down
9 changes: 5 additions & 4 deletions repos/base-hw/src/include/hw/spec/x86_64/pc_board.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ struct Hw::Pc_board::Serial : Genode::X86_uart

struct Hw::Pc_board::Boot_info
{
Acpi_rsdp acpi_rsdp { };
Framebuffer framebuffer { };
Genode::addr_t efi_system_table { 0 };
Genode::addr_t acpi_fadt { 0 };
Acpi_rsdp acpi_rsdp { };
Framebuffer framebuffer { };
Genode::addr_t efi_system_table { 0 };
Genode::addr_t acpi_fadt { 0 };
Genode::uint32_t tsc_frequency { 0 };

Boot_info() {}
Boot_info(Acpi_rsdp const &acpi_rsdp,
Expand Down
171 changes: 11 additions & 160 deletions repos/base-hw/src/include/hw/spec/x86_64/x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
namespace Hw {
struct Cpu_memory_map;
struct Virtualization_support;
class Vendor;
class Lapic;
class Vendor;
struct Tsc;
}


Expand Down Expand Up @@ -107,172 +107,23 @@ class Hw::Vendor
};


class Hw::Lapic
struct Hw::Tsc
{
private:
static bool _has_tsc_dl()
static Genode::uint64_t rdtsc()
{
using Cpu = Hw::X86_64_cpu;

Cpu::Cpuid_1_ecx::access_t ecx = Cpu::Cpuid_1_ecx::read();
return (bool)Cpu::Cpuid_1_ecx::Tsc_deadline::get(ecx);
Genode::uint32_t low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (Genode::uint64_t)(high) << 32 | low;
}

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

Please compare this implementation to repos/base/include/spec/x86_64/trace/timestamp.h. Are you sure the cpuid instruction is not required here?

This comment has been minimized.

Copy link
@atopia

atopia Jan 16, 2025

Author Owner

Interesting point. In the use case of providing the TSC value to vCPUs I don't want to synchronize on previous instructions, I really just want the current TSC value.
Of course in the case of the TSC calibration loop, the the second rdtsc() must not be executed before the comparison to the final ACPI timer value has broken the pause loop.
As I understand the Intel SDM, for our purpose this should be guaranteed sufficiently by the pause instruction: "The PAUSE
instruction provides a hint to the processor that the code sequence is a spin-wait loop. The processor uses this hint
to avoid the memory order violation in most situations, which greatly improves processor performance. For this
reason, it is recommended that a PAUSE instruction be placed in all spin-wait loops."
(Intel SDM September 2023 Vol. 2B, section 4.3)

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 16, 2025

From an offline discussion this morning I took along that rdtsc should always be fenced at least. Could anybody acquainted with this topic please drop a reference here?

This comment has been minimized.

Copy link
@cnuke

cnuke Jan 20, 2025

This comment has been minimized.

Copy link
@atopia

atopia Jan 20, 2025

Author Owner

Thanks @cnuke for the interesting hints. I did try to find a better explanation for Intel's "avoid the memory order violation in most situations" (emphasis mine) and my understanding from https://stackoverflow.com/a/12904645 is that for my use case of calibrating the TSC, just relying on pause for the loop would still be fine, because the way I understand it, pause should prevent speculative execution of the rdtsc after the loop. As anecdotal evidence, our version of the nova kernel does not do any additional serialization either.
I guess theoretically the first rdtsc could be executed out of order, but but given that the comparison here

while ((read_pm_tmr() - initial) < (acpi_timer_freq * sleep_ms / 1000))

will produce different values depending on the cycle count of the pause instruction, I wonder if this is significant for this use case?

For the use case of passing the TSC value into the VMM via Vcpu_state, I would argue that the lack of serialization does not matter because the value that is passed up to the VMM is fairly imprecise anyway.

If anyone disagrees with this analysis I am happy to add proper serialization.
From https://stackoverflow.com/questions/51844886/is-lfence-serializing-on-amd-processors and https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be261ffce6f13229dad50f59c5e491f933d3167f it appears that lfence might not be serializing on AMD, whereas cpuid could taint the measurement with a VM exit when running hw in a virtualized environment.

This comment has been minimized.

Copy link
@chelmuth

chelmuth Jan 21, 2025

I comprehend the combination of pause and rdtsc in the calibration-loop use case. On the other hand, the rdtsc() function (like Trace::timestamp()) appears like an independent function that it is not because of the serialization requirement. We should at least add a bold note in the documentation of the function how to use it in other use cases. Or maybe just add the lfence?

This comment has been minimized.

Copy link
@atopia

atopia Jan 21, 2025

Author Owner

I have summed up the discussion in genodelabs#5430 and after consideration decided to set up dispatch serializing lfence on AMD processors and serialize rdtsc access in d904299. While I stand by my interpretation that pause should serialize the second rdtsc, the potential out of order execution of the first rdtsc at the start of the calibration loop and having to add a bunch of warning comments didn't seem the right way to go.


/*
* Adapted from Christian Prochaska's and Alexander Boettcher's
* implementation for Nova.
*
* For details, see Vol. 3B of the Intel SDM (September 2023):
* 20.7.3 Determining the Processor Base Frequency
*/
static unsigned _read_tsc_freq()
static bool invariant_tsc()
{
using Cpu = Hw::X86_64_cpu;

if (Vendor::get_vendor_id() != Vendor::INTEL)
return 0;

unsigned const model = Vendor::get_model();
unsigned const family = Vendor::get_family();

enum
{
Cpu_id_clock = 0x15,
Cpu_id_base_freq = 0x16
};

Cpu::Cpuid_0_eax::access_t eax_0 = Cpu::Cpuid_0_eax::read();

/*
* If CPUID leaf 15 is available, return the frequency reported there.
*/
if (eax_0 >= Cpu_id_clock) {
Cpu::Cpuid_15_eax::access_t eax_15 = Cpu::Cpuid_15_eax::read();
Cpu::Cpuid_15_ebx::access_t ebx_15 = Cpu::Cpuid_15_ebx::read();
Cpu::Cpuid_15_ecx::access_t ecx_15 = Cpu::Cpuid_15_ecx::read();

if (eax_15 && ebx_15) {
if (ecx_15)
return static_cast<unsigned>(
((Genode::uint64_t)(ecx_15) * ebx_15) / eax_15 / 1000
);

if (family == 6) {
if (model == 0x5c) /* Goldmont */
return static_cast<unsigned>((19200ull * ebx_15) / eax_15);
if (model == 0x55) /* Xeon */
return static_cast<unsigned>((25000ull * ebx_15) / eax_15);
}

if (family >= 6)
return static_cast<unsigned>((24000ull * ebx_15) / eax_15);
}
}


/*
* Specific methods for family 6 models
*/
if (family == 6) {
unsigned freq_tsc = 0U;

if (model == 0x2a ||
model == 0x2d || /* Sandy Bridge */
model >= 0x3a) /* Ivy Bridge and later */
{
Cpu::Platform_info::access_t platform_info = Cpu::Platform_info::read();
Genode::uint64_t ratio = Cpu::Platform_info::Ratio::get(platform_info);
freq_tsc = static_cast<unsigned>(ratio * 100000);
} else if (model == 0x1a ||
model == 0x1e ||
model == 0x1f ||
model == 0x2e || /* Nehalem */
model == 0x25 ||
model == 0x2c ||
model == 0x2f) /* Xeon Westmere */
{
Cpu::Platform_info::access_t platform_info = Cpu::Platform_info::read();
Genode::uint64_t ratio = Cpu::Platform_info::Ratio::get(platform_info);
freq_tsc = static_cast<unsigned>(ratio * 133330);
} else if (model == 0x17 || model == 0xf) { /* Core 2 */
Cpu::Fsb_freq::access_t fsb_freq = Cpu::Fsb_freq::read();
Genode::uint64_t freq_bus = Cpu::Fsb_freq::Speed::get(fsb_freq);

switch (freq_bus) {
case 0b101: freq_bus = 100000; break;
case 0b001: freq_bus = 133330; break;
case 0b011: freq_bus = 166670; break;
case 0b010: freq_bus = 200000; break;
case 0b000: freq_bus = 266670; break;
case 0b100: freq_bus = 333330; break;
case 0b110: freq_bus = 400000; break;
default: freq_bus = 0; break;
}

Cpu::Platform_id::access_t platform_id = Cpu::Platform_id::read();
Genode::uint64_t ratio = Cpu::Platform_id::Bus_ratio::get(platform_id);

freq_tsc = static_cast<unsigned>(freq_bus * ratio);
}

if (!freq_tsc)
Genode::warning("TSC: family 6 Intel platform info reports bus frequency of 0");
else
return freq_tsc;
}


/*
* Finally, using Processor Frequency Information for a rough estimate
*/
if (eax_0 >= Cpu_id_base_freq) {
Cpu::Cpuid_16_eax::access_t base_mhz = Cpu::Cpuid_16_eax::read();

if (base_mhz) {
Genode::warning("TSC: using processor base frequency: ", base_mhz, " MHz");
return base_mhz * 1000;
} else {
Genode::warning("TSC: CPUID reported processor base frequency of 0");
}
}

return 0;
}

static unsigned _measure_tsc_freq()
{
const unsigned Tsc_fixed_value = 2400;

Genode::warning("TSC: calibration not yet implemented, using fixed value of ", Tsc_fixed_value, " MHz");
/* TODO: implement TSC calibration on AMD */
return Tsc_fixed_value * 1000;
Cpu::Cpuid_80000007_eax::access_t eax =
Cpu::Cpuid_80000007_eax::read();
return Cpu::Cpuid_80000007_eax::Invariant_tsc::get(eax);
}

public:
static Genode::uint64_t rdtsc()
{
Genode::uint32_t low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (Genode::uint64_t)(high) << 32 | low;
}

static bool invariant_tsc()
{
using Cpu = Hw::X86_64_cpu;

Cpu::Cpuid_80000007_eax::access_t eax =
Cpu::Cpuid_80000007_eax::read();
return Cpu::Cpuid_80000007_eax::Invariant_tsc::get(eax);
}

static unsigned tsc_freq()
{
unsigned freq = _read_tsc_freq();
if (freq)
return freq;
else
return _measure_tsc_freq();
}
};

struct Hw::Virtualization_support
Expand Down
1 change: 1 addition & 0 deletions repos/base-hw/src/include/hw/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ namespace Hw {

using Genode::addr_t;
using Genode::size_t;
using Genode::uint32_t;
using Genode::get_page_size;
using Genode::get_page_size_log2;

Expand Down

4 comments on commit 272e77a

@chelmuth
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume you tested this implementation on x260 and your Nano, but how does the calibration affects test runs in Qemu on systems with varying loads? In the past we at least had issues with the TSC calibration loop of the Fiasco kernel.

@atopia
Copy link
Owner Author

@atopia atopia commented on 272e77a Jan 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late reply, comments on my clone of the repo keep going to my private email address and I haven't found a per-repo setting to fix this (I tried some months ago and now I get notifications on repo pushes to @genode-labs.com but still not for comments).
Anyway, I had tested the implementation on the x260. For reasons I don't have the means to debug right now, even without the PIT loop, hw does not boot on the X1 Nano Gen 2.
When running the calibration in Qemu alongside artificial load, in maybe 10-20% of the tests the TSC frequency measured inside Qemu goes up significantly, nearly up to 25% (e.g. 1996968kHz vs. 2477338kHz).

@chelmuth
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the late reply, comments on my clone of the repo keep going to my private email address and I haven't found a per-repo setting to fix this (I tried some months ago and now I get notifications on repo pushes to @genode-labs.com but still not for comments).

There exists a simple and reliable fix: Use only the @genode-labs.com email address for your GitHub genodelabs collaboration account and, thus, clearly distinguish this collaboration from private matters. In my opinion there's no room left for discussing this matter further.

@chelmuth
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anyway, I had tested the implementation on the x260. For reasons I don't have the means to debug right now, even without the PIT loop, hw does not boot on the X1 Nano Gen 2.

Looking forward to future findings and will test on Tigerlake myself once the commits entered staging.

When running the calibration in Qemu alongside artificial load, in maybe 10-20% of the tests the TSC frequency measured inside Qemu goes up significantly, nearly up to 25% (e.g. 1996968kHz vs. 2477338kHz).

As long as we stay with limited accuracy and nothing just hangs, I'd be okay with the results.

Please sign in to comment.