@@ -60,6 +60,11 @@ config ACPI_VMGENID
default y
depends on PC
+config ACPI_VMCLOCK
+ bool
+ default y
+ depends on PC
+
config ACPI_VIOT
bool
depends on ACPI
@@ -16,6 +16,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_NVDIMM', if_false: files('acpi-nvdimm-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
acpi_ss.add(when: 'CONFIG_ACPI_CXL', if_true: files('cxl.c'), if_false: files('cxl-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
+acpi_ss.add(when: 'CONFIG_ACPI_VMCLOCK', if_true: files('vmclock.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))
new file mode 100644
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+
+/*
+ * This structure provides a vDSO-style clock to VM guests, exposing the
+ * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
+ * counter, etc.) and real time. It is designed to address the problem of
+ * live migration, which other clock enlightenments do not.
+ *
+ * When a guest is live migrated, this affects the clock in two ways.
+ *
+ * First, even between identical hosts the actual frequency of the underlying
+ * counter will change within the tolerances of its specification (typically
+ * ±50PPM, or 4 seconds a day). The frequency also varies over time on the
+ * same host, but can be tracked by NTP as it generally varies slowly. With
+ * live migration there is a step change in the frequency, with no warning.
+ *
+ * Second, there may be a step change in the value of the counter itself, as
+ * its accuracy is limited by the precision of the NTP synchronization on the
+ * source and destination hosts.
+ *
+ * So any calibration (NTP, PTP, etc.) which the guest has done on the source
+ * host before migration is invalid, and needs to be redone on the new host.
+ *
+ * In its most basic mode, this structure provides only an indication to the
+ * guest that live migration has occurred. This allows the guest to know that
+ * its clock is invalid and take remedial action. For applications that need
+ * reliable accurate timestamps (e.g. distributed databases), the structure
+ * can be mapped all the way to userspace. This allows the application to see
+ * directly for itself that the clock is disrupted and take appropriate
+ * action, even when using a vDSO-style method to get the time instead of a
+ * system call.
+ *
+ * In its more advanced mode, this structure can also be used to expose the
+ * precise relationship of the CPU counter to real time, as calibrated by the
+ * host. This means that userspace applications can have accurate time
+ * immediately after live migration, rather than having to pause operations
+ * and wait for NTP to recover. This mode does, of course, rely on the
+ * counter being reliable and consistent across CPUs.
+ *
+ * Note that this must be true UTC, never with smeared leap seconds. If a
+ * guest wishes to construct a smeared clock, it can do so. Presenting a
+ * smeared clock through this interface would be problematic because it
+ * actually messes with the apparent counter *period*. A linear smearing
+ * of 1 ms per second would effectively tweak the counter period by 1000PPM
+ * at the start/end of the smearing period, while a sinusoidal smear would
+ * basically be impossible to represent.
+ */
+
+#ifndef __VMCLOCK_ABI_H__
+#define __VMCLOCK_ABI_H__
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/*
+ * Guest-visible layout of the clock structure described in the overview
+ * comment at the top of this file.
+ */
+struct vmclock_abi {
+    uint32_t magic;
+#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */
+    uint16_t size;    /* Size of page containing this structure */
+    uint16_t version; /* 1 */
+
+    /* Sequence lock. Low bit means an update is in progress. */
+    uint32_t seq_count;
+
+    uint32_t flags;
+    /* Indicates that the tai_offset_sec field is valid */
+#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0)
+    /*
+     * Optionally used to notify guests of pending maintenance events.
+     * A guest may wish to remove itself from service if an event is
+     * coming up. Two flags indicate the rough imminence of the event.
+     */
+#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */
+#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */
+    /* Indicates that the utc_time_maxerror_picosec field is valid */
+#define VMCLOCK_FLAG_UTC_MAXERROR_VALID (1 << 3)
+    /* Indicates counter_period_error_rate_frac_sec is valid */
+#define VMCLOCK_FLAG_PERIOD_ERROR_VALID (1 << 4)
+
+    /*
+     * This field changes to another non-repeating value when the CPU
+     * counter is disrupted, for example on live migration. This lets
+     * the guest know that it should discard any calibration it has
+     * performed of the counter against external sources (NTP/PTP/etc.).
+     */
+    uint64_t disruption_marker;
+
+    uint8_t clock_status;
+#define VMCLOCK_STATUS_UNKNOWN 0
+#define VMCLOCK_STATUS_INITIALIZING 1
+#define VMCLOCK_STATUS_SYNCHRONIZED 2
+#define VMCLOCK_STATUS_FREERUNNING 3
+#define VMCLOCK_STATUS_UNRELIABLE 4
+
+    uint8_t counter_id;
+#define VMCLOCK_COUNTER_INVALID 0
+#define VMCLOCK_COUNTER_X86_TSC 1
+#define VMCLOCK_COUNTER_ARM_VCNT 2
+#define VMCLOCK_COUNTER_X86_ART 3
+
+    /*
+     * By providing the offset from UTC to TAI, the guest can know both
+     * UTC and TAI reliably, whichever is indicated in the time_type
+     * field. Valid if VMCLOCK_FLAG_TAI_OFFSET_VALID is set in flags.
+     */
+    int16_t tai_offset_sec;
+
+    /*
+     * The time exposed through this device is never smeared; if it
+     * claims to be VMCLOCK_TIME_UTC then it MUST be UTC. This field
+     * provides a hint to the guest operating system, such that *if*
+     * the guest OS wants to provide its users with an alternative
+     * clock which does not follow the POSIX CLOCK_REALTIME standard,
+     * it may do so in a fashion consistent with the other systems
+     * in the nearby environment.
+     */
+    uint8_t leap_second_smearing_hint;
+    /* Provide true UTC to users, unsmeared. */
+#define VMCLOCK_SMEARING_NONE 0
+    /*
+     * https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/
+     * From noon on the day before to noon on the day after, smear the
+     * clock by a linear 1/86400s per second.
+     */
+#define VMCLOCK_SMEARING_LINEAR_86400 1
+    /*
+     * draft-kuhn-leapsecond-00
+     * For the 1000s leading up to the leap second, smear the clock by
+     * a linear 1ms per second.
+     */
+#define VMCLOCK_SMEARING_UTC_SLS 2
+
+    /*
+     * What time is exposed in the time_sec/time_frac_sec fields?
+     */
+    uint8_t time_type;
+#define VMCLOCK_TIME_UNKNOWN 0 /* Invalid / no time exposed */
+#define VMCLOCK_TIME_UTC 1 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_TAI 2 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_MONOTONIC 3 /* Since undefined epoch */
+
+    /* Bit shift for counter_period_frac_sec and its error rate */
+    uint8_t counter_period_shift;
+
+    /*
+     * Unlike in NTP, this can indicate a leap second in the past. This
+     * is needed to allow guests to derive an imprecise clock with
+     * smeared leap seconds for themselves, as some modes of smearing
+     * need the adjustments to continue even after the moment at which
+     * the leap second should have occurred.
+     */
+    int8_t leapsecond_direction;
+    uint64_t leapsecond_tai_sec; /* Since 1970-01-01 00:00:00z */
+
+    /*
+     * Paired values of counter and UTC at a given point in time.
+     */
+    uint64_t counter_value;
+    uint64_t time_sec;
+    uint64_t time_frac_sec;
+
+    /*
+     * Counter period, and error margin of same. The unit of these fields
+     * is seconds >> (64 + counter_period_shift)
+     */
+    uint64_t counter_period_frac_sec;
+    uint64_t counter_period_error_rate_frac_sec;
+
+    /* Error margin of UTC reading above (± picoseconds) */
+    uint64_t utc_time_maxerror_picosec;
+};
+
+#endif /* __VMCLOCK_ABI_H__ */
new file mode 100644
@@ -0,0 +1,177 @@
+/*
+ * Virtual Machine Clock Device
+ *
+ * Copyright © 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "hw/i386/e820_memory_layout.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/vmclock.h"
+#include "hw/nvram/fw_cfg.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "migration/vmstate.h"
+#include "sysemu/reset.h"
+
+#include "vmclock-abi.h"
+
+/*
+ * Emit a standalone SSDT describing the VMCLOCK device.
+ *
+ * The table declares device VCLK under \_SB with _HID, _CID and _DDN names,
+ * a _STA method that returns 0xf, and a _CRS containing one read-only,
+ * cacheable QWord memory range of VMCLOCK_SIZE bytes at vms->physaddr.
+ * The AML is appended to @table_data; @linker is handed to acpi_table_end()
+ * and @oem_id is placed in the table header.
+ */
+void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
+                        BIOSLinker *linker, const char *oem_id)
+{
+    AcpiTable table = {
+        .sig = "SSDT",
+        .rev = 1,
+        .oem_id = oem_id,
+        .oem_table_id = "VMCLOCK",
+    };
+    Aml *root, *sb, *vclk, *sta, *local0, *res;
+
+    /* VMCLOCK gets an SSDT of its own */
+    acpi_table_begin(&table, table_data);
+    root = init_aml_allocator();
+
+    sb = aml_scope("\\_SB");
+    vclk = aml_device("VCLK");
+    aml_append(vclk, aml_name_decl("_HID", aml_string("QEMUVCLK")));
+    aml_append(vclk, aml_name_decl("_CID", aml_string("VMCLOCK")));
+    aml_append(vclk, aml_name_decl("_DDN", aml_string("VMCLOCK")));
+
+    /* _STA: store 0xf in Local0 and return it */
+    sta = aml_method("_STA", 0, AML_NOTSERIALIZED);
+    local0 = aml_local(0);
+    aml_append(sta, aml_store(aml_int(0xf), local0));
+    aml_append(sta, aml_return(local0));
+    aml_append(vclk, sta);
+
+    /* _CRS: the single fixed memory range occupied by the clock page */
+    res = aml_resource_template();
+    aml_append(res,
+               aml_qword_memory(AML_POS_DECODE,
+                                AML_MIN_FIXED, AML_MAX_FIXED,
+                                AML_CACHEABLE, AML_READ_ONLY,
+                                0xffffffffffffffffULL,
+                                vms->physaddr,
+                                vms->physaddr + VMCLOCK_SIZE - 1,
+                                0, VMCLOCK_SIZE));
+    aml_append(vclk, aml_name_decl("_CRS", res));
+
+    aml_append(sb, vclk);
+    aml_append(root, sb);
+
+    g_array_append_vals(table_data, root->buf->data, root->buf->len);
+    acpi_table_end(linker, &table);
+    free_aml_allocator();
+}
+
+/*
+ * Signal a clock disruption to the guest by advancing disruption_marker.
+ *
+ * The write is bracketed by the structure's sequence lock: seq_count is
+ * made odd before the marker changes and even again afterwards, with a
+ * write barrier on either side of the marker update.  The shared fields are
+ * stored little-endian (as vmclock_realize() does for magic, size and
+ * version), so they are converted on every access instead of being
+ * modified in host byte order.
+ *
+ * Does nothing if the clock page has not been set up (vms->clk is NULL).
+ */
+static void vmclock_update_guest(VmclockState *vms)
+{
+    uint64_t disruption_marker;
+    uint32_t seq_count;
+
+    if (!vms->clk) {
+        return;
+    }
+
+    /* Low bit set: tell readers that an update is in progress */
+    seq_count = le32_to_cpu(vms->clk->seq_count) | 1;
+    vms->clk->seq_count = cpu_to_le32(seq_count);
+    smp_wmb();
+
+    disruption_marker = le64_to_cpu(vms->clk->disruption_marker);
+    vms->clk->disruption_marker = cpu_to_le64(disruption_marker + 1);
+
+    smp_wmb();
+    /* seq_count is odd here, so adding one clears the low bit again */
+    vms->clk->seq_count = cpu_to_le32(seq_count + 1);
+}
+
+/*
+ * Migration post-load hook.  Once the device state has been restored, call
+ * vmclock_update_guest() so that the disruption_marker seen by the guest
+ * changes.  @version_id is unused; always returns 0.
+ */
+static int vmclock_post_load(void *opaque, int version_id)
+{
+    vmclock_update_guest(opaque);
+
+    return 0;
+}
+
+/*
+ * Migration description for the device: physaddr is the only field listed,
+ * and vmclock_post_load() is installed as the post-load hook.
+ */
+static const VMStateDescription vmstate_vmclock = {
+    .name = "vmclock",
+    .minimum_version_id = 1,
+    .version_id = 1,
+    .fields = (const VMStateField[]) {
+        VMSTATE_UINT64(physaddr, VmclockState),
+        VMSTATE_END_OF_LIST()
+    },
+    .post_load = vmclock_post_load,
+};
+
+/*
+ * Reset handler (registered from vmclock_realize()).  Maps the clock page
+ * into system memory at vms->physaddr unless it is already mapped.
+ */
+static void vmclock_handle_reset(void *opaque)
+{
+    VmclockState *vms = VMCLOCK(opaque);
+
+    if (memory_region_is_mapped(&vms->clk_page)) {
+        return;
+    }
+
+    memory_region_add_subregion_overlap(get_system_memory(), vms->physaddr,
+                                        &vms->clk_page, 0);
+}
+
+/*
+ * Realize callback: create and initialise the guest-visible clock page.
+ *
+ * Sets an error in @errp and returns if find_vmclock_dev() does not resolve
+ * to a device.  Otherwise the page address is fixed at VMCLOCK_ADDR and
+ * reserved in the e820 map, a RAM region is created for it, and the page is
+ * zeroed and given its magic/size/version header.  Mapping the region into
+ * the address space is left to vmclock_handle_reset(), registered here.
+ */
+static void vmclock_realize(DeviceState *dev, Error **errp)
+{
+    VmclockState *vms = VMCLOCK(dev);
+    struct vmclock_abi *clk;
+
+    /*
+     * Given that this function is executing, at least one VMCLOCK device
+     * exists, so a NULL result here means that there are several.
+     */
+    if (find_vmclock_dev() == NULL) {
+        error_setg(errp, "at most one %s device is permitted", TYPE_VMCLOCK);
+        return;
+    }
+
+    vms->physaddr = VMCLOCK_ADDR;
+    e820_add_entry(vms->physaddr, VMCLOCK_SIZE, E820_RESERVED);
+
+    memory_region_init_ram(&vms->clk_page, OBJECT(dev), "vmclock_page",
+                           VMCLOCK_SIZE, &error_abort);
+    memory_region_set_enabled(&vms->clk_page, true);
+
+    clk = memory_region_get_ram_ptr(&vms->clk_page);
+    vms->clk = clk;
+    memset(clk, 0, VMCLOCK_SIZE);
+
+    /* Header fields are written in little-endian byte order */
+    clk->magic = cpu_to_le32(VMCLOCK_MAGIC);
+    clk->size = cpu_to_le16(VMCLOCK_SIZE);
+    clk->version = cpu_to_le16(1);
+
+    /* Already zero after the memset, but spell the defaults out */
+    clk->clock_status = VMCLOCK_STATUS_UNKNOWN;
+    clk->counter_id = VMCLOCK_COUNTER_INVALID;
+    clk->time_type = VMCLOCK_TIME_UNKNOWN;
+
+    qemu_register_reset(vmclock_handle_reset, vms);
+
+    vmclock_update_guest(vms);
+}
+
+/* Property table for the device; it holds only the end-of-list marker. */
+static Property vmclock_device_properties[] = {
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initialiser: installs the realize callback, migration description
+ * and property table, marks the device as not hotpluggable and files it
+ * under the "misc" device category.
+ */
+static void vmclock_device_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+    dc->hotpluggable = false;
+    dc->realize = vmclock_realize;
+    dc->vmsd = &vmstate_vmclock;
+    device_class_set_props(dc, vmclock_device_properties);
+}
+
+/* QOM type record for TYPE_VMCLOCK, a direct child of TYPE_DEVICE. */
+static const TypeInfo vmclock_device_info = {
+    .name = TYPE_VMCLOCK,
+    .parent = TYPE_DEVICE,
+    .class_init = vmclock_device_class_init,
+    .instance_size = sizeof(VmclockState),
+};
+
+/* Registers vmclock_device_info through type_register_static(). */
+static void vmclock_register_types(void)
+{
+ type_register_static(&vmclock_device_info);
+}
+
+/* Hand vmclock_register_types() to the type_init() hook. */
+type_init(vmclock_register_types)
@@ -43,6 +43,7 @@ config PC
select SERIAL_ISA
select ACPI_PCI
select ACPI_VMGENID
+ select ACPI_VMCLOCK
select VIRTIO_PMEM_SUPPORTED
select VIRTIO_MEM_SUPPORTED
select HV_BALLOON_SUPPORTED
@@ -43,6 +43,7 @@
#include "sysemu/tpm.h"
#include "hw/acpi/tpm.h"
#include "hw/acpi/vmgenid.h"
+#include "hw/acpi/vmclock.h"
#include "hw/acpi/erst.h"
#include "hw/acpi/piix4.h"
#include "sysemu/tpm_backend.h"
@@ -2508,7 +2509,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
size_t aml_len = 0;
GArray *tables_blob = tables->table_data;
AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL };
- Object *vmgenid_dev;
+ Object *vmgenid_dev, *vmclock_dev;
char *oem_id;
char *oem_table_id;
@@ -2588,6 +2589,13 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
tables->vmgenid, tables->linker, x86ms->oem_id);
}
+ vmclock_dev = find_vmclock_dev();
+ if (vmclock_dev) {
+ acpi_add_table(table_offsets, tables_blob);
+ vmclock_build_acpi(VMCLOCK(vmclock_dev), tables_blob, tables->linker,
+ x86ms->oem_id);
+ }
+
if (misc.has_hpet) {
acpi_add_table(table_offsets, tables_blob);
build_hpet(tables_blob, tables->linker, x86ms->oem_id,
new file mode 100644
@@ -0,0 +1,34 @@
+#ifndef ACPI_VMCLOCK_H
+#define ACPI_VMCLOCK_H
+
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/qdev-core.h"
+#include "qemu/uuid.h"
+#include "qom/object.h"
+
+/* QOM type name of the device */
+#define TYPE_VMCLOCK "vmclock"
+
+/*
+ * Guest-physical address and size of the clock page; vmclock_realize()
+ * uses these for the e820 reservation and the RAM region.
+ */
+#define VMCLOCK_ADDR 0xfeffb000
+#define VMCLOCK_SIZE 0x1000
+
+OBJECT_DECLARE_SIMPLE_TYPE(VmclockState, VMCLOCK)
+
+struct vmclock_abi;
+
+/* Per-device state of the vmclock device. */
+struct VmclockState {
+ DeviceState parent_obj;
+ /* RAM region backing the clock page; created in vmclock_realize() */
+ MemoryRegion clk_page;
+ /* Address at which clk_page is mapped and which the ACPI _CRS reports */
+ uint64_t physaddr;
+ /* Host pointer to the RAM of clk_page, accessed as struct vmclock_abi */
+ struct vmclock_abi *clk;
+};
+
+/*
+ * Look up the vmclock device by its QOM type name.
+ * Returns NULL unless there is exactly one such device.
+ */
+static inline Object *find_vmclock_dev(void)
+{
+    Object *dev = object_resolve_path_type("", TYPE_VMCLOCK, NULL);
+
+    return dev;
+}
+
+/*
+ * Append an SSDT describing the vmclock device @vms to @table_data.
+ * @linker is passed on to acpi_table_end(); @oem_id is used as the OEM ID
+ * in the table header.
+ */
+void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
+ BIOSLinker *linker, const char *oem_id);
+
+#endif