Support DDA on VA-backed OHCL VM: Configurable Bounce Buffer for DMA #275

Open · wants to merge 2 commits into main
5 changes: 5 additions & 0 deletions Cargo.lock
@@ -1065,6 +1065,7 @@ dependencies = [
"async-trait",
"futures",
"guestmem",
"hvdef",
"inspect",
"scsi_buffers",
"stackfuture",
@@ -2352,6 +2353,7 @@ name = "guestmem"
version = "0.0.0"
dependencies = [
"inspect",
"memory_range",
"pal_event",
"sparse_mmap",
"thiserror 2.0.0",
@@ -4193,6 +4195,7 @@ dependencies = [
"futures",
"guestmem",
"guid",
"hvdef",
"inspect",
"inspect_counters",
"mesh",
@@ -4207,8 +4210,10 @@ dependencies = [
"task_control",
"test_with_tracing",
"thiserror 2.0.0",
"tracelimit",
"tracing",
"user_driver",
"virt_mshv_vtl",
"vmcore",
"zerocopy",
]
32 changes: 24 additions & 8 deletions openhcl/underhill_core/src/nvme_manager.rs
@@ -24,6 +24,7 @@ use thiserror::Error;
use tracing::Instrument;
use user_driver::vfio::VfioDevice;
use user_driver::vfio::VfioDmaBuffer;
use virt_mshv_vtl::UhPartition;
use vm_resource::kind::DiskHandleKind;
use vm_resource::AsyncResolveResource;
use vm_resource::ResourceId;
@@ -83,6 +84,9 @@ impl NvmeManager {
driver_source: &VmTaskDriverSource,
vp_count: u32,
dma_buffer: Arc<dyn VfioDmaBuffer>,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
) -> Self {
let (send, recv) = mesh::channel();
let driver = driver_source.simple();
@@ -91,6 +95,9 @@
devices: HashMap::new(),
vp_count,
dma_buffer,
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
partition,
};
let task = driver.spawn("nvme-manager", async move { worker.run(recv).await });
Self {
@@ -167,6 +174,9 @@ struct NvmeManagerWorker {
#[inspect(skip)]
dma_buffer: Arc<dyn VfioDmaBuffer>,
vp_count: u32,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
}

impl NvmeManagerWorker {
@@ -238,14 +248,20 @@ impl NvmeManagerWorker {
.await
.map_err(InnerError::Vfio)?;

let driver =
nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;
let driver = nvme_driver::NvmeDriver::new(
&self.driver_source,
self.vp_count,
device,
self.dma_bounce_buffer_pages_per_queue,
self.dma_bounce_buffer_pages_per_io_threshold,
self.partition.clone(),
)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;

entry.insert(driver)
}
123 changes: 82 additions & 41 deletions openhcl/underhill_core/src/worker.rs
@@ -1207,12 +1207,89 @@ async fn new_underhill_vm(

let boot_info = runtime_params.parsed_openhcl_boot();

Contributor: Are these changes indeed part of your commit? Or stale from a fork?

Author: I had to move this code up so we can calculate device_dma using processor_topology.vp_count() below.

// Determine if x2apic is supported so that the topology matches
// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
Contributor: FYI, this will conflict with Eric's recent change that refactors this crate.
& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

// The amount of memory required by the GET igvm_attest request
let attestation = get_protocol::IGVM_ATTEST_MSG_SHARED_GPA as u64 * hvdef::HV_PAGE_SIZE;

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024;
const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + hvdef::HV_PAGE_SIZE) / hvdef::HV_PAGE_SIZE;
const DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE: u64 = 128;
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(
DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE >= MIN_PER_QUEUE_PAGES,
"not enough room for an ATAPI IO plus a PRP list"
);
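// Rough arithmetic behind the assertion, assuming 4 KiB pages (hvdef::HV_PAGE_SIZE == 4096):
// MIN_PER_QUEUE_PAGES = (128 * 1024 + 4096) / 4096 = 33 pages, so the default of
// 128 pages per queue clears the bound with plenty of headroom.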

const DEFAULT_NVME_DRIVERS: u32 = 8;
let (max_nvme_drivers, dma_bounce_buffer_pages_per_queue, dma_bounce_buffer_pages_per_io_threshold) = dps.general.vtl2_settings.as_ref().map_or(
(DEFAULT_NVME_DRIVERS, DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE, None),
|vtl2_settings| {
let original_dma_bounce_buffer_pages_per_queue = vtl2_settings
.fixed
.dma_bounce_buffer_pages_per_queue
.unwrap_or(DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE);

let dma_bounce_buffer_pages_per_queue = if original_dma_bounce_buffer_pages_per_queue < MIN_PER_QUEUE_PAGES {
tracing::warn!(
"the value of dma_bounce_buffer_pages_per_queue ({}) is less than MIN_PER_QUEUE_PAGES ({})",
original_dma_bounce_buffer_pages_per_queue, MIN_PER_QUEUE_PAGES
);
MIN_PER_QUEUE_PAGES
} else {
original_dma_bounce_buffer_pages_per_queue
};

(
vtl2_settings.fixed.max_nvme_drivers.unwrap_or(DEFAULT_NVME_DRIVERS),
dma_bounce_buffer_pages_per_queue,
vtl2_settings.fixed.dma_bounce_buffer_pages_per_io_threshold,
)
},
);

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024
Contributor: Can you give a rough estimate of how much memory this will use? I suspect that John's comments obviate this discussion, but it does seem like this is inefficient: we're allocating (what I assume is) a lot of memory for devices that may not be using it.

+ max_nvme_drivers as u64
* processor_topology.vp_count() as u64
* dma_bounce_buffer_pages_per_queue
* hvdef::HV_PAGE_SIZE;
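// A rough sense of scale for the reviewer's question above (illustrative figures only:
// the defaults of 8 NVMe drivers and 128 pages per queue, 4 KiB pages, and a hypothetical
// 64-VP guest; the real VP count comes from processor_topology at runtime):
//   8 * 64 * 128 * 4096 bytes = 268,435,456 bytes = 256 MiB of bounce-buffer DMA,
//   on top of the 10 MiB base estimate.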
// Determine the amount of shared memory to reserve from VTL0.
let shared_pool_size = match isolation {
#[cfg(guest_arch = "x86_64")]
Expand Down Expand Up @@ -1275,45 +1352,6 @@ async fn new_underhill_vm(
physical_address_size,
)?;

// Determine if x2apic is supported so that the topology matches
// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

let mut with_vmbus: bool = false;
let mut with_vmbus_relay = false;
if dps.general.vmbus_redirection_enabled {
@@ -1737,6 +1775,9 @@ async fn new_underhill_vm(
&driver_source,
processor_topology.vp_count(),
vfio_dma_buffer(&shared_vis_pages_pool),
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
Some(partition.clone()),
);

resolver.add_async_resolver::<DiskHandleKind, _, NvmeDiskConfig, _>(NvmeDiskResolver::new(
21 changes: 21 additions & 0 deletions openhcl/virt_mshv_vtl/src/lib.rs
@@ -1863,6 +1863,27 @@ impl UhPartition {
);
}
}

/// Pins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA backed memory.
/// If a partial failure occurs (i.e., some but not all the ranges were successfully pinned),
/// the function will automatically attempt to unpin any successfully pinned ranges.
/// This "rollback" behavior ensures that no partially pinned state remains, which
/// could otherwise lead to inconsistencies.
///
pub fn pin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.pin_gpa_ranges(ranges)
}

/// Unpins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA backed memory.
/// If a partial failure occurs (i.e., some but not all the ranges were successfully unpinned),
/// the function will automatically attempt to pin any successfully unpinned ranges. This "rollback"
/// behavior ensures that no partially unpinned state remains, which could otherwise lead to inconsistencies.
///
pub fn unpin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.unpin_gpa_ranges(ranges)
}
}
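A minimal usage sketch of the new pinning API (hypothetical caller code, not from this PR; `partition`, `gpa`, and `len` are assumed to be in scope and to describe a page-aligned range, and the error is surfaced through the DiskError::Hv variant added to disk_backend later in this diff):

let ranges = [MemoryRange::new(gpa..gpa + len)];
partition.pin_gpa_ranges(&ranges).map_err(DiskError::Hv)?;
// ... issue the DMA against the pinned guest physical addresses ...
if let Err(err) = partition.unpin_gpa_ranges(&ranges) {
    // Both pin and unpin roll back internally on partial failure, so a late error
    // here leaves no half-unpinned state; log it and move on.
    tracing::warn!(?err, "failed to unpin guest memory");
}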

#[cfg(guest_arch = "x86_64")]
8 changes: 7 additions & 1 deletion vm/devices/get/underhill_config/src/lib.rs
@@ -178,8 +178,14 @@ pub struct Vtl2SettingsFixed {
pub scsi_sub_channels: u16,
/// size of the io-uring submission queues
pub io_ring_size: u32,
/// Max bounce buffer pages active per cpu
/// Max bounce buffer pages active per cpu for unaligned IOs
pub max_bounce_buffer_pages: Option<u32>,
/// DMA bounce buffer pages per queue
pub dma_bounce_buffer_pages_per_queue: Option<u64>,
/// Threshold of IO size in pages to use the bounce buffer
pub dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
/// Max NVMe drivers
pub max_nvme_drivers: Option<u32>,
}

#[derive(Debug, Clone, MeshPayload, Inspect)]
Expand Down
3 changes: 3 additions & 0 deletions vm/devices/get/underhill_config/src/schema/v1.rs
@@ -528,6 +528,9 @@ impl ParseSchema<crate::Vtl2SettingsFixed> for Vtl2SettingsFixed {
scsi_sub_channels: self.scsi_sub_channels.map_or(0, |x| x as u16),
io_ring_size: self.io_ring_size.unwrap_or(256),
max_bounce_buffer_pages: self.max_bounce_buffer_pages,
dma_bounce_buffer_pages_per_queue: self.dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold: self.dma_bounce_buffer_pages_per_io_threshold,
max_nvme_drivers: self.max_nvme_drivers,
})
}
}
@@ -27,6 +27,9 @@ message Vtl2SettingsFixed {
optional uint32 io_ring_size = 2;
// Specify the maximum number of bounce buffer pages allowed per cpu
optional uint32 max_bounce_buffer_pages = 3;
// DMA bounce buffer pages per queue
optional uint64 dma_bounce_buffer_pages_per_queue = 4;
// Threshold of IO size in pages to use the bounce buffer
optional uint32 dma_bounce_buffer_pages_per_io_threshold = 5;
// Maximum number of NVMe drivers
optional uint32 max_nvme_drivers = 6;
Contributor: Again a moot point, but I want to get in the habit of asking: we need a test that ensures that not passing in these values results in acceptable behavior. I see the code handling it, but since this is our API surface, we need to be certain we aren't regressing it. Both JSON (until we delete ...) and PB.

}
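One way to cover part of that concern (a sketch only: the helper below is hypothetical and mirrors the clamping logic from worker.rs above, rather than exercising the real JSON or protobuf parsers):

const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + 4096) / 4096; // 33, assuming 4 KiB pages
const DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE: u64 = 128;

fn effective_bounce_pages_per_queue(setting: Option<u64>) -> u64 {
    // Missing settings fall back to the default; undersized settings are clamped up.
    setting
        .unwrap_or(DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE)
        .max(MIN_PER_QUEUE_PAGES)
}

#[test]
fn omitted_or_undersized_settings_stay_in_bounds() {
    assert_eq!(effective_bounce_pages_per_queue(None), 128);
    assert_eq!(effective_bounce_pages_per_queue(Some(8)), 33); // clamped, with a warning in the real code
}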

message Vtl2SettingsDynamic {
1 change: 1 addition & 0 deletions vm/devices/storage/disk_backend/Cargo.toml
@@ -8,6 +8,7 @@ rust-version.workspace = true

[dependencies]
scsi_buffers.workspace = true
hvdef.workspace = true

guestmem.workspace = true
vm_resource.workspace = true
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_backend/src/lib.rs
@@ -19,6 +19,7 @@ pub mod sync_wrapper;
pub mod zerodisk;

use guestmem::AccessError;
use hvdef::HvError;
use inspect::Inspect;
use scsi_buffers::RequestBuffers;
use stackfuture::StackFuture;
@@ -50,6 +51,8 @@ pub enum DiskError {
ReservationConflict,
#[error("unsupported eject")]
UnsupportedEject,
#[error("failed to pin/unpin guest memory {0}")]
Hv(HvError),
Contributor: Why not just "PinFailure" instead of Hv?

}
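For context, a sketch of how a backend might surface and log the new variant (a hypothetical helper; the real call sites are in the nvme_driver changes, which this hunk doesn't show):

fn log_disk_result(result: Result<(), DiskError>) {
    match result {
        // Pin/unpin failures carry the hypervisor error code for diagnostics.
        Err(DiskError::Hv(err)) => tracing::error!(?err, "failed to pin/unpin guest memory"),
        Err(other) => tracing::error!(error = %other, "disk io failed"),
        Ok(()) => {}
    }
}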

/// Io error details
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml
@@ -16,6 +16,8 @@ task_control.workspace = true
user_driver.workspace = true
guestmem.workspace = true
vmcore.workspace = true
virt_mshv_vtl.workspace = true
hvdef.workspace = true

anyhow.workspace = true
event-listener.workspace = true
@@ -25,6 +27,7 @@ safeatomic.workspace = true
slab.workspace = true
thiserror.workspace = true
tracing.workspace = true
tracelimit.workspace = true
zerocopy.workspace = true

[dev-dependencies]