configurable bounce buffer for dma
juantian committed Nov 7, 2024
1 parent 87b6429 commit 0e6886c
Showing 18 changed files with 327 additions and 93 deletions.
5 changes: 5 additions & 0 deletions Cargo.lock
@@ -1065,6 +1065,7 @@ dependencies = [
"async-trait",
"futures",
"guestmem",
"hvdef",
"inspect",
"scsi_buffers",
"stackfuture",
@@ -2351,6 +2352,7 @@ name = "guestmem"
version = "0.0.0"
dependencies = [
"inspect",
"memory_range",
"pal_event",
"sparse_mmap",
"thiserror",
@@ -4191,6 +4193,7 @@ dependencies = [
"futures",
"guestmem",
"guid",
"hvdef",
"inspect",
"inspect_counters",
"mesh",
@@ -4205,8 +4208,10 @@ dependencies = [
"task_control",
"test_with_tracing",
"thiserror",
"tracelimit",
"tracing",
"user_driver",
"virt_mshv_vtl",
"vmcore",
"zerocopy",
]
32 changes: 24 additions & 8 deletions openhcl/underhill_core/src/nvme_manager.rs
@@ -24,6 +24,7 @@ use thiserror::Error;
use tracing::Instrument;
use user_driver::vfio::VfioDevice;
use user_driver::vfio::VfioDmaBuffer;
use virt_mshv_vtl::UhPartition;
use vm_resource::kind::DiskHandleKind;
use vm_resource::AsyncResolveResource;
use vm_resource::ResourceId;
@@ -83,6 +84,9 @@ impl NvmeManager {
driver_source: &VmTaskDriverSource,
vp_count: u32,
dma_buffer: Arc<dyn VfioDmaBuffer>,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
) -> Self {
let (send, recv) = mesh::channel();
let driver = driver_source.simple();
@@ -91,6 +95,9 @@ impl NvmeManager {
devices: HashMap::new(),
vp_count,
dma_buffer,
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
partition,
};
let task = driver.spawn("nvme-manager", async move { worker.run(recv).await });
Self {
@@ -167,6 +174,9 @@ struct NvmeManagerWorker {
#[inspect(skip)]
dma_buffer: Arc<dyn VfioDmaBuffer>,
vp_count: u32,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
}

impl NvmeManagerWorker {
@@ -238,14 +248,20 @@ impl NvmeManagerWorker {
.await
.map_err(InnerError::Vfio)?;

let driver =
nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;
let driver = nvme_driver::NvmeDriver::new(
&self.driver_source,
self.vp_count,
device,
self.dma_bounce_buffer_pages_per_queue,
self.dma_bounce_buffer_pages_per_io_threshold,
self.partition.clone(),
)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;

entry.insert(driver)
}
123 changes: 82 additions & 41 deletions openhcl/underhill_core/src/worker.rs
@@ -1246,12 +1246,89 @@ async fn new_underhill_vm(

let boot_info = runtime_params.parsed_openhcl_boot();

// Determine if x2apic is supported so that the topology matches
// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

// The amount of memory required by the GET igvm_attest request
let attestation = get_protocol::IGVM_ATTEST_MSG_SHARED_GPA as u64 * hvdef::HV_PAGE_SIZE;

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024;
const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + hvdef::HV_PAGE_SIZE) / hvdef::HV_PAGE_SIZE;
const DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE: u64 = 128;
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(
DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE >= MIN_PER_QUEUE_PAGES,
"not enough room for an ATAPI IO plus a PRP list"
);

const DEFAULT_NVME_DRIVERS: u32 = 8;
let (max_nvme_drivers, dma_bounce_buffer_pages_per_queue, dma_bounce_buffer_pages_per_io_threshold) = dps.general.vtl2_settings.as_ref().map_or(
(DEFAULT_NVME_DRIVERS, DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE, None),
|vtl2_settings| {
let original_dma_bounce_buffer_pages_per_queue = vtl2_settings
.fixed
.dma_bounce_buffer_pages_per_queue
.unwrap_or(DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE);

let dma_bounce_buffer_pages_per_queue = if original_dma_bounce_buffer_pages_per_queue < MIN_PER_QUEUE_PAGES {
tracing::warn!(
"the value of dma_bounce_buffer_pages_per_queue ({}) is less than MIN_PER_QUEUE_PAGES ({})",
original_dma_bounce_buffer_pages_per_queue, MIN_PER_QUEUE_PAGES
);
MIN_PER_QUEUE_PAGES
} else {
original_dma_bounce_buffer_pages_per_queue
};

(
vtl2_settings.fixed.max_nvme_drivers.unwrap_or(DEFAULT_NVME_DRIVERS),
dma_bounce_buffer_pages_per_queue,
vtl2_settings.fixed.dma_bounce_buffer_pages_per_io_threshold,
)
},
);

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024
+ max_nvme_drivers as u64
* processor_topology.vp_count() as u64
* dma_bounce_buffer_pages_per_queue
* hvdef::HV_PAGE_SIZE;
// Determine the amount of shared memory to reserve from VTL0.
let shared_pool_size = match isolation {
#[cfg(guest_arch = "x86_64")]
@@ -1314,45 +1391,6 @@
physical_address_size,
)?;

// Determine if x2apic is supported so that the topology matches
// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

let mut with_vmbus: bool = false;
let mut with_vmbus_relay = false;
if dps.general.vmbus_redirection_enabled {
@@ -1770,6 +1808,9 @@ async fn new_underhill_vm(
&driver_source,
processor_topology.vp_count(),
vfio_dma_buffer(&shared_vis_pages_pool),
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
Some(partition.clone()),
);

resolver.add_async_resolver::<DiskHandleKind, _, NvmeDiskConfig, _>(NvmeDiskResolver::new(
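A minimal standalone sketch of the DMA sizing in the worker.rs hunk above, using the defaults defined there (8 NVMe drivers, 128 bounce-buffer pages per queue, 4 KiB hypervisor pages); the 16-VP topology and the 16-page request are hypothetical values chosen only to show the clamp and the arithmetic, not figures from a real deployment.

```rust
const HV_PAGE_SIZE: u64 = 4096;

fn main() {
    // Smallest per-queue bounce buffer that fits a 128 KiB ATAPI IO plus one
    // page for the PRP list: (128 KiB + 4 KiB) / 4 KiB = 33 pages.
    let min_per_queue_pages = (128 * 1024 + HV_PAGE_SIZE) / HV_PAGE_SIZE;
    assert_eq!(min_per_queue_pages, 33);

    // A too-small VTL2 setting is clamped up to the minimum, mirroring the
    // warn-and-clamp branch in new_underhill_vm.
    let requested_pages_per_queue: u64 = 16; // hypothetical VTL2 setting
    let pages_per_queue = requested_pages_per_queue.max(min_per_queue_pages);
    assert_eq!(pages_per_queue, 33);

    // DMA reservation: 10 MiB baseline plus one bounce buffer per queue, with
    // one queue per VP per NVMe driver.
    let max_nvme_drivers: u64 = 8; // DEFAULT_NVME_DRIVERS
    let vp_count: u64 = 16; // hypothetical topology
    let device_dma = 10 * 1024 * 1024
        + max_nvme_drivers * vp_count * pages_per_queue * HV_PAGE_SIZE;

    // With the default 128 pages per queue this would be 10 MiB + 64 MiB.
    println!("pages_per_queue={pages_per_queue}, device_dma={device_dma} bytes");
}
```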
21 changes: 21 additions & 0 deletions openhcl/virt_mshv_vtl/src/lib.rs
@@ -1774,6 +1774,27 @@ impl UhPartition {
);
}
}

/// Pins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA-backed memory.
/// If a partial failure occurs (i.e., some but not all of the ranges were
/// successfully pinned), the function automatically attempts to unpin the
/// ranges that were pinned. This rollback ensures that no partially pinned
/// state remains, which could otherwise lead to inconsistencies.
pub fn pin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.pin_gpa_ranges(ranges)
}

/// Unpins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA-backed memory.
/// If a partial failure occurs (i.e., some but not all of the ranges were
/// successfully unpinned), the function automatically attempts to re-pin the
/// ranges that were unpinned. This rollback ensures that no partially
/// unpinned state remains, which could otherwise lead to inconsistencies.
pub fn unpin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.unpin_gpa_ranges(ranges)
}
}

#[cfg(guest_arch = "x86_64")]
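The two methods added to UhPartition above are the hook the NVMe bounce-buffer path uses to keep VA-backed guest memory resident during device DMA. A hedged usage sketch, not taken from the commit: dma_with_pinned_ranges and do_dma are hypothetical names, and the error plumbing is simplified to HvError only.

```rust
use std::sync::Arc;

use hvdef::HvError;
use memory_range::MemoryRange;
use virt_mshv_vtl::UhPartition;

fn dma_with_pinned_ranges(
    partition: &Arc<UhPartition>,
    ranges: &[MemoryRange],
    do_dma: impl FnOnce() -> Result<(), HvError>,
) -> Result<(), HvError> {
    // Pin first; on partial failure the callee rolls back, so a single error
    // here means nothing is left pinned.
    partition.pin_gpa_ranges(ranges)?;

    // Issue the transfer while the ranges are guaranteed resident.
    let result = do_dma();

    // Always unpin, even if the IO failed; prefer reporting the IO error.
    let unpinned = partition.unpin_gpa_ranges(ranges);
    result.and(unpinned)
}
```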
8 changes: 7 additions & 1 deletion vm/devices/get/underhill_config/src/lib.rs
@@ -178,8 +178,14 @@ pub struct Vtl2SettingsFixed {
pub scsi_sub_channels: u16,
/// size of the io-uring submission queues
pub io_ring_size: u32,
/// Max bounce buffer pages active per cpu
/// Max bounce buffer pages active per cpu for unaligned IOs
pub max_bounce_buffer_pages: Option<u32>,
/// DMA bounce buffer pages per queue
pub dma_bounce_buffer_pages_per_queue: Option<u64>,
/// Threshold of IO size, in pages, for using the DMA bounce buffer
pub dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
/// Max number of NVMe drivers
pub max_nvme_drivers: Option<u32>,
}

#[derive(Debug, Clone, MeshPayload, Inspect)]
3 changes: 3 additions & 0 deletions vm/devices/get/underhill_config/src/schema/v1.rs
@@ -528,6 +528,9 @@ impl ParseSchema<crate::Vtl2SettingsFixed> for Vtl2SettingsFixed {
scsi_sub_channels: self.scsi_sub_channels.map_or(0, |x| x as u16),
io_ring_size: self.io_ring_size.unwrap_or(256),
max_bounce_buffer_pages: self.max_bounce_buffer_pages,
dma_bounce_buffer_pages_per_queue: self.dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold: self.dma_bounce_buffer_pages_per_io_threshold,
max_nvme_drivers: self.max_nvme_drivers,
})
}
}
@@ -27,6 +27,9 @@ message Vtl2SettingsFixed {
optional uint32 io_ring_size = 2;
// Specify the maximum number of bounce buffer pages allowed per cpu
optional uint32 max_bounce_buffer_pages = 3;
// Number of DMA bounce buffer pages per queue
optional uint64 dma_bounce_buffer_pages_per_queue = 4;
// Threshold of IO size, in pages, for using the DMA bounce buffer
optional uint32 dma_bounce_buffer_pages_per_io_threshold = 5;
// Maximum number of NVMe drivers
optional uint32 max_nvme_drivers = 6;
}

message Vtl2SettingsDynamic {
1 change: 1 addition & 0 deletions vm/devices/storage/disk_backend/Cargo.toml
@@ -8,6 +8,7 @@ rust-version.workspace = true

[dependencies]
scsi_buffers.workspace = true
hvdef.workspace = true

guestmem.workspace = true
vm_resource.workspace = true
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_backend/src/lib.rs
@@ -19,6 +19,7 @@ pub mod sync_wrapper;
pub mod zerodisk;

use guestmem::AccessError;
use hvdef::HvError;
use inspect::Inspect;
use scsi_buffers::RequestBuffers;
use stackfuture::StackFuture;
@@ -50,6 +51,8 @@ pub enum DiskError {
ReservationConflict,
#[error("unsupported eject")]
UnsupportedEject,
#[error("failed to pin/unpin guest memory {0}")]
Hv(HvError),
}

/// Io error details
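A short hedged sketch of how a storage backend might surface a pin failure through the new DiskError::Hv variant; pin_ranges_for_io is a hypothetical helper (disk_backend itself does not depend on virt_mshv_vtl), shown only to illustrate the error mapping.

```rust
use std::sync::Arc;

use disk_backend::DiskError;
use memory_range::MemoryRange;
use virt_mshv_vtl::UhPartition;

fn pin_ranges_for_io(
    partition: &Arc<UhPartition>,
    ranges: &[MemoryRange],
) -> Result<(), DiskError> {
    // A failed pin (after the callee's own rollback) is reported as
    // DiskError::Hv so callers treat it like any other backend IO error.
    partition.pin_gpa_ranges(ranges).map_err(DiskError::Hv)
}
```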
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml
@@ -16,6 +16,8 @@ task_control.workspace = true
user_driver.workspace = true
guestmem.workspace = true
vmcore.workspace = true
virt_mshv_vtl.workspace = true
hvdef.workspace = true

anyhow.workspace = true
event-listener.workspace = true
@@ -25,6 +27,7 @@ safeatomic.workspace = true
slab.workspace = true
thiserror.workspace = true
tracing.workspace = true
tracelimit.workspace = true
zerocopy.workspace = true

[dev-dependencies]