// devices/vfio.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::collections::HashMap;
6use std::ffi::CString;
7use std::fs::File;
8use std::fs::OpenOptions;
9use std::io;
10use std::mem;
11use std::os::raw::c_ulong;
12use std::os::unix::prelude::FileExt;
13use std::path::Path;
14use std::path::PathBuf;
15#[cfg(all(target_os = "android", target_arch = "aarch64"))]
16use std::ptr::addr_of_mut;
17use std::result;
18use std::slice;
19use std::sync::Arc;
20use std::sync::OnceLock;
21
22use base::error;
23use base::ioctl;
24use base::ioctl_with_mut_ptr;
25use base::ioctl_with_mut_ref;
26use base::ioctl_with_ptr;
27use base::ioctl_with_ref;
28use base::ioctl_with_val;
29use base::warn;
30use base::AsRawDescriptor;
31use base::Error;
32use base::Event;
33use base::FromRawDescriptor;
34use base::RawDescriptor;
35use base::SafeDescriptor;
36use cfg_if::cfg_if;
37use data_model::vec_with_array_field;
38use hypervisor::DeviceKind;
39use hypervisor::Vm;
40use rand::seq::index::sample;
41use remain::sorted;
42use resources::address_allocator::AddressAllocator;
43use resources::AddressRange;
44use resources::Alloc;
45use resources::Error as ResourcesError;
46use sync::Mutex;
47use thiserror::Error;
48use vfio_sys::vfio::vfio_acpi_dsm;
49use vfio_sys::vfio::VFIO_IRQ_SET_DATA_BOOL;
50use vfio_sys::*;
51use zerocopy::FromBytes;
52use zerocopy::Immutable;
53use zerocopy::IntoBytes;
54
55use crate::IommuDevType;
56
/// Errors that can arise while setting up or operating VFIO containers,
/// groups, and devices.
///
/// Variants are kept in alphabetical order, enforced by `#[sorted]`.
#[sorted]
#[derive(Error, Debug)]
pub enum VfioError {
    #[error("failed to duplicate VfioContainer")]
    ContainerDupError,
    #[error("failed to set container's IOMMU driver type as {0:?}: {1}")]
    ContainerSetIOMMU(IommuType, Error),
    #[error("failed to create KVM vfio device")]
    CreateVfioKvmDevice,
    #[error("failed to get Group Status: {0}")]
    GetGroupStatus(Error),
    #[error("failed to get vfio device fd: {0}")]
    GroupGetDeviceFD(Error),
    #[error("failed to add vfio group into vfio container: {0}")]
    GroupSetContainer(Error),
    #[error("group is inviable")]
    GroupViable,
    #[error("invalid region index: {0}")]
    InvalidIndex(usize),
    #[error("invalid operation")]
    InvalidOperation,
    #[error("invalid file path")]
    InvalidPath,
    #[error("failed to add guest memory map into iommu table: {0}")]
    IommuDmaMap(Error),
    #[error("failed to remove guest memory map from iommu table: {0}")]
    IommuDmaUnmap(Error),
    #[error("failed to get IOMMU cap info from host")]
    IommuGetCapInfo,
    #[error("failed to get IOMMU info from host: {0}")]
    IommuGetInfo(Error),
    #[error("failed to attach device to pKVM pvIOMMU: {0}")]
    KvmPviommuSetConfig(Error),
    #[error("failed to set KVM vfio device's attribute: {0}")]
    KvmSetDeviceAttr(Error),
    #[error("AddressAllocator is unavailable")]
    NoRescAlloc,
    #[error("failed to open /dev/vfio/vfio container: {0}")]
    OpenContainer(io::Error),
    #[error("failed to open {1} group: {0}")]
    OpenGroup(io::Error, String),
    #[error("failed to read {1} link: {0}")]
    ReadLink(io::Error, PathBuf),
    #[error("resources error: {0}")]
    Resources(ResourcesError),
    #[error("unknown vfio device type (flags: {0:#x})")]
    UnknownDeviceType(u32),
    #[error("failed to call vfio device's ACPI _DSM: {0}")]
    VfioAcpiDsm(Error),
    #[error("failed to disable vfio device's acpi notification: {0}")]
    VfioAcpiNotificationDisable(Error),
    #[error("failed to enable vfio device's acpi notification: {0}")]
    VfioAcpiNotificationEnable(Error),
    #[error("failed to test vfio device's acpi notification: {0}")]
    VfioAcpiNotificationTest(Error),
    #[error(
        "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
    )]
    VfioApiVersion,
    #[error("failed to get vfio device's info or info doesn't match: {0}")]
    VfioDeviceGetInfo(Error),
    #[error("failed to get vfio device's region info: {0}")]
    VfioDeviceGetRegionInfo(Error),
    #[error("container doesn't support IOMMU driver type {0:?}")]
    VfioIommuSupport(IommuType),
    #[error("failed to disable vfio device's irq: {0}")]
    VfioIrqDisable(Error),
    #[error("failed to enable vfio device's irq: {0}")]
    VfioIrqEnable(Error),
    #[error("failed to mask vfio device's irq: {0}")]
    VfioIrqMask(Error),
    #[error("failed to unmask vfio device's irq: {0}")]
    VfioIrqUnmask(Error),
    #[error("failed to enter vfio device's low power state: {0}")]
    VfioPmLowPowerEnter(Error),
    #[error("failed to exit vfio device's low power state: {0}")]
    VfioPmLowPowerExit(Error),
}
135
/// Module-local result type specialized to [`VfioError`].
type Result<T> = std::result::Result<T, VfioError>;

/// Returns the most recent OS error (errno) as a `base::Error`; used to
/// annotate failed ioctl calls throughout this module.
fn get_error() -> Error {
    Error::last()
}
141
142static KVM_VFIO_FILE: OnceLock<Option<SafeDescriptor>> = OnceLock::new();
143
144fn create_kvm_vfio_file(vm: &impl Vm) -> Option<&'static SafeDescriptor> {
145    KVM_VFIO_FILE
146        .get_or_init(|| vm.create_device(DeviceKind::Vfio).ok())
147        .as_ref()
148}
149
150fn kvm_vfio_file() -> Option<&'static SafeDescriptor> {
151    match KVM_VFIO_FILE.get() {
152        Some(Some(v)) => Some(v),
153        _ => None,
154    }
155}
156
/// The kind of bus interface a VFIO device exposes.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    /// A PCI(e) endpoint (vfio-pci).
    Pci,
    /// A platform-bus device (vfio-platform).
    Platform,
}
162
/// Operation to perform on the KVM VFIO pseudo device when registering or
/// unregistering a VFIO group (see `VfioGroup::kvm_device_set_group`).
enum KvmVfioGroupOps {
    /// Register the group with the KVM VFIO device (KVM_DEV_VFIO_GROUP_ADD).
    Add,
    /// Unregister the group (KVM_DEV_VFIO_GROUP_DEL).
    Delete,
}
167
/// Handle to a pKVM pvIOMMU instance obtained from the KVM VFIO pseudo
/// device. Its methods are only implemented on Android/aarch64 and panic
/// with `unimplemented!` elsewhere.
#[derive(Debug)]
pub struct KvmVfioPviommu {
    // Descriptor returned by the KVM_DEV_VFIO_PVIOMMU_ATTACH attribute.
    file: File,
}
172
impl KvmVfioPviommu {
    /// Creates a new pvIOMMU instance by attaching to the KVM VFIO pseudo
    /// device.
    ///
    /// Only implemented on Android/aarch64; other targets panic via
    /// `unimplemented!`.
    pub fn new(vm: &impl Vm) -> Result<Self> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let file = Self::ioctl_kvm_dev_vfio_pviommu_attach(vm)?;

                Ok(Self { file })
            } else {
                let _ = vm;
                unimplemented!()
            }
        }
    }

    /// Attaches `device` to this pvIOMMU, mapping the device's stream ID at
    /// index `sid_idx` to virtual stream ID `vsid`.
    ///
    /// Only implemented on Android/aarch64; other targets panic via
    /// `unimplemented!`.
    pub fn attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                self.ioctl_kvm_pviommu_set_config(device, sid_idx, vsid)
            } else {
                let _ = device;
                let _ = sid_idx;
                let _ = vsid;
                unimplemented!()
            }
        }
    }

    /// Returns the identifier guests use to name this pvIOMMU.
    ///
    /// Panics if the raw descriptor is negative (cannot fit in `u32`).
    pub fn id(&self) -> u32 {
        let fd = self.as_raw_descriptor();
        // Guests identify pvIOMMUs to the hypervisor using the corresponding VMM FDs.
        fd.try_into().unwrap()
    }

    /// Queries the number of stream IDs (SIDs) exposed by `device`.
    ///
    /// Only implemented on Android/aarch64; other targets panic via
    /// `unimplemented!`.
    pub fn get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let info = Self::ioctl_kvm_dev_vfio_pviommu_get_info(vm, device)?;

                Ok(info.nr_sids)
            } else {
                let _ = vm;
                let _ = device;
                unimplemented!()
            }
        }
    }

    /// Issues KVM_SET_DEVICE_ATTR with KVM_DEV_VFIO_PVIOMMU_ATTACH.
    ///
    /// On success the ioctl returns a non-negative value that is a new
    /// pvIOMMU file descriptor (not 0), which we take ownership of.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File> {
        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_ATTACH as u64,
            addr: 0,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            // SAFETY: Safe as we verify the return value.
            Ok(unsafe { File::from_raw_descriptor(ret) })
        }
    }

    /// Issues KVM_PVIOMMU_SET_CONFIG on this pvIOMMU fd to map `device`'s
    /// SID at `sid_idx` to `vsid`.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>(
        &self,
        device: &T,
        sid_idx: u32,
        vsid: u32,
    ) -> Result<()> {
        let config = kvm_sys::kvm_vfio_iommu_config {
            size: mem::size_of::<kvm_sys::kvm_vfio_iommu_config>() as u32,
            device_fd: device.as_raw_descriptor(),
            sid_idx,
            vsid,
            __reserved: 0,
        };

        // SAFETY:
        // Safe as we are the owner of device and config which are valid, and we verify the return
        // value.
        let ret = unsafe { ioctl_with_ref(self, kvm_sys::KVM_PVIOMMU_SET_CONFIG, &config) };

        if ret < 0 {
            Err(VfioError::KvmPviommuSetConfig(get_error()))
        } else {
            Ok(())
        }
    }

    /// Issues KVM_SET_DEVICE_ATTR with KVM_DEV_VFIO_PVIOMMU_GET_INFO; the
    /// kernel fills `info` (notably `nr_sids`) through the `addr` pointer.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>(
        vm: &impl Vm,
        device: &T,
    ) -> Result<kvm_sys::kvm_vfio_iommu_info> {
        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;

        let mut info = kvm_sys::kvm_vfio_iommu_info {
            size: mem::size_of::<kvm_sys::kvm_vfio_iommu_info>() as u32,
            device_fd: device.as_raw_descriptor(),
            nr_sids: 0,
            __reserved: 0,
        };

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_GET_INFO as u64,
            // `info` stays alive (and writable) for the duration of the ioctl.
            addr: addr_of_mut!(info) as usize as u64,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            Ok(info)
        }
    }
}
304
impl AsRawDescriptor for KvmVfioPviommu {
    /// Exposes the underlying pvIOMMU fd so it can be used in ioctls.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.file.as_raw_descriptor()
    }
}
310
/// IOMMU backend driver types a VFIO container can be configured with.
/// Discriminant values are the kernel's VFIO extension IDs passed to
/// VFIO_CHECK_EXTENSION / VFIO_SET_IOMMU.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum IommuType {
    /// Upstream vfio_iommu_type1 v2 driver.
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    /// pKVM pvIOMMU backend.
    PkvmPviommu = VFIO_PKVM_PVIOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
323
/// VfioContainer contain multi VfioGroup, and delegate an IOMMU domain table
pub struct VfioContainer {
    // Open handle to /dev/vfio/vfio.
    container: File,
    // Groups added to this container, keyed by IOMMU group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
    // Set once an IOMMU driver has been selected via VFIO_SET_IOMMU;
    // DMA map/unmap operations panic if this is still `None`.
    iommu_type: Option<IommuType>,
}
330
331fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T>
332where
333    T: FromBytes,
334{
335    Some(T::read_from_prefix(bytes.get(offset..)?).ok()?.0)
336}
337
/// VFIO API version this module was written against; the kernel must report
/// the same value from VFIO_GET_API_VERSION.
const VFIO_API_VERSION: u8 = 0;
impl VfioContainer {
    /// Opens `/dev/vfio/vfio` and wraps it in a new, empty container.
    pub fn new() -> Result<Self> {
        let container = OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/vfio/vfio")
            .map_err(VfioError::OpenContainer)?;

        Self::new_from_container(container)
    }

    // Construct a VfioContainer from an existing container file, verifying
    // that the kernel's VFIO API version matches VFIO_API_VERSION.
    pub fn new_from_container(container: File) -> Result<Self> {
        // SAFETY:
        // Safe as file is vfio container descriptor and ioctl is defined by kernel.
        let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION) };
        // NOTE(review): a negative (error) return from the ioctl also fails
        // this check after the `as u8` cast — confirm that is intentional.
        if version as u8 != VFIO_API_VERSION {
            return Err(VfioError::VfioApiVersion);
        }

        Ok(VfioContainer {
            container,
            groups: HashMap::new(),
            iommu_type: None,
        })
    }

    /// Returns whether the group with `group_id` is already in this container.
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.contains_key(&group_id)
    }

    /// Asks the kernel (VFIO_CHECK_EXTENSION) whether this container supports
    /// the IOMMU driver `val`.
    fn check_extension(&self, val: IommuType) -> bool {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION, val as c_ulong) };
        ret != 0
    }

    /// Selects IOMMU driver `val` for this container (VFIO_SET_IOMMU) and
    /// returns the raw ioctl result (0 on success).
    fn set_iommu(&mut self, val: IommuType) -> i32 {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU, val as c_ulong) }
    }

    /// Checks support for IOMMU driver `val`, selects it, and records the
    /// chosen type in `self.iommu_type` on success.
    fn set_iommu_checked(&mut self, val: IommuType) -> Result<()> {
        if !self.check_extension(val) {
            Err(VfioError::VfioIommuSupport(val))
        } else if self.set_iommu(val) != 0 {
            Err(VfioError::ContainerSetIOMMU(val, get_error()))
        } else {
            self.iommu_type = Some(val);
            Ok(())
        }
    }

    /// Maps `size` bytes of the VMM's address space at `user_addr` into the
    /// container's IO virtual address space at `iova`, read-only unless
    /// `write_en` is set. Panics if called before an IOMMU driver has been
    /// configured; returns `InvalidOperation` for the pvIOMMU backend.
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_map called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_map(iova, size, user_addr, write_en)
            }
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of [`Self::vfio_dma_map`] (VFIO_IOMMU_MAP_DMA).
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    unsafe fn vfio_iommu_type1_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        let mut dma_map = vfio_iommu_type1_dma_map {
            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
            flags: VFIO_DMA_MAP_FLAG_READ,
            vaddr: user_addr,
            iova,
            size,
        };

        if write_en {
            dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
        }

        // No `unsafe` block needed: the whole body of this `unsafe fn` is an
        // unsafe context; the safety obligation is on the caller (see above).
        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA, &dma_map);
        if ret != 0 {
            return Err(VfioError::IommuDmaMap(get_error()));
        }

        Ok(())
    }

    /// Removes the mapping previously established at `iova` covering `size`
    /// bytes. Panics if called before an IOMMU driver has been configured;
    /// returns `InvalidOperation` for the pvIOMMU backend.
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_unmap called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_unmap(iova, size)
            }
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of [`Self::vfio_dma_unmap`] (VFIO_IOMMU_UNMAP_DMA).
    /// Fails if the kernel reports an unmapped size different from `size`.
    fn vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
            flags: 0,
            iova,
            size,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, dma_unmap is constructed by us, and
        // we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA, &mut dma_unmap) };
        if ret != 0 || dma_unmap.size != size {
            return Err(VfioError::IommuDmaUnmap(get_error()));
        }

        Ok(())
    }

    /// Returns the bitmask of IOMMU page sizes supported by this container,
    /// or 0 for the pvIOMMU backend. Panics if called before an IOMMU driver
    /// has been configured.
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        match self
            .iommu_type
            .expect("vfio_get_iommu_page_size_mask called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iommu_page_size_mask()
            }
            IommuType::PkvmPviommu => Ok(0),
        }
    }

    /// Type1 implementation of [`Self::vfio_get_iommu_page_size_mask`]
    /// (VFIO_IOMMU_GET_INFO, `iova_pgsizes` field).
    fn vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64> {
        let mut iommu_info = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info) };
        if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        Ok(iommu_info.iova_pgsizes)
    }

    /// Returns the valid IOVA ranges reported by the IOMMU, or an empty list
    /// for the pvIOMMU backend. Panics if called before an IOMMU driver has
    /// been configured.
    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        match self
            .iommu_type
            .expect("vfio_iommu_iova_get_iova_ranges called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iova_ranges()
            }
            IommuType::PkvmPviommu => Ok(Vec::new()),
        }
    }

    /// Type1 implementation of [`Self::vfio_iommu_iova_get_iova_ranges`]:
    /// issues VFIO_IOMMU_GET_INFO twice (first to size the buffer, then to
    /// fetch the capability chain) and walks the chain looking for
    /// VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE.
    fn vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        // Query the buffer size needed to fetch the capabilities.
        let mut iommu_info_argsz = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info_argsz has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info_argsz) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Allocate a buffer large enough for the struct plus the trailing
        // capability data (argsz reported by the kernel above).
        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
        );
        iommu_info[0].argsz = iommu_info_argsz.argsz;
        let ret =
            // SAFETY:
            // Safe as file is vfio container, iommu_info has valid values,
            // and we check the return value
            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO, iommu_info.as_mut_ptr()) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        // SAFETY:
        // Safe because we initialized iommu_info with enough space, u8 has less strict
        // alignment, and since it will no longer be mutated.
        let info_bytes = unsafe {
            std::slice::from_raw_parts(
                iommu_info.as_ptr() as *const u8,
                iommu_info_argsz.argsz as usize,
            )
        };

        if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Walk the capability chain; `next == 0` terminates it.
        let mut offset = iommu_info[0].cap_offset as usize;
        while offset != 0 {
            let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset)
                .ok_or(VfioError::IommuGetCapInfo)?;

            if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
                let iova_header =
                    extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
                        info_bytes, offset,
                    )
                    .ok_or(VfioError::IommuGetCapInfo)?;
                // The per-range records follow immediately after the fixed
                // part of the capability structure.
                let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
                let mut ret = Vec::new();
                for i in 0..iova_header.nr_iovas {
                    ret.push(
                        extract_vfio_struct::<vfio_iova_range>(
                            info_bytes,
                            range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
                        )
                        .ok_or(VfioError::IommuGetCapInfo)?,
                    );
                }
                return Ok(ret
                    .iter()
                    .map(|range| AddressRange {
                        start: range.start,
                        end: range.end,
                    })
                    .collect());
            }
            offset = header.next as usize;
        }

        // Capability chain exhausted without finding an IOVA-range entry.
        Err(VfioError::IommuGetCapInfo)
    }

    /// Picks and configures the container's IOMMU driver appropriate for the
    /// kind of IOMMU device in front of it.
    fn set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()> {
        match iommu_dev {
            IommuDevType::CoIommu | IommuDevType::VirtioIommu => {
                // If we expect granular, dynamic mappings, try the ChromeOS Type1ChromeOS first,
                // then fall back to upstream versions.
                self.set_iommu_checked(IommuType::Type1ChromeOS)
                    .or_else(|_| self.set_iommu_checked(IommuType::Type1V2))
            }
            IommuDevType::NoIommu => self.set_iommu_checked(IommuType::Type1V2),
            IommuDevType::PkvmPviommu => self.set_iommu_checked(IommuType::PkvmPviommu),
        }
    }

    /// Returns the group `id`, creating and registering it (with the
    /// container, the per-container IOMMU setup, and the KVM VFIO device) if
    /// this is its first use.
    fn get_group_with_vm(
        &mut self,
        id: u32,
        vm: &impl Vm,
        iommu_dev: IommuDevType,
    ) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
        if self.groups.is_empty() {
            self.set_iommu_from(iommu_dev)?;
            // Before the first group is added into container, do once per container
            // initialization. Both coiommu and virtio-iommu rely on small, dynamic
            // mappings. However, if an iommu is not enabled, then we map the entirety
            // of guest memory as a small number of large, static mappings.
            match iommu_dev {
                IommuDevType::CoIommu | IommuDevType::PkvmPviommu | IommuDevType::VirtioIommu => {}
                IommuDevType::NoIommu => {
                    for region in vm.get_memory().regions() {
                        // SAFETY:
                        // Safe because the guest regions are guaranteed not to overlap
                        unsafe {
                            self.vfio_dma_map(
                                region.guest_addr.0,
                                region.size as u64,
                                region.host_addr as u64,
                                true,
                            )
                        }?;
                    }
                }
            }
        }

        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;
        group
            .lock()
            .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;

        self.groups.insert(id, group.clone());

        Ok(group)
    }

    /// Returns the group `id`, creating it if needed. Unlike
    /// [`Self::get_group_with_vm`] this always uses the Type1V2 driver and
    /// does not register the group with the KVM VFIO device.
    fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));

        if self.groups.is_empty() {
            // Before the first group is added into container, do once per
            // container initialization.
            self.set_iommu_checked(IommuType::Type1V2)?;
        }

        self.groups.insert(id, group.clone());
        Ok(group)
    }

    /// Drops one device reference from group `id` (when `reduce` is set) and
    /// removes the group entirely — unregistering it from the KVM VFIO
    /// device — once its device count reaches zero.
    fn remove_group(&mut self, id: u32, reduce: bool) {
        let mut remove = false;

        if let Some(group) = self.groups.get(&id) {
            if reduce {
                group.lock().reduce_device_num();
            }
            if group.lock().device_num() == 0 {
                let kvm_vfio_file = kvm_vfio_file().expect("kvm vfio file isn't created");
                if group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
                    .is_err()
                {
                    warn!("failing in remove vfio group from kvm device");
                }
                remove = true;
            }
        }

        if remove {
            self.groups.remove(&id);
        }
    }

    /// Duplicates the container fd; the caller owns the returned descriptor.
    pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
        // SAFETY: this call is safe because it doesn't modify any memory and we
        // check the return value.
        // NOTE(review): libc::dup does not set FD_CLOEXEC on the duplicate —
        // confirm the descriptor is meant to be inheritable by children.
        let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
        if raw_descriptor < 0 {
            Err(VfioError::ContainerDupError)
        } else {
            Ok(raw_descriptor)
        }
    }

    // Gets group ids for all groups in the container.
    pub fn group_ids(&self) -> Vec<&u32> {
        self.groups.keys().collect()
    }
}
721
impl AsRawDescriptor for VfioContainer {
    /// Exposes the container fd for use in VFIO ioctls.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.container.as_raw_descriptor()
    }
}
727
/// A VFIO group (`/dev/vfio/<id>`) bound to a container, with a count of the
/// devices opened through it.
struct VfioGroup {
    // Open handle to /dev/vfio/<id>.
    group: File,
    // Number of devices obtained from this group; the group is removed from
    // its container when this drops to zero (see VfioContainer::remove_group).
    device_num: u32,
}
732
impl VfioGroup {
    /// Opens `/dev/vfio/<id>`, verifies the group is viable, and binds it to
    /// `container` (VFIO_GROUP_SET_CONTAINER).
    fn new(container: &VfioContainer, id: u32) -> Result<Self> {
        let group_path = format!("/dev/vfio/{id}");
        let group_file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(Path::new(&group_path))
            .map_err(|e| VfioError::OpenGroup(e, group_path))?;

        let mut group_status = vfio_group_status {
            argsz: mem::size_of::<vfio_group_status>() as u32,
            flags: 0,
        };
        let mut ret =
            // SAFETY:
            // Safe as we are the owner of group_file and group_status which are valid value.
            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS, &mut group_status) };
        if ret < 0 {
            return Err(VfioError::GetGroupStatus(get_error()));
        }

        // NOTE(review): `flags` is a bitfield; this equality check assumes no
        // bit other than VIABLE is set before the container is attached —
        // confirm a bit-test (`flags & VFIO_GROUP_FLAGS_VIABLE == 0`) isn't
        // the safer form here.
        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
            return Err(VfioError::GroupViable);
        }

        let container_raw_descriptor = container.as_raw_descriptor();
        // SAFETY:
        // Safe as we are the owner of group_file and container_raw_descriptor which are valid
        // value, and we verify the ret value
        ret = unsafe {
            ioctl_with_ref(
                &group_file,
                VFIO_GROUP_SET_CONTAINER,
                &container_raw_descriptor,
            )
        };
        if ret < 0 {
            return Err(VfioError::GroupSetContainer(get_error()));
        }

        Ok(VfioGroup {
            group: group_file,
            device_num: 0,
        })
    }

    /// Resolves the IOMMU group id of a device from its sysfs path by reading
    /// the `iommu_group` symlink and parsing its final path component.
    fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
        let mut uuid_path = PathBuf::new();
        uuid_path.push(sysfspath);
        uuid_path.push("iommu_group");
        let group_path = uuid_path
            .read_link()
            .map_err(|e| VfioError::ReadLink(e, uuid_path))?;
        let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
        let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let group_id = group_str
            .parse::<u32>()
            .map_err(|_| VfioError::InvalidPath)?;

        Ok(group_id)
    }

    /// Registers (`Add`) or unregisters (`Delete`) this group with the KVM
    /// VFIO pseudo device via KVM_SET_DEVICE_ATTR.
    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        // The kernel reads the group fd through this pointer; the local
        // outlives the ioctl below.
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
        // and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr)
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }

    /// Opens the device named `name` (e.g. a PCI BDF) within this group and
    /// returns its fd (VFIO_GROUP_GET_DEVICE_FD).
    ///
    /// Panics if `name` contains an interior NUL byte.
    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // SAFETY:
        // Safe as we are the owner of self and path_ptr which are valid value.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD, path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // SAFETY:
        // Safe as ret is valid descriptor
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }

    /// Increments the count of devices opened through this group.
    fn add_device_num(&mut self) {
        self.device_num += 1;
    }

    /// Decrements the device count; underflow panics in debug builds.
    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }

    /// Returns the current device count.
    fn device_num(&self) -> u32 {
        self.device_num
    }
}
857
impl AsRawDescriptor for VfioGroup {
    /// Exposes the group fd for use in VFIO ioctls.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.group.as_raw_descriptor()
    }
}
863
/// A helper struct for managing VFIO containers
///
/// All containers start out unset (`Default`) and are created lazily on
/// first request in `get_container`.
#[derive(Default)]
pub struct VfioContainerManager {
    /// One VFIO container shared by all VFIO devices that don't attach to any IOMMU device.
    no_iommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// For IOMMU enabled devices, all VFIO groups that share the same IOVA space are managed by
    /// one VFIO container.
    iommu_containers: Vec<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to the CoIOMMU device.
    coiommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to pKVM.
    pkvm_iommu_container: Option<Arc<Mutex<VfioContainer>>>,
}
880
881impl VfioContainerManager {
882    pub fn new() -> Self {
883        Self::default()
884    }
885
886    /// The single place to create a VFIO container for a PCI endpoint.
887    ///
888    /// The policy to determine whether an individual or a shared VFIO container
889    /// will be created for this device is governed by the physical PCI topology,
890    /// and the argument iommu_type.
891    ///
892    ///  # Arguments
893    ///
894    ///  * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
895    ///  * `iommu_type` - which type of IOMMU is enabled on this device
896    pub fn get_container<P: AsRef<Path>>(
897        &mut self,
898        iommu_type: IommuDevType,
899        sysfspath: Option<P>,
900    ) -> Result<Arc<Mutex<VfioContainer>>> {
901        match iommu_type {
902            IommuDevType::NoIommu => {
903                // One VFIO container is used for all IOMMU disabled groups.
904                if let Some(container) = &self.no_iommu_container {
905                    Ok(container.clone())
906                } else {
907                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
908                    self.no_iommu_container = Some(container.clone());
909                    Ok(container)
910                }
911            }
912            IommuDevType::VirtioIommu => {
913                let path = sysfspath.ok_or(VfioError::InvalidPath)?;
914                let group_id = VfioGroup::get_group_id(path)?;
915
916                // One VFIO container is used for all devices that belong to one VFIO group.
917                // NOTE: vfio_wrapper relies on each container containing exactly one group.
918                if let Some(container) = self
919                    .iommu_containers
920                    .iter()
921                    .find(|container| container.lock().is_group_set(group_id))
922                {
923                    Ok(container.clone())
924                } else {
925                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
926                    self.iommu_containers.push(container.clone());
927                    Ok(container)
928                }
929            }
930            IommuDevType::CoIommu => {
931                // One VFIO container is used for devices attached to CoIommu
932                if let Some(container) = &self.coiommu_container {
933                    Ok(container.clone())
934                } else {
935                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
936                    self.coiommu_container = Some(container.clone());
937                    Ok(container)
938                }
939            }
940            IommuDevType::PkvmPviommu => {
941                // One VFIO container is used for devices attached to pKVM
942                if let Some(container) = &self.pkvm_iommu_container {
943                    Ok(container.clone())
944                } else {
945                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
946                    self.pkvm_iommu_container = Some(container.clone());
947                    Ok(container)
948                }
949            }
950        }
951    }
952}
953
/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    /// Legacy INTx line interrupt.
    Intx,
    /// Message Signaled Interrupt.
    Msi,
    /// Extended Message Signaled Interrupt (per-vector).
    Msix,
}
960
/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    /// Flags reported by the kernel for this interrupt index.
    pub flags: u32,
    /// VFIO interrupt index this entry describes.
    pub index: u32,
}
966
/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// Index of the VFIO region.
    pub index: usize,
    /// Byte offset within the region.
    pub addr: u64,
}
975
#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap
    flags: u32,
    // region size in bytes
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size (sparse-mmap capability areas)
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
    // if true, then the caller can safely mmap the MSIX region
    // if false, the caller should remove the MSIX part of the region before mmapping
    msix_region_mmappable: bool,
}
991
/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    // Device descriptor obtained from the VFIO group (VFIO_GROUP_GET_DEVICE_FD).
    dev: File,
    // Sysfs node name, e.g. "0000:02:00.0" for a PCI device.
    name: String,
    // Container this device's group belongs to.
    container: Arc<Mutex<VfioContainer>>,
    dev_type: VfioDeviceType,
    group_descriptor: RawDescriptor,
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    // Number of interrupt indexes reported by VFIO_DEVICE_GET_INFO.
    num_irqs: u32,

    // Allocator over the IOVA ranges reported usable by the container's IOMMU.
    iova_alloc: Arc<Mutex<AddressAllocator>>,
    // Device-tree symbol (node label) for this device, if provided.
    dt_symbol: Option<String>,
    // pKVM pvIOMMU instance and its assigned vSIDs, present only for
    // IommuDevType::PkvmPviommu devices.
    pviommu: Option<(Arc<Mutex<KvmVfioPviommu>>, Vec<u32>)>,
}
1008
1009impl VfioDevice {
    /// Create a new vfio device, then guest read/write on this device could be
    /// transferred into kernel vfio.
    ///
    /// `sysfspath` specifies the vfio device path in the sys file system; the
    /// device's group is registered with `vm` through the container.
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_dev: IommuDevType,
        dt_symbol: Option<String>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_dev)?;
        // The device name is the last path component, e.g. "0000:02:00.0".
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // Build an IOVA allocator over the ranges the container's IOMMU reports usable.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        let pviommu = if matches!(iommu_dev, IommuDevType::PkvmPviommu) {
            // We currently have a 1-to-1 mapping between pvIOMMUs and VFIO devices.
            let pviommu = KvmVfioPviommu::new(vm)?;

            // Draw one random vSID per stream ID (`sample` yields distinct indices)
            // and attach each to the pvIOMMU.
            let vsids_len = KvmVfioPviommu::get_sid_count(vm, &dev)?.try_into().unwrap();
            let max_vsid = u32::MAX.try_into().unwrap();
            let random_vsids = sample(&mut rand::thread_rng(), max_vsid, vsids_len).into_iter();
            let vsids = Vec::from_iter(random_vsids.map(|v| u32::try_from(v).unwrap()));
            for (i, vsid) in vsids.iter().enumerate() {
                pviommu.attach(&dev, i.try_into().unwrap(), *vsid)?;
            }

            Some((Arc::new(Mutex::new(pviommu)), vsids))
        } else {
            None
        };

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol,
            pviommu,
        })
    }
1072
    /// Creates a new vfio device on an existing container, without registering the
    /// group with a VM.
    ///
    /// `sysfspath` specifies the vfio device path in the sys file system.
    pub fn new<P: AsRef<Path>>(
        sysfspath: &P,
        container: Arc<Mutex<VfioContainer>>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;
        let group = container.lock().get_group(group_id)?;
        // The device name is the last path component, e.g. "0000:02:00.0".
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);

        // On failure below, remove the group we just added from the container so a
        // failed device probe does not leak the group membership.
        let dev = match group.lock().get_device(&name) {
            Ok(dev) => dev,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let (dev_info, dev_type) = match Self::get_device_info(&dev) {
            Ok(dev_info) => dev_info,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let regions = match Self::get_regions(&dev, dev_info.num_regions) {
            Ok(regions) => regions,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // NOTE(review): unlike the failures above, an error here propagates without
        // calling remove_group (or reduce_device_num) — confirm whether this cleanup
        // is intentionally skipped.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol: None,
            pviommu: None,
        })
    }
1128
    /// Returns the file for this device.
    ///
    /// All region reads/writes and device ioctls in this module go through this
    /// descriptor.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }
1133
    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    ///
    /// The name is the sysfs node name captured at construction time.
    pub fn device_name(&self) -> &String {
        &self.name
    }
1138
    /// Returns the type of this VFIO device (PCI or platform), as classified from
    /// the VFIO_DEVICE_GET_INFO flags.
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }
1143
    /// Returns the DT symbol (node label) of this VFIO device, if one was supplied
    /// at construction.
    pub fn dt_symbol(&self) -> Option<&str> {
        self.dt_symbol.as_deref()
    }
1148
    /// Returns the type and identifier (if applicable) of the IOMMU used by this VFIO
    /// device and its master IDs.
    ///
    /// Returns `None` unless the device was created with `IommuDevType::PkvmPviommu`
    /// (the only type currently reported).
    pub fn iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])> {
        // We currently only report IommuDevType::PkvmPviommu.
        if let Some((ref pviommu, ref ids)) = self.pviommu {
            Some((
                IommuDevType::PkvmPviommu,
                Some(pviommu.lock().id()),
                ids.as_ref(),
            ))
        } else {
            None
        }
    }
1163
    /// Enters the device's low power state via the VFIO_DEVICE_FEATURE ioctl
    /// (no wakeup notification; see `pm_low_power_enter_with_wakeup` for that).
    pub fn pm_low_power_enter(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY)
            .map_err(VfioError::VfioPmLowPowerEnter)
    }
1169
    /// Enters the device's low power state with wakeup notification.
    ///
    /// `wakeup_evt` is handed to the kernel alongside the feature request so the
    /// caller can be notified of wakeups; the Event is consumed but its descriptor
    /// stays valid for the duration of the ioctl.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        // Feature header followed by a payload_size-byte trailing buffer.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        // SAFETY:
        // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
        unsafe {
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // SAFETY:
        // Safe as we are the owner of self and device_feature which are valid values
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1201
    /// Exits the device's low power state via the VFIO_DEVICE_FEATURE ioctl.
    pub fn pm_low_power_exit(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT)
            .map_err(VfioError::VfioPmLowPowerExit)
    }
1207
    /// Issues a VFIO_DEVICE_FEATURE ioctl carrying only `flags`, with no payload.
    ///
    /// Returns the raw `base::Error` so each caller can wrap it in its own
    /// `VfioError` variant.
    fn device_feature(&self, flags: u32) -> result::Result<(), Error> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = flags;
        // SAFETY:
        // Safe as we are the owner of self and device_feature which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(get_error())
        } else {
            Ok(())
        }
    }
1221
1222    /// call _DSM from the device's ACPI table
1223    pub fn acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>> {
1224        let count = args.len();
1225        let mut dsm = vec_with_array_field::<vfio_acpi_dsm, u8>(count);
1226        dsm[0].argsz = (mem::size_of::<vfio_acpi_dsm>() + mem::size_of_val(args)) as u32;
1227        dsm[0].padding = 0;
1228        // SAFETY:
1229        // Safe as we allocated enough space to hold args
1230        unsafe {
1231            dsm[0].args.as_mut_slice(count).clone_from_slice(args);
1232        }
1233        // SAFETY:
1234        // Safe as we are the owner of self and dsm which are valid value
1235        let ret = unsafe { ioctl_with_mut_ref(&self.dev, VFIO_DEVICE_ACPI_DSM, &mut dsm[0]) };
1236        if ret < 0 {
1237            Err(VfioError::VfioAcpiDsm(get_error()))
1238        } else {
1239            // SAFETY:
1240            // Safe as we allocated enough space to hold args
1241            let res = unsafe { dsm[0].args.as_slice(count) };
1242            Ok(res.to_vec())
1243        }
1244    }
1245
    /// Enable vfio device's ACPI notifications and associate EventFD with device.
    ///
    /// Registers the eventfd with the kernel via VFIO_DEVICE_SET_IRQS so the driver
    /// can signal ACPI notifications to userspace.
    pub fn acpi_notification_evt_enable(
        &self,
        acpi_notification_eventfd: &Event,
        index: u32,
    ) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let count = 1;

        // Header plus one u32 payload slot that carries the eventfd descriptor.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = count as u32;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        data.copy_from_slice(&acpi_notification_eventfd.as_raw_descriptor().to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationEnable(get_error()))
        } else {
            Ok(())
        }
    }
1276
    /// Disable vfio device's ACPI notification and disconnect EventFd with device.
    ///
    /// A zero `count` with VFIO_IRQ_SET_DATA_NONE asks the kernel to tear down the
    /// trigger (per the VFIO uapi).
    pub fn acpi_notification_disable(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationDisable(get_error()))
        } else {
            Ok(())
        }
    }
1295
    /// Test vfio device's ACPI notification by simulating hardware triggering.
    /// When the signaling mechanism is set, the VFIO_IRQ_SET_DATA_BOOL can be used with
    /// VFIO_IRQ_SET_ACTION_TRIGGER to perform kernel level interrupt loopback testing.
    pub fn acpi_notification_test(&self, index: u32, val: u32) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(u32_size) };
        // The payload is the boolean trigger value for the single vector.
        data.copy_from_slice(&val.to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationTest(get_error()))
        } else {
            Ok(())
        }
    }
1322
1323    /// Enable vfio device's irq and associate Irqfd Event with device.
1324    /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to
1325    /// subindex + descriptors length will be assigned with irqfd in the descriptors array.
1326    /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical
1327    /// device is removed.
1328    /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
1329    /// interrupts if already assigned or skip un-assigned interrupts.
1330    pub fn irq_enable(
1331        &self,
1332        descriptors: &[Option<&Event>],
1333        index: u32,
1334        subindex: u32,
1335    ) -> Result<()> {
1336        let count = descriptors.len();
1337        let u32_size = mem::size_of::<u32>();
1338        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
1339        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
1340        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
1341        irq_set[0].index = index;
1342        irq_set[0].start = subindex;
1343        irq_set[0].count = count as u32;
1344
1345        // SAFETY:
1346        // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
1347        // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1348        // together as u32. It is safe as enough space is reserved through
1349        // vec_with_array_field(u32)<count>.
1350        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
1351        for descriptor in descriptors.iter().take(count) {
1352            let (left, right) = data.split_at_mut(u32_size);
1353            match descriptor {
1354                Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
1355                None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
1356            }
1357            data = right;
1358        }
1359
1360        // SAFETY:
1361        // Safe as we are the owner of self and irq_set which are valid value
1362        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1363        if ret < 0 {
1364            Err(VfioError::VfioIrqEnable(get_error()))
1365        } else {
1366            Ok(())
1367        }
1368    }
1369
1370    /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
1371    /// is used to get guest EOI notification.
1372    /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
1373    /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
1374    /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1375    /// generate another interrupts.
1376    /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1377    ///
1378    /// descriptor: should be resample IrqFd.
1379    pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1380        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1381        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1382        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1383        irq_set[0].index = index;
1384        irq_set[0].start = 0;
1385        irq_set[0].count = 1;
1386
1387        {
1388            // SAFETY:
1389            // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1390            // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1391            // together as u32. It is safe as enough space is reserved through
1392            // vec_with_array_field(u32)<1>.
1393            let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1394            descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1395        }
1396
1397        // SAFETY:
1398        // Safe as we are the owner of self and irq_set which are valid value
1399        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1400        if ret < 0 {
1401            Err(VfioError::VfioIrqEnable(get_error()))
1402        } else {
1403            Ok(())
1404        }
1405    }
1406
    /// Disable vfio device's irq and disconnect Irqfd Event with device.
    ///
    /// A zero `count` with VFIO_IRQ_SET_DATA_NONE asks the kernel to tear down the
    /// trigger (per the VFIO uapi).
    pub fn irq_disable(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqDisable(get_error()))
        } else {
            Ok(())
        }
    }
1425
    /// Unmask vfio device irq (vector 0 of `index`) so it can fire again.
    pub fn irq_unmask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqUnmask(get_error()))
        } else {
            Ok(())
        }
    }
1444
    /// Mask vfio device irq (vector 0 of `index`), suppressing delivery until
    /// unmasked.
    pub fn irq_mask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqMask(get_error()))
        } else {
            Ok(())
        }
    }
1463
    /// Get and validate VFIO device information.
    ///
    /// Issues VFIO_DEVICE_GET_INFO and classifies the device as PCI or platform from
    /// the reported flags; a PCI device must expose at least the config region index
    /// and the MSI-X interrupt index.
    fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
        let mut dev_info = vfio_device_info {
            argsz: mem::size_of::<vfio_device_info>() as u32,
            flags: 0,
            num_regions: 0,
            num_irqs: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as we are the owner of device_file and dev_info which are valid value,
        // and we verify the return value.
        let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO, &mut dev_info) };
        if ret < 0 {
            return Err(VfioError::VfioDeviceGetInfo(get_error()));
        }

        let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
            if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
                || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
            {
                // NOTE(review): the ioctl succeeded at this point, so get_error() reads
                // a stale errno; a dedicated validation error would be more accurate.
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            VfioDeviceType::Pci
        } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
            VfioDeviceType::Platform
        } else {
            return Err(VfioError::UnknownDeviceType(dev_info.flags));
        };

        Ok((dev_info, dev_type))
    }
1498
    /// Query interrupt information
    /// return: Vector of interrupts information, each of which contains flags and index
    pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
        let mut irqs: Vec<VfioIrq> = Vec::new();

        // Query each interrupt index the device reported in VFIO_DEVICE_GET_INFO.
        for i in 0..self.num_irqs {
            let argsz = mem::size_of::<vfio_irq_info>() as u32;
            let mut irq_info = vfio_irq_info {
                argsz,
                flags: 0,
                index: i,
                count: 0,
            };
            // SAFETY:
            // Safe as we are the owner of dev and irq_info which are valid value,
            // and we verify the return value.
            let ret = unsafe {
                ioctl_with_mut_ref(self.device_file(), VFIO_DEVICE_GET_IRQ_INFO, &mut irq_info)
            };
            // NOTE(review): the count != 1 rejection also wraps get_error() even when
            // the ioctl succeeded, so the reported errno can be stale.
            if ret < 0 || irq_info.count != 1 {
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            let irq = VfioIrq {
                flags: irq_info.flags,
                index: irq_info.index,
            };
            irqs.push(irq);
        }
        Ok(irqs)
    }
1530
1531    #[allow(clippy::cast_ptr_alignment)]
1532    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
1533        let mut regions: Vec<VfioRegion> = Vec::new();
1534        for i in 0..num_regions {
1535            let argsz = mem::size_of::<vfio_region_info>() as u32;
1536            let mut reg_info = vfio_region_info {
1537                argsz,
1538                flags: 0,
1539                index: i,
1540                cap_offset: 0,
1541                size: 0,
1542                offset: 0,
1543            };
1544            let ret =
1545                // SAFETY:
1546                // Safe as we are the owner of dev and reg_info which are valid value,
1547                // and we verify the return value.
1548                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO, &mut reg_info) };
1549            if ret < 0 {
1550                continue;
1551            }
1552
1553            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
1554            let mut cap_info: Option<(u32, u32)> = None;
1555            let mut msix_region_mmappable = false;
1556            if reg_info.argsz > argsz {
1557                let cap_len: usize = (reg_info.argsz - argsz) as usize;
1558                let mut region_with_cap =
1559                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
1560                region_with_cap[0].region_info.argsz = reg_info.argsz;
1561                region_with_cap[0].region_info.flags = 0;
1562                region_with_cap[0].region_info.index = i;
1563                region_with_cap[0].region_info.cap_offset = 0;
1564                region_with_cap[0].region_info.size = 0;
1565                region_with_cap[0].region_info.offset = 0;
1566                // SAFETY:
1567                // Safe as we are the owner of dev and region_info which are valid value,
1568                // and we verify the return value.
1569                let ret = unsafe {
1570                    ioctl_with_mut_ref(
1571                        dev,
1572                        VFIO_DEVICE_GET_REGION_INFO,
1573                        &mut (region_with_cap[0].region_info),
1574                    )
1575                };
1576                if ret < 0 {
1577                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
1578                }
1579
1580                // Some drivers (e.g. for NVIDIA vGPUs) do not fully populate the
1581                // `vfio_region_info` structure in response to the
1582                // `VFIO_DEVICE_GET_REGION_INFO` call if the passed size is not enough
1583                // to hold the entirety of the data.
1584                // This ensures we use complete data when we construct the `VfioRegion`
1585                // instance.
1586                reg_info = region_with_cap[0].region_info;
1587
1588                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
1589                    continue;
1590                }
1591
1592                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
1593                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
1594                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
1595                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
1596                let region_info_sz = reg_info.argsz;
1597
1598                // region_with_cap[0].cap_info may contain many structures, like
1599                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
1600                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
1601                // vfio_into_cap_header.
1602                // Go through all the cap structs.
1603                let info_ptr = region_with_cap.as_ptr() as *mut u8;
1604                let mut offset = region_with_cap[0].region_info.cap_offset;
1605                while offset != 0 {
1606                    if offset + cap_header_sz > region_info_sz {
1607                        break;
1608                    }
1609                    // SAFETY:
1610                    // Safe, as cap_header struct is in this function allocated region_with_cap
1611                    // vec.
1612                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
1613                    // SAFETY:
1614                    // Safe, as cap_header struct is in this function allocated region_with_cap
1615                    // vec.
1616                    let cap_header = unsafe { &*(cap_ptr as *const vfio_info_cap_header) };
1617                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
1618                        if offset + mmap_cap_sz > region_info_sz {
1619                            break;
1620                        }
1621                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
1622                        let sparse_mmap =
1623                            // SAFETY:
1624                            // Safe, this vfio_region_info_cap_sparse_mmap is in this function
1625                            // allocated region_with_cap vec.
1626                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_sparse_mmap) };
1627
1628                        let area_num = sparse_mmap.nr_areas;
1629                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
1630                            break;
1631                        }
1632                        let areas =
1633                            // SAFETY:
1634                            // Safe, these vfio_region_sparse_mmap_area are in this function allocated
1635                            // region_with_cap vec.
1636                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
1637                        for area in areas.iter() {
1638                            mmaps.push(*area);
1639                        }
1640
1641                        // Sparse regions means the driver can decide which parts of the BAR are
1642                        // safe to mmap. If that overlaps with the MSIX
1643                        // data, that's the decision of the driver.
1644                        // This is required for some devices (e.g. NVIDIA vGPUs).
1645                        msix_region_mmappable = true;
1646                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
1647                        if offset + type_cap_sz > region_info_sz {
1648                            break;
1649                        }
1650                        // cap_ptr is vfio_region_info_cap_type here
1651                        let cap_type_info =
1652                            // SAFETY:
1653                            // Safe, this vfio_region_info_cap_type is in this function allocated
1654                            // region_with_cap vec
1655                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_type) };
1656
1657                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
1658                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
1659                        mmaps.push(vfio_region_sparse_mmap_area {
1660                            offset: 0,
1661                            size: region_with_cap[0].region_info.size,
1662                        });
1663                        msix_region_mmappable = true;
1664                    }
1665
1666                    offset = cap_header.next;
1667                }
1668            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1669                mmaps.push(vfio_region_sparse_mmap_area {
1670                    offset: 0,
1671                    size: reg_info.size,
1672                });
1673            }
1674
1675            let region = VfioRegion {
1676                flags: reg_info.flags,
1677                size: reg_info.size,
1678                offset: reg_info.offset,
1679                mmaps,
1680                cap_info,
1681                msix_region_mmappable,
1682            };
1683            regions.push(region);
1684        }
1685
1686        Ok(regions)
1687    }
1688
1689    /// get a region's flag
1690    /// the return's value may conatin:
1691    ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1692    ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1693    ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1694    ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
1695    pub fn get_region_flags(&self, index: usize) -> u32 {
1696        match self.regions.get(index) {
1697            Some(v) => v.flags,
1698            None => {
1699                warn!("get_region_flags() with invalid index: {}", index);
1700                0
1701            }
1702        }
1703    }
1704
1705    /// get a region's offset
1706    /// return: Region offset from the start of vfio device descriptor
1707    pub fn get_region_offset(&self, index: usize) -> u64 {
1708        match self.regions.get(index) {
1709            Some(v) => v.offset,
1710            None => {
1711                warn!("get_region_offset with invalid index: {}", index);
1712                0
1713            }
1714        }
1715    }
1716
1717    /// get a region's size
1718    /// return: Region size from the start of vfio device descriptor
1719    pub fn get_region_size(&self, index: usize) -> u64 {
1720        match self.regions.get(index) {
1721            Some(v) => v.size,
1722            None => {
1723                warn!("get_region_size with invalid index: {}", index);
1724                0
1725            }
1726        }
1727    }
1728
    /// Get the number of regions of this vfio device descriptor,
    /// as enumerated when the device was opened.
    pub fn get_region_count(&self) -> usize {
        self.regions.len()
    }
1734
1735    /// get a region's mmap info vector
1736    pub fn get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area> {
1737        match self.regions.get(index) {
1738            Some(v) => v.mmaps.clone(),
1739            None => {
1740                warn!("get_region_mmap with invalid index: {}", index);
1741                Vec::new()
1742            }
1743        }
1744    }
1745
1746    /// get if the MSIX data with a region is safe to mmap, or if it should be removed
1747    /// before mmapping
1748    pub fn get_region_msix_mmappable(&self, index: usize) -> bool {
1749        match self.regions.get(index) {
1750            Some(v) => v.msix_region_mmappable,
1751            None => {
1752                warn!("get_region_msix_mmappable with invalid index: {}", index);
1753                false
1754            }
1755        }
1756    }
1757
1758    /// find the specified cap type in device regions
1759    /// Input:
1760    ///      type_:  cap type
1761    ///      sub_type: cap sub_type
1762    /// Output:
1763    ///     None: device doesn't have the specified cap type
1764    ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1765    ///                                     index and size
1766    pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1767        for (index, region) in self.regions.iter().enumerate() {
1768            if let Some(cap_info) = &region.cap_info {
1769                if cap_info.0 == type_ && cap_info.1 == sub_type {
1770                    return Some((index as u32, region.size));
1771                }
1772            }
1773        }
1774
1775        None
1776    }
1777
1778    /// Returns file offset corresponding to the given `VfioRegionAddr`.
1779    /// The offset can be used when reading/writing the VFIO device's FD directly.
1780    pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1781        let region = self
1782            .regions
1783            .get(addr.index)
1784            .ok_or(VfioError::InvalidIndex(addr.index))?;
1785        Ok(region.offset + addr.addr)
1786    }
1787
1788    /// Read region's data from VFIO device into buf
1789    /// index: region num
1790    /// buf: data destination and buf length is read size
1791    /// addr: offset in the region
1792    pub fn region_read(&self, index: usize, buf: &mut [u8], addr: u64) {
1793        let stub: &VfioRegion = self
1794            .regions
1795            .get(index)
1796            .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {index}"));
1797
1798        let size = buf.len() as u64;
1799        if size > stub.size || addr + size > stub.size {
1800            panic!(
1801                "tried to read VFIO region with invalid arguments: index={index}, addr=0x{addr:x}, size=0x{size:x}"
1802            );
1803        }
1804
1805        self.dev
1806            .read_exact_at(buf, stub.offset + addr)
1807            .unwrap_or_else(|e| {
1808                panic!("failed to read region: index={index}, addr=0x{addr:x}, error={e}")
1809            });
1810    }
1811
    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    ///
    /// Panics under the same conditions as `region_read` (invalid region
    /// index, out-of-range read, or a failed read of the device FD).
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        // Zero-initialized storage for the result; filled in-place by region_read.
        let mut val = mem::MaybeUninit::zeroed();
        let buf =
            // SAFETY:
            // Safe because we have zero-initialized `size_of::<T>()` bytes.
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        self.region_read(addr.index, buf, addr.addr + offset);
        // SAFETY:
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }
1824
1825    /// write the data from buf into a vfio device region
1826    /// index: region num
1827    /// buf: data src and buf length is write size
1828    /// addr: offset in the region
1829    pub fn region_write(&self, index: usize, buf: &[u8], addr: u64) {
1830        let stub: &VfioRegion = self
1831            .regions
1832            .get(index)
1833            .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {index}"));
1834
1835        let size = buf.len() as u64;
1836        if size > stub.size
1837            || addr + size > stub.size
1838            || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1839        {
1840            panic!(
1841                "tried to write VFIO region with invalid arguments: index={index}, addr=0x{addr:x}, size=0x{size:x}"
1842            );
1843        }
1844
1845        self.dev
1846            .write_all_at(buf, stub.offset + addr)
1847            .unwrap_or_else(|e| {
1848                panic!("failed to write region: index={index}, addr=0x{addr:x}, error={e}")
1849            });
1850    }
1851
1852    /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
1853    pub fn region_write_to_addr(&self, data: &[u8], addr: &VfioRegionAddr, offset: u64) {
1854        self.region_write(addr.index, data, addr.addr + offset);
1855    }
1856
1857    /// get vfio device's descriptors which are passed into minijail process
1858    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1859        vec![
1860            self.dev.as_raw_descriptor(),
1861            self.group_descriptor,
1862            self.container.lock().as_raw_descriptor(),
1863        ]
1864    }
1865
1866    /// Add (iova, user_addr) map into vfio container iommu table
1867    /// # Safety
1868    ///
1869    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
1870    pub unsafe fn vfio_dma_map(
1871        &self,
1872        iova: u64,
1873        size: u64,
1874        user_addr: u64,
1875        write_en: bool,
1876    ) -> Result<()> {
1877        self.container
1878            .lock()
1879            .vfio_dma_map(iova, size, user_addr, write_en)
1880    }
1881
1882    /// Remove (iova, user_addr) map from vfio container iommu table
1883    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
1884        self.container.lock().vfio_dma_unmap(iova, size)
1885    }
1886
    /// Returns the IOMMU page size mask reported by this device's VFIO
    /// container (see `VfioContainer::vfio_get_iommu_page_size_mask`).
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        self.container.lock().vfio_get_iommu_page_size_mask()
    }
1890
1891    pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1892        self.iova_alloc
1893            .lock()
1894            .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1895            .map_err(VfioError::Resources)
1896    }
1897
1898    pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1899        self.iova_alloc.lock().get(alloc).map(|res| res.0)
1900    }
1901
1902    pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1903        self.iova_alloc
1904            .lock()
1905            .release(alloc)
1906            .map_err(VfioError::Resources)
1907    }
1908
    /// Returns the maximum address handled by this device's IOVA allocator.
    pub fn get_max_addr(&self) -> u64 {
        self.iova_alloc.lock().get_max_addr()
    }
1912
    /// Gets the vfio device backing `File`.
    pub fn device_file(&self) -> &File {
        &self.dev
    }
1917
1918    /// close vfio device
1919    pub fn close(&self) {
1920        self.container.lock().remove_group(self.group_id, true);
1921    }
1922}
1923
/// Provides typed read/write access to a VFIO PCI device's configuration
/// space via its config region.
pub struct VfioPciConfig {
    // Shared handle to the underlying VFIO device.
    device: Arc<VfioDevice>,
}
1927
1928impl VfioPciConfig {
1929    pub fn new(device: Arc<VfioDevice>) -> Self {
1930        VfioPciConfig { device }
1931    }
1932
1933    pub fn read_config<T: IntoBytes + FromBytes>(&self, offset: u32) -> T {
1934        let mut config = T::new_zeroed();
1935        self.device.region_read(
1936            VFIO_PCI_CONFIG_REGION_INDEX as usize,
1937            config.as_mut_bytes(),
1938            offset.into(),
1939        );
1940        config
1941    }
1942
1943    pub fn write_config<T: Immutable + IntoBytes>(&self, config: T, offset: u32) {
1944        self.device.region_write(
1945            VFIO_PCI_CONFIG_REGION_INDEX as usize,
1946            config.as_bytes(),
1947            offset.into(),
1948        );
1949    }
1950
1951    /// Set the VFIO device this config refers to as the bus master.
1952    pub fn set_bus_master(&self) {
1953        /// Constant definitions from `linux/pci_regs.h`.
1954        const PCI_COMMAND: u32 = 0x4;
1955        /// Enable bus mastering
1956        const PCI_COMMAND_MASTER: u16 = 0x4;
1957
1958        let mut cmd: u16 = self.read_config(PCI_COMMAND);
1959
1960        if cmd & PCI_COMMAND_MASTER != 0 {
1961            return;
1962        }
1963
1964        cmd |= PCI_COMMAND_MASTER;
1965
1966        self.write_config(cmd, PCI_COMMAND);
1967    }
1968}
1969
impl AsRawDescriptor for VfioDevice {
    /// Exposes the raw descriptor of the underlying VFIO device file.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.dev.as_raw_descriptor()
    }
}