devices/
vfio.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::collections::HashMap;
6use std::ffi::CString;
7use std::fs::File;
8use std::fs::OpenOptions;
9use std::io;
10use std::mem;
11use std::os::raw::c_ulong;
12use std::os::unix::prelude::FileExt;
13use std::path::Path;
14use std::path::PathBuf;
15#[cfg(all(target_os = "android", target_arch = "aarch64"))]
16use std::ptr::addr_of_mut;
17use std::result;
18use std::slice;
19use std::sync::Arc;
20use std::sync::OnceLock;
21
22use base::error;
23use base::ioctl;
24use base::ioctl_with_mut_ptr;
25use base::ioctl_with_mut_ref;
26use base::ioctl_with_ptr;
27use base::ioctl_with_ref;
28use base::ioctl_with_val;
29use base::warn;
30use base::AsRawDescriptor;
31use base::Error;
32use base::Event;
33use base::FromRawDescriptor;
34use base::RawDescriptor;
35use base::SafeDescriptor;
36use cfg_if::cfg_if;
37use data_model::vec_with_array_field;
38use hypervisor::DeviceKind;
39use hypervisor::Vm;
40use rand::seq::index::sample;
41use remain::sorted;
42use resources::address_allocator::AddressAllocator;
43use resources::AddressRange;
44use resources::Alloc;
45use resources::Error as ResourcesError;
46use sync::Mutex;
47use thiserror::Error;
48use vfio_sys::vfio::vfio_acpi_dsm;
49use vfio_sys::vfio::VFIO_IRQ_SET_DATA_BOOL;
50use vfio_sys::*;
51use zerocopy::FromBytes;
52use zerocopy::Immutable;
53use zerocopy::IntoBytes;
54
55use crate::IommuDevType;
56
/// Errors that can occur while opening or operating a VFIO container, group,
/// or device, or while talking to the KVM VFIO pseudo-device.
#[sorted]
#[derive(Error, Debug)]
pub enum VfioError {
    #[error("failed to duplicate VfioContainer")]
    ContainerDupError,
    #[error("failed to set container's IOMMU driver type as {0:?}: {1}")]
    ContainerSetIOMMU(IommuType, Error),
    #[error("failed to create KVM vfio device")]
    CreateVfioKvmDevice,
    #[error("failed to get Group Status: {0}")]
    GetGroupStatus(Error),
    #[error("failed to get vfio device fd: {0}")]
    GroupGetDeviceFD(Error),
    #[error("failed to add vfio group into vfio container: {0}")]
    GroupSetContainer(Error),
    #[error("group is inviable")]
    GroupViable,
    #[error("invalid region index: {0}")]
    InvalidIndex(usize),
    #[error("invalid operation")]
    InvalidOperation,
    #[error("invalid file path")]
    InvalidPath,
    #[error("failed to add guest memory map into iommu table: {0}")]
    IommuDmaMap(Error),
    #[error("failed to remove guest memory map from iommu table: {0}")]
    IommuDmaUnmap(Error),
    #[error("failed to get IOMMU cap info from host")]
    IommuGetCapInfo,
    #[error("failed to get IOMMU info from host: {0}")]
    IommuGetInfo(Error),
    #[error("failed to attach device to pKVM pvIOMMU: {0}")]
    KvmPviommuSetConfig(Error),
    #[error("failed to set KVM vfio device's attribute: {0}")]
    KvmSetDeviceAttr(Error),
    #[error("AddressAllocator is unavailable")]
    NoRescAlloc,
    #[error("failed to open /dev/vfio/vfio container: {0}")]
    OpenContainer(io::Error),
    #[error("failed to open {1} group: {0}")]
    OpenGroup(io::Error, String),
    #[error("failed to read {1} link: {0}")]
    ReadLink(io::Error, PathBuf),
    #[error("resources error: {0}")]
    Resources(ResourcesError),
    #[error("unknown vfio device type (flags: {0:#x})")]
    UnknownDeviceType(u32),
    #[error("failed to call vfio device's ACPI _DSM: {0}")]
    VfioAcpiDsm(Error),
    #[error("failed to disable vfio device's acpi notification: {0}")]
    VfioAcpiNotificationDisable(Error),
    #[error("failed to enable vfio device's acpi notification: {0}")]
    VfioAcpiNotificationEnable(Error),
    #[error("failed to test vfio device's acpi notification: {0}")]
    VfioAcpiNotificationTest(Error),
    #[error(
        "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
    )]
    VfioApiVersion,
    #[error("failed to get vfio device's info or info doesn't match: {0}")]
    VfioDeviceGetInfo(Error),
    #[error("failed to get vfio device's region info: {0}")]
    VfioDeviceGetRegionInfo(Error),
    #[error("container doesn't support IOMMU driver type {0:?}")]
    VfioIommuSupport(IommuType),
    #[error("failed to disable vfio device's irq: {0}")]
    VfioIrqDisable(Error),
    #[error("failed to enable vfio device's irq: {0}")]
    VfioIrqEnable(Error),
    #[error("failed to mask vfio device's irq: {0}")]
    VfioIrqMask(Error),
    #[error("failed to unmask vfio device's irq: {0}")]
    VfioIrqUnmask(Error),
    #[error("failed to enter vfio device's low power state: {0}")]
    VfioPmLowPowerEnter(Error),
    #[error("failed to exit vfio device's low power state: {0}")]
    VfioPmLowPowerExit(Error),
    #[error("failed to probe support for VFIO low power state entry: {0}")]
    VfioProbePmLowPowerEntry(Error),
    #[error("failed to probe support for VFIO low power state exit: {0}")]
    VfioProbePmLowPowerExit(Error),
}
139
/// Module-local result alias: all fallible VFIO operations return `VfioError`.
type Result<T> = std::result::Result<T, VfioError>;

// Snapshots the calling thread's last OS error (errno) right after a failed
// ioctl/syscall, before anything else can clobber it.
fn get_error() -> Error {
    Error::last()
}
145
// Process-wide KVM VFIO pseudo-device FD. Initialized at most once; a `None`
// is cached if creation failed.
static KVM_VFIO_FILE: OnceLock<Option<SafeDescriptor>> = OnceLock::new();

/// Returns the process-wide KVM VFIO device FD, creating it on first use.
///
/// NOTE(review): if the first creation attempt fails, the `None` result is
/// cached and later calls will not retry, even with a different `vm` —
/// confirm this is intended.
fn create_kvm_vfio_file(vm: &impl Vm) -> Option<&'static SafeDescriptor> {
    KVM_VFIO_FILE
        .get_or_init(|| vm.create_device(DeviceKind::Vfio).ok())
        .as_ref()
}
153
154fn kvm_vfio_file() -> Option<&'static SafeDescriptor> {
155    match KVM_VFIO_FILE.get() {
156        Some(Some(v)) => Some(v),
157        _ => None,
158    }
159}
160
/// The kind of hardware a VFIO device exposes.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    /// A PCI endpoint.
    Pci,
    /// A platform (non-PCI) device.
    Platform,
}
166
/// Operation to apply to the KVM VFIO pseudo-device's group list.
enum KvmVfioGroupOps {
    /// Register a VFIO group with the KVM VFIO device.
    Add,
    /// Deregister a VFIO group from the KVM VFIO device.
    Delete,
}
171
/// Handle to a pKVM pvIOMMU instance obtained from the KVM VFIO pseudo-device.
#[derive(Debug)]
pub struct KvmVfioPviommu {
    // FD returned by the KVM_DEV_VFIO_PVIOMMU_ATTACH ioctl.
    file: File,
}
176
impl KvmVfioPviommu {
    /// Creates a pvIOMMU instance by attaching to the KVM VFIO pseudo-device.
    ///
    /// Only implemented for Android/aarch64 (pKVM); calls `unimplemented!()`
    /// on every other target.
    pub fn new(vm: &impl Vm) -> Result<Self> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let file = Self::ioctl_kvm_dev_vfio_pviommu_attach(vm)?;

                Ok(Self { file })
            } else {
                let _ = vm;
                unimplemented!()
            }
        }
    }

    /// Routes `device`'s stream ID at index `sid_idx` to the virtual stream
    /// ID `vsid` on this pvIOMMU.
    pub fn attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                self.ioctl_kvm_pviommu_set_config(device, sid_idx, vsid)
            } else {
                let _ = device;
                let _ = sid_idx;
                let _ = vsid;
                unimplemented!()
            }
        }
    }

    /// Returns the identifier guests use to refer to this pvIOMMU.
    pub fn id(&self) -> u32 {
        let fd = self.as_raw_descriptor();
        // Guests identify pvIOMMUs to the hypervisor using the corresponding VMM FDs.
        // A valid FD is non-negative, so the i32 -> u32 conversion cannot fail.
        fd.try_into().unwrap()
    }

    /// Queries how many stream IDs `device` exposes.
    pub fn get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let info = Self::ioctl_kvm_dev_vfio_pviommu_get_info(vm, device)?;

                Ok(info.nr_sids)
            } else {
                let _ = vm;
                let _ = device;
                unimplemented!()
            }
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_ATTACH on the shared KVM VFIO device; on
    /// success the ioctl's non-negative return value is a new pvIOMMU FD.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File> {
        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_ATTACH as u64,
            addr: 0,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            // SAFETY: Safe as we verify the return value.
            Ok(unsafe { File::from_raw_descriptor(ret) })
        }
    }

    /// Issues KVM_PVIOMMU_SET_CONFIG on this pvIOMMU to map `device`'s
    /// `sid_idx` to `vsid`.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>(
        &self,
        device: &T,
        sid_idx: u32,
        vsid: u32,
    ) -> Result<()> {
        let config = kvm_sys::kvm_vfio_iommu_config {
            size: mem::size_of::<kvm_sys::kvm_vfio_iommu_config>() as u32,
            device_fd: device.as_raw_descriptor(),
            sid_idx,
            vsid,
            __reserved: 0,
        };

        // SAFETY:
        // Safe as we are the owner of device and config which are valid, and we verify the return
        // value.
        let ret = unsafe { ioctl_with_ref(self, kvm_sys::KVM_PVIOMMU_SET_CONFIG, &config) };

        if ret < 0 {
            Err(VfioError::KvmPviommuSetConfig(get_error()))
        } else {
            Ok(())
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_GET_INFO for `device`; the kernel fills
    /// `info` (notably `nr_sids`) through the pointer passed in `addr`.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>(
        vm: &impl Vm,
        device: &T,
    ) -> Result<kvm_sys::kvm_vfio_iommu_info> {
        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;

        let mut info = kvm_sys::kvm_vfio_iommu_info {
            size: mem::size_of::<kvm_sys::kvm_vfio_iommu_info>() as u32,
            device_fd: device.as_raw_descriptor(),
            nr_sids: 0,
            __reserved: 0,
        };

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_GET_INFO as u64,
            addr: addr_of_mut!(info) as usize as u64,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid, and `info`
        // outlives the ioctl that writes through the pointer stored in `addr`.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            Ok(info)
        }
    }
}
308
impl AsRawDescriptor for KvmVfioPviommu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // Expose the underlying pvIOMMU FD for ioctls and for `id()`.
        self.file.as_raw_descriptor()
    }
}
314
/// IOMMU driver types a VFIO container can be configured with via
/// VFIO_SET_IOMMU. Discriminants are the kernel's extension/ioctl values.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    PkvmPviommu = VFIO_PKVM_PVIOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
327
/// A VFIO container: holds one or more `VfioGroup`s that share a single IOMMU
/// domain (mapping table).
pub struct VfioContainer {
    // Open handle to `/dev/vfio/vfio`.
    container: File,
    // Groups added to this container, keyed by IOMMU group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
    // Selected IOMMU driver type; `None` until the container is configured
    // (which happens when the first group is added).
    iommu_type: Option<IommuType>,
}
334
335fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T>
336where
337    T: FromBytes,
338{
339    Some(T::read_from_prefix(bytes.get(offset..)?).ok()?.0)
340}
341
/// VFIO userspace API version this code expects the kernel to report from
/// VFIO_GET_API_VERSION.
const VFIO_API_VERSION: u8 = 0;
impl VfioContainer {
    /// Opens `/dev/vfio/vfio` and wraps it in a new container with no groups
    /// and no IOMMU type selected yet.
    pub fn new() -> Result<Self> {
        let container = OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/vfio/vfio")
            .map_err(VfioError::OpenContainer)?;

        Self::new_from_container(container)
    }

    // Construct a VfioContainer from an existing container file, verifying
    // that the kernel speaks the expected VFIO API version.
    pub fn new_from_container(container: File) -> Result<Self> {
        // SAFETY:
        // Safe as file is vfio container descriptor and ioctl is defined by kernel.
        let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION) };
        if version as u8 != VFIO_API_VERSION {
            return Err(VfioError::VfioApiVersion);
        }

        Ok(VfioContainer {
            container,
            groups: HashMap::new(),
            iommu_type: None,
        })
    }

    /// Returns true if group `group_id` has already been added to this
    /// container.
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.contains_key(&group_id)
    }

    /// Asks the kernel (VFIO_CHECK_EXTENSION) whether this container supports
    /// IOMMU driver type `val`.
    fn check_extension(&self, val: IommuType) -> bool {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION, val as c_ulong) };
        ret != 0
    }

    /// Selects IOMMU driver type `val` for this container (VFIO_SET_IOMMU);
    /// returns the raw ioctl result, 0 on success.
    fn set_iommu(&mut self, val: IommuType) -> i32 {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU, val as c_ulong) }
    }

    /// Checks support for `val`, selects it, and records the selection in
    /// `self.iommu_type` so later DMA calls can dispatch on it.
    fn set_iommu_checked(&mut self, val: IommuType) -> Result<()> {
        if !self.check_extension(val) {
            Err(VfioError::VfioIommuSupport(val))
        } else if self.set_iommu(val) != 0 {
            Err(VfioError::ContainerSetIOMMU(val, get_error()))
        } else {
            self.iommu_type = Some(val);
            Ok(())
        }
    }

    /// Maps `size` bytes of process memory at `user_addr` into the IOMMU at
    /// `iova`, read-only unless `write_en` is set. Panics if no IOMMU type
    /// has been configured yet.
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_map called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_map(iova, size, user_addr, write_en)
            }
            // pvIOMMU has no userspace-managed DMA mapping table to update.
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of `vfio_dma_map` (VFIO_IOMMU_MAP_DMA).
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    unsafe fn vfio_iommu_type1_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        let mut dma_map = vfio_iommu_type1_dma_map {
            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
            flags: VFIO_DMA_MAP_FLAG_READ,
            vaddr: user_addr,
            iova,
            size,
        };

        if write_en {
            dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
        }

        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA, &dma_map);
        if ret != 0 {
            return Err(VfioError::IommuDmaMap(get_error()));
        }

        Ok(())
    }

    /// Removes the IOMMU mapping previously established at `iova` for `size`
    /// bytes. Panics if no IOMMU type has been configured yet.
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_unmap called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_unmap(iova, size)
            }
            // pvIOMMU has no userspace-managed DMA mapping table to update.
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of `vfio_dma_unmap` (VFIO_IOMMU_UNMAP_DMA).
    fn vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
            flags: 0,
            iova,
            size,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, dma_unmap is constructed by us, and
        // we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA, &mut dma_unmap) };
        // The kernel writes back the number of bytes actually unmapped; a
        // partial unmap is treated as a failure as well.
        if ret != 0 || dma_unmap.size != size {
            return Err(VfioError::IommuDmaUnmap(get_error()));
        }

        Ok(())
    }

    /// Returns the bitmask of supported IOMMU page sizes (zero for pvIOMMU).
    /// Panics if no IOMMU type has been configured yet.
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        match self
            .iommu_type
            .expect("vfio_get_iommu_page_size_mask called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iommu_page_size_mask()
            }
            IommuType::PkvmPviommu => Ok(0),
        }
    }

    /// Type1 implementation of `vfio_get_iommu_page_size_mask`
    /// (VFIO_IOMMU_GET_INFO, `iova_pgsizes` field).
    fn vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64> {
        let mut iommu_info = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info) };
        if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        Ok(iommu_info.iova_pgsizes)
    }

    /// Returns the valid IOVA ranges for this container (empty for pvIOMMU).
    /// Panics if no IOMMU type has been configured yet.
    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        match self
            .iommu_type
            .expect("vfio_iommu_iova_get_iova_ranges called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iova_ranges()
            }
            IommuType::PkvmPviommu => Ok(Vec::new()),
        }
    }

    /// Type1 implementation of `vfio_iommu_iova_get_iova_ranges`: walks the
    /// VFIO_IOMMU_GET_INFO capability chain looking for the IOVA-range
    /// capability and converts it into `AddressRange`s.
    fn vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        // Query the buffer size needed to fetch the capabilities.
        let mut iommu_info_argsz = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info_argsz has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info_argsz) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Re-issue the ioctl with a buffer large enough (per the argsz the
        // kernel reported) for the capability chain after the fixed header.
        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
        );
        iommu_info[0].argsz = iommu_info_argsz.argsz;
        let ret =
            // SAFETY:
            // Safe as file is vfio container, iommu_info has valid values,
            // and we check the return value
            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO, iommu_info.as_mut_ptr()) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        // SAFETY:
        // Safe because we initialized iommu_info with enough space, u8 has less strict
        // alignment, and since it will no longer be mutated.
        let info_bytes = unsafe {
            std::slice::from_raw_parts(
                iommu_info.as_ptr() as *const u8,
                iommu_info_argsz.argsz as usize,
            )
        };

        if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Walk the singly-linked capability chain; a `next` offset of 0
        // terminates the list.
        let mut offset = iommu_info[0].cap_offset as usize;
        while offset != 0 {
            let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset)
                .ok_or(VfioError::IommuGetCapInfo)?;

            if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
                let iova_header =
                    extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
                        info_bytes, offset,
                    )
                    .ok_or(VfioError::IommuGetCapInfo)?;
                // The variable-length array of ranges follows the cap struct.
                let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
                let mut ret = Vec::new();
                for i in 0..iova_header.nr_iovas {
                    ret.push(
                        extract_vfio_struct::<vfio_iova_range>(
                            info_bytes,
                            range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
                        )
                        .ok_or(VfioError::IommuGetCapInfo)?,
                    );
                }
                return Ok(ret
                    .iter()
                    .map(|range| AddressRange {
                        start: range.start,
                        end: range.end,
                    })
                    .collect());
            }
            offset = header.next as usize;
        }

        // Capability chain exhausted without finding the IOVA-range cap.
        Err(VfioError::IommuGetCapInfo)
    }

    /// Picks and configures an IOMMU driver type appropriate for `iommu_dev`.
    fn set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()> {
        match iommu_dev {
            IommuDevType::CoIommu | IommuDevType::VirtioIommu => {
                // If we expect granular, dynamic mappings, try the ChromeOS Type1ChromeOS first,
                // then fall back to upstream versions.
                self.set_iommu_checked(IommuType::Type1ChromeOS)
                    .or_else(|_| self.set_iommu_checked(IommuType::Type1V2))
            }
            IommuDevType::NoIommu => self.set_iommu_checked(IommuType::Type1V2),
            IommuDevType::PkvmPviommu => self.set_iommu_checked(IommuType::PkvmPviommu),
        }
    }

    /// Returns the group `id`, creating it and registering it with both this
    /// container and the KVM VFIO device on first use. Adding the first group
    /// also triggers per-container IOMMU setup.
    fn get_group_with_vm(
        &mut self,
        id: u32,
        vm: &impl Vm,
        iommu_dev: IommuDevType,
    ) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
        if self.groups.is_empty() {
            self.set_iommu_from(iommu_dev)?;
            // Before the first group is added into container, do once per container
            // initialization. Both coiommu and virtio-iommu rely on small, dynamic
            // mappings. However, if an iommu is not enabled, then we map the entirety
            // of guest memory as a small number of large, static mappings.
            match iommu_dev {
                IommuDevType::CoIommu | IommuDevType::PkvmPviommu | IommuDevType::VirtioIommu => {}
                IommuDevType::NoIommu => {
                    for region in vm.get_memory().regions() {
                        // SAFETY:
                        // Safe because the guest regions are guaranteed not to overlap
                        unsafe {
                            self.vfio_dma_map(
                                region.guest_addr.0,
                                region.size as u64,
                                region.host_addr as u64,
                                true,
                            )
                        }?;
                    }
                }
            }
        }

        let kvm_vfio_file = create_kvm_vfio_file(vm).ok_or(VfioError::CreateVfioKvmDevice)?;
        group
            .lock()
            .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;

        self.groups.insert(id, group.clone());

        Ok(group)
    }

    /// Returns the group `id`, creating it on first use. Unlike
    /// `get_group_with_vm`, this always selects Type1V2 and does not register
    /// the group with the KVM VFIO device.
    fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));

        if self.groups.is_empty() {
            // Before the first group is added into container, do once per
            // container initialization.
            self.set_iommu_checked(IommuType::Type1V2)?;
        }

        self.groups.insert(id, group.clone());
        Ok(group)
    }

    /// Removes group `id` once its device count reaches zero, first
    /// deregistering it from the KVM VFIO device. When `reduce` is set, the
    /// group's device count is decremented before the check.
    fn remove_group(&mut self, id: u32, reduce: bool) {
        let mut remove = false;

        if let Some(group) = self.groups.get(&id) {
            if reduce {
                group.lock().reduce_device_num();
            }
            if group.lock().device_num() == 0 {
                let kvm_vfio_file = kvm_vfio_file().expect("kvm vfio file isn't created");
                if group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
                    .is_err()
                {
                    warn!("failing in remove vfio group from kvm device");
                }
                remove = true;
            }
        }

        if remove {
            self.groups.remove(&id);
        }
    }

    /// Duplicates the container FD; the caller owns the returned duplicate.
    pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
        // SAFETY: this call is safe because it doesn't modify any memory and we
        // check the return value.
        let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
        if raw_descriptor < 0 {
            Err(VfioError::ContainerDupError)
        } else {
            Ok(raw_descriptor)
        }
    }

    // Gets group ids for all groups in the container.
    pub fn group_ids(&self) -> Vec<&u32> {
        self.groups.keys().collect()
    }
}
725
impl AsRawDescriptor for VfioContainer {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // The `/dev/vfio/vfio` FD is the target of container-level ioctls.
        self.container.as_raw_descriptor()
    }
}
731
/// A VFIO group (`/dev/vfio/<id>`) bound to a container, tracking how many
/// devices have been handed out from it.
struct VfioGroup {
    // Open handle to `/dev/vfio/<id>`.
    group: File,
    // Number of devices currently obtained through this group.
    device_num: u32,
}
736
impl VfioGroup {
    /// Opens `/dev/vfio/<id>`, verifies the group is viable, and binds it to
    /// `container` via VFIO_GROUP_SET_CONTAINER.
    fn new(container: &VfioContainer, id: u32) -> Result<Self> {
        let group_path = format!("/dev/vfio/{id}");
        let group_file = OpenOptions::new()
            .read(true)
            .write(true)
            .open(Path::new(&group_path))
            .map_err(|e| VfioError::OpenGroup(e, group_path))?;

        let mut group_status = vfio_group_status {
            argsz: mem::size_of::<vfio_group_status>() as u32,
            flags: 0,
        };
        let mut ret =
            // SAFETY:
            // Safe as we are the owner of group_file and group_status which are valid value.
            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS, &mut group_status) };
        if ret < 0 {
            return Err(VfioError::GetGroupStatus(get_error()));
        }

        // NOTE(review): this requires the status flags to be exactly VIABLE;
        // any additional bit set by the kernel would also be rejected —
        // confirm an exact match (rather than a bit test) is intended.
        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
            return Err(VfioError::GroupViable);
        }

        let container_raw_descriptor = container.as_raw_descriptor();
        // SAFETY:
        // Safe as we are the owner of group_file and container_raw_descriptor which are valid
        // value, and we verify the ret value
        ret = unsafe {
            ioctl_with_ref(
                &group_file,
                VFIO_GROUP_SET_CONTAINER,
                &container_raw_descriptor,
            )
        };
        if ret < 0 {
            return Err(VfioError::GroupSetContainer(get_error()));
        }

        Ok(VfioGroup {
            group: group_file,
            device_num: 0,
        })
    }

    /// Resolves a device's IOMMU group id by reading the `iommu_group`
    /// symlink under its sysfs path; the link target's file name is the
    /// decimal group number.
    fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
        let mut uuid_path = PathBuf::new();
        uuid_path.push(sysfspath);
        uuid_path.push("iommu_group");
        let group_path = uuid_path
            .read_link()
            .map_err(|e| VfioError::ReadLink(e, uuid_path))?;
        let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
        let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let group_id = group_str
            .parse::<u32>()
            .map_err(|_| VfioError::InvalidPath)?;

        Ok(group_id)
    }

    /// Registers (`Add`) or deregisters (`Delete`) this group with the KVM
    /// VFIO pseudo-device via KVM_SET_DEVICE_ATTR; the kernel reads the group
    /// FD through the pointer passed in `addr`.
    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
        // and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr)
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }

    /// Opens the VFIO device named `name` within this group
    /// (VFIO_GROUP_GET_DEVICE_FD) and returns the new device FD.
    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // SAFETY:
        // Safe as we are the owner of self and path_ptr which are valid value.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD, path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // SAFETY:
        // Safe as ret is valid descriptor
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }

    // Increments the count of devices handed out from this group.
    fn add_device_num(&mut self) {
        self.device_num += 1;
    }

    // Decrements the count of devices handed out from this group.
    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }

    // Number of devices currently handed out from this group.
    fn device_num(&self) -> u32 {
        self.device_num
    }
}
861
impl AsRawDescriptor for VfioGroup {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // The group FD is the target of group-level ioctls.
        self.group.as_raw_descriptor()
    }
}
867
/// A helper struct for managing VFIO containers, sharing them between devices
/// according to each device's IOMMU configuration.
#[derive(Default)]
pub struct VfioContainerManager {
    /// One VFIO container shared by all VFIO devices that don't attach to any IOMMU device.
    no_iommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// For IOMMU enabled devices, all VFIO groups that share the same IOVA space are managed by
    /// one VFIO container.
    iommu_containers: Vec<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to the CoIOMMU device.
    coiommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to pKVM.
    pkvm_iommu_container: Option<Arc<Mutex<VfioContainer>>>,
}
884
885impl VfioContainerManager {
886    pub fn new() -> Self {
887        Self::default()
888    }
889
890    /// The single place to create a VFIO container for a PCI endpoint.
891    ///
892    /// The policy to determine whether an individual or a shared VFIO container
893    /// will be created for this device is governed by the physical PCI topology,
894    /// and the argument iommu_type.
895    ///
896    ///  # Arguments
897    ///
898    ///  * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
899    ///  * `iommu_type` - which type of IOMMU is enabled on this device
900    pub fn get_container<P: AsRef<Path>>(
901        &mut self,
902        iommu_type: IommuDevType,
903        sysfspath: Option<P>,
904    ) -> Result<Arc<Mutex<VfioContainer>>> {
905        match iommu_type {
906            IommuDevType::NoIommu => {
907                // One VFIO container is used for all IOMMU disabled groups.
908                if let Some(container) = &self.no_iommu_container {
909                    Ok(container.clone())
910                } else {
911                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
912                    self.no_iommu_container = Some(container.clone());
913                    Ok(container)
914                }
915            }
916            IommuDevType::VirtioIommu => {
917                let path = sysfspath.ok_or(VfioError::InvalidPath)?;
918                let group_id = VfioGroup::get_group_id(path)?;
919
920                // One VFIO container is used for all devices that belong to one VFIO group.
921                // NOTE: vfio_wrapper relies on each container containing exactly one group.
922                if let Some(container) = self
923                    .iommu_containers
924                    .iter()
925                    .find(|container| container.lock().is_group_set(group_id))
926                {
927                    Ok(container.clone())
928                } else {
929                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
930                    self.iommu_containers.push(container.clone());
931                    Ok(container)
932                }
933            }
934            IommuDevType::CoIommu => {
935                // One VFIO container is used for devices attached to CoIommu
936                if let Some(container) = &self.coiommu_container {
937                    Ok(container.clone())
938                } else {
939                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
940                    self.coiommu_container = Some(container.clone());
941                    Ok(container)
942                }
943            }
944            IommuDevType::PkvmPviommu => {
945                // One VFIO container is used for devices attached to pKVM
946                if let Some(container) = &self.pkvm_iommu_container {
947                    Ok(container.clone())
948                } else {
949                    let container = Arc::new(Mutex::new(VfioContainer::new()?));
950                    self.pkvm_iommu_container = Some(container.clone());
951                    Ok(container)
952                }
953            }
954        }
955    }
956}
957
/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    /// Legacy INTx interrupt.
    Intx,
    /// Message Signaled Interrupts.
    Msi,
    /// Extended Message Signaled Interrupts.
    Msix,
}
964
/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    /// `vfio_irq_info` flags reported by the kernel for this interrupt.
    pub flags: u32,
    /// Index of this interrupt within the device's interrupt list.
    pub index: u32,
}
970
/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// Region number (index into the device's region list).
    pub index: usize,
    /// Byte offset within the region.
    pub addr: u64,
}
979
/// Describes a single memory region exposed by a VFIO device.
#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap (VFIO_REGION_INFO_FLAG_*)
    flags: u32,
    // total size of the region in bytes
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
    // if true, then the caller can safely mmap the MSIX region
    // if false, the caller should remove the MSIX part of the region before mmapping
    msix_region_mmappable: bool,
}
995
/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    // the opened VFIO device file, used for all device-scoped ioctls
    dev: File,
    // device name taken from the final component of its sysfs path
    name: String,
    // the VFIO container this device's group is attached to
    container: Arc<Mutex<VfioContainer>>,
    // PCI or platform device, determined from VFIO_DEVICE_GET_INFO flags
    dev_type: VfioDeviceType,
    // raw descriptor of the VFIO group this device belongs to
    group_descriptor: RawDescriptor,
    // IOMMU group id parsed from sysfs
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    // number of interrupts reported by VFIO_DEVICE_GET_INFO
    num_irqs: u32,

    // allocator over the IOVA ranges reported by the container's IOMMU
    iova_alloc: Arc<Mutex<AddressAllocator>>,
    // device-tree symbol (node label) for this device, if any
    dt_symbol: Option<String>,
    // pvIOMMU instance and the virtual SIDs assigned to this device (pKVM only)
    pviommu: Option<(Arc<Mutex<KvmVfioPviommu>>, Vec<u32>)>,
}
1012
1013impl VfioDevice {
    /// Create a new vfio device, then guest read/write on this device could be
    /// transfered into kernel vfio.
    /// sysfspath specify the vfio device path in sys file system.
    ///
    /// Unlike `new`, the group is obtained via `get_group_with_vm` (which has
    /// access to `vm`), and for `IommuDevType::PkvmPviommu` a pvIOMMU is created
    /// and the device attached to it with randomly chosen virtual SIDs.
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_dev: IommuDevType,
        dt_symbol: Option<String>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_dev)?;
        // The device name is the final path component, e.g. "0000:02:00.0".
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // Build an allocator over the IOVA ranges the container's IOMMU reports,
        // so later mappings only come from valid address space.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        let pviommu = if matches!(iommu_dev, IommuDevType::PkvmPviommu) {
            // We currently have a 1-to-1 mapping between pvIOMMUs and VFIO devices.
            let pviommu = KvmVfioPviommu::new(vm)?;

            // Draw one distinct random virtual SID per stream ID; `sample`
            // yields unique indices in 0..u32::MAX.
            let vsids_len = KvmVfioPviommu::get_sid_count(vm, &dev)?.try_into().unwrap();
            let max_vsid = u32::MAX.try_into().unwrap();
            let random_vsids = sample(&mut rand::rng(), max_vsid, vsids_len).into_iter();
            let vsids = Vec::from_iter(random_vsids.map(|v| u32::try_from(v).unwrap()));
            for (i, vsid) in vsids.iter().enumerate() {
                pviommu.attach(&dev, i.try_into().unwrap(), *vsid)?;
            }

            Some((Arc::new(Mutex::new(pviommu)), vsids))
        } else {
            None
        };

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol,
            pviommu,
        })
    }
1076
1077    pub fn new<P: AsRef<Path>>(
1078        sysfspath: &P,
1079        container: Arc<Mutex<VfioContainer>>,
1080    ) -> Result<Self> {
1081        let group_id = VfioGroup::get_group_id(sysfspath)?;
1082        let group = container.lock().get_group(group_id)?;
1083        let name_osstr = sysfspath
1084            .as_ref()
1085            .file_name()
1086            .ok_or(VfioError::InvalidPath)?;
1087        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
1088        let name = String::from(name_str);
1089
1090        let dev = match group.lock().get_device(&name) {
1091            Ok(dev) => dev,
1092            Err(e) => {
1093                container.lock().remove_group(group_id, false);
1094                return Err(e);
1095            }
1096        };
1097        let (dev_info, dev_type) = match Self::get_device_info(&dev) {
1098            Ok(dev_info) => dev_info,
1099            Err(e) => {
1100                container.lock().remove_group(group_id, false);
1101                return Err(e);
1102            }
1103        };
1104        let regions = match Self::get_regions(&dev, dev_info.num_regions) {
1105            Ok(regions) => regions,
1106            Err(e) => {
1107                container.lock().remove_group(group_id, false);
1108                return Err(e);
1109            }
1110        };
1111        group.lock().add_device_num();
1112        let group_descriptor = group.lock().as_raw_descriptor();
1113
1114        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
1115        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
1116            .map_err(VfioError::Resources)?;
1117
1118        Ok(VfioDevice {
1119            dev,
1120            name,
1121            container,
1122            dev_type,
1123            group_descriptor,
1124            group_id,
1125            regions,
1126            num_irqs: dev_info.num_irqs,
1127            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
1128            dt_symbol: None,
1129            pviommu: None,
1130        })
1131    }
1132
    /// Returns the file for this device, usable for device-scoped ioctls.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }
1137
    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    ///
    /// NOTE(review): the name is taken verbatim from the device's sysfs path, so
    /// for platform devices it is the sysfs directory name, not a PCI address.
    /// (`&String` return kept for interface compatibility.)
    pub fn device_name(&self) -> &String {
        &self.name
    }
1142
    /// Returns the type of this VFIO device (PCI or platform), as determined
    /// from the flags reported by `VFIO_DEVICE_GET_INFO`.
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }
1147
    /// Returns the DT symbol (node label) of this VFIO device, if one was
    /// supplied at construction time.
    pub fn dt_symbol(&self) -> Option<&str> {
        self.dt_symbol.as_deref()
    }
1152
1153    /// Returns the type and indentifier (if applicable) of the IOMMU used by this VFIO device and
1154    /// its master IDs.
1155    pub fn iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])> {
1156        // We currently only report IommuDevType::PkvmPviommu.
1157        if let Some((ref pviommu, ref ids)) = self.pviommu {
1158            Some((
1159                IommuDevType::PkvmPviommu,
1160                Some(pviommu.lock().id()),
1161                ids.as_ref(),
1162            ))
1163        } else {
1164            None
1165        }
1166    }
1167
1168    /// Probes support for VFIO LOW_POWER features.
1169    pub fn supports_pm_low_power(&self) -> bool {
1170        if self.probe_pm_low_power_entry().is_err() {
1171            false
1172        } else if self.probe_pm_low_power_exit().is_err() {
1173            warn!("VFIO supports LOW_POWER_ENTRY but not LOW_POWER_EXIT: ignoring feature");
1174            false
1175        } else {
1176            true
1177        }
1178    }
1179
    /// enter the device's low power state
    ///
    /// Issues `VFIO_DEVICE_FEATURE` with the SET + LOW_POWER_ENTRY flags.
    pub fn pm_low_power_enter(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY)
            .map_err(VfioError::VfioPmLowPowerEnter)
    }
1185
    /// enter the device's low power state with wakeup notification
    ///
    /// `wakeup_evt`'s descriptor is passed to the kernel as the feature
    /// payload's `wakeup_eventfd`.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        // Allocate the vfio_device_feature header plus trailing payload bytes
        // in a single buffer, as the ioctl expects.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        // SAFETY:
        // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
        unsafe {
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // SAFETY:
        // Safe as we are the owner of self and device_feature which are valid values.
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1217
    /// exit the device's low power state
    ///
    /// Issues `VFIO_DEVICE_FEATURE` with the SET + LOW_POWER_EXIT flags.
    pub fn pm_low_power_exit(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT)
            .map_err(VfioError::VfioPmLowPowerExit)
    }
1223
    /// Checks whether LOW_POWER_ENTRY is supported; the PROBE flag only queries
    /// support without changing device state.
    fn probe_pm_low_power_entry(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY)
            .map_err(VfioError::VfioProbePmLowPowerEntry)
    }
1228
    /// Checks whether LOW_POWER_EXIT is supported; the PROBE flag only queries
    /// support without changing device state.
    fn probe_pm_low_power_exit(&self) -> Result<()> {
        self.device_feature(VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT)
            .map_err(VfioError::VfioProbePmLowPowerExit)
    }
1233
    /// Issues a `VFIO_DEVICE_FEATURE` ioctl carrying only a header with the
    /// given `flags` (no payload); used for SET/PROBE of simple features.
    fn device_feature(&self, flags: u32) -> result::Result<(), Error> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = flags;
        // SAFETY:
        // Safe as we are the owner of self and device_feature which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(get_error())
        } else {
            Ok(())
        }
    }
1247
1248    /// call _DSM from the device's ACPI table
1249    pub fn acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>> {
1250        let count = args.len();
1251        let mut dsm = vec_with_array_field::<vfio_acpi_dsm, u8>(count);
1252        dsm[0].argsz = (mem::size_of::<vfio_acpi_dsm>() + mem::size_of_val(args)) as u32;
1253        dsm[0].padding = 0;
1254        // SAFETY:
1255        // Safe as we allocated enough space to hold args
1256        unsafe {
1257            dsm[0].args.as_mut_slice(count).clone_from_slice(args);
1258        }
1259        // SAFETY:
1260        // Safe as we are the owner of self and dsm which are valid value
1261        let ret = unsafe { ioctl_with_mut_ref(&self.dev, VFIO_DEVICE_ACPI_DSM, &mut dsm[0]) };
1262        if ret < 0 {
1263            Err(VfioError::VfioAcpiDsm(get_error()))
1264        } else {
1265            // SAFETY:
1266            // Safe as we allocated enough space to hold args
1267            let res = unsafe { dsm[0].args.as_slice(count) };
1268            Ok(res.to_vec())
1269        }
1270    }
1271
    /// Enable vfio device's ACPI notifications and associate EventFD with device.
    ///
    /// `index` selects the notification interrupt; the eventfd is installed via
    /// `VFIO_DEVICE_SET_IRQS` with DATA_EVENTFD + ACTION_TRIGGER.
    pub fn acpi_notification_evt_enable(
        &self,
        acpi_notification_eventfd: &Event,
        index: u32,
    ) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let count = 1;

        // Allocate the irq_set header plus one u32 payload slot for the eventfd.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = count as u32;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        data.copy_from_slice(&acpi_notification_eventfd.as_raw_descriptor().to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationEnable(get_error()))
        } else {
            Ok(())
        }
    }
1302
    /// Disable vfio device's ACPI notification and disconnect EventFd with device.
    pub fn acpi_notification_disable(&self, index: u32) -> Result<()> {
        // DATA_NONE with count == 0 tears down the trigger for this index
        // (per the VFIO uapi for VFIO_DEVICE_SET_IRQS).
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationDisable(get_error()))
        } else {
            Ok(())
        }
    }
1321
    /// Test vfio device's ACPI notification by simulating hardware triggering.
    /// When the signaling mechanism is set, the VFIO_IRQ_SET_DATA_BOOL can be used with
    /// VFIO_IRQ_SET_ACTION_TRIGGER to perform kernel level interrupt loopback testing.
    pub fn acpi_notification_test(&self, index: u32, val: u32) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(u32_size) };
        // `val` is the boolean trigger payload consumed by the kernel.
        data.copy_from_slice(&val.to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationTest(get_error()))
        } else {
            Ok(())
        }
    }
1348
1349    /// Enable vfio device's irq and associate Irqfd Event with device.
1350    /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to
1351    /// subindex + descriptors length will be assigned with irqfd in the descriptors array.
1352    /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical
1353    /// device is removed.
1354    /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
1355    /// interrupts if already assigned or skip un-assigned interrupts.
1356    pub fn irq_enable(
1357        &self,
1358        descriptors: &[Option<&Event>],
1359        index: u32,
1360        subindex: u32,
1361    ) -> Result<()> {
1362        let count = descriptors.len();
1363        let u32_size = mem::size_of::<u32>();
1364        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
1365        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
1366        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
1367        irq_set[0].index = index;
1368        irq_set[0].start = subindex;
1369        irq_set[0].count = count as u32;
1370
1371        // SAFETY:
1372        // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
1373        // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1374        // together as u32. It is safe as enough space is reserved through
1375        // vec_with_array_field(u32)<count>.
1376        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
1377        for descriptor in descriptors.iter().take(count) {
1378            let (left, right) = data.split_at_mut(u32_size);
1379            match descriptor {
1380                Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
1381                None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
1382            }
1383            data = right;
1384        }
1385
1386        // SAFETY:
1387        // Safe as we are the owner of self and irq_set which are valid value
1388        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1389        if ret < 0 {
1390            Err(VfioError::VfioIrqEnable(get_error()))
1391        } else {
1392            Ok(())
1393        }
1394    }
1395
1396    /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
1397    /// is used to get guest EOI notification.
1398    /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
1399    /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
1400    /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1401    /// generate another interrupts.
1402    /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1403    ///
1404    /// descriptor: should be resample IrqFd.
1405    pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1406        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1407        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1408        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1409        irq_set[0].index = index;
1410        irq_set[0].start = 0;
1411        irq_set[0].count = 1;
1412
1413        {
1414            // SAFETY:
1415            // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1416            // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1417            // together as u32. It is safe as enough space is reserved through
1418            // vec_with_array_field(u32)<1>.
1419            let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1420            descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1421        }
1422
1423        // SAFETY:
1424        // Safe as we are the owner of self and irq_set which are valid value
1425        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1426        if ret < 0 {
1427            Err(VfioError::VfioIrqEnable(get_error()))
1428        } else {
1429            Ok(())
1430        }
1431    }
1432
    /// disable vfio device's irq and disconnect Irqfd Event with device
    pub fn irq_disable(&self, index: u32) -> Result<()> {
        // DATA_NONE with count == 0 disables all vectors of this index
        // (per the VFIO uapi for VFIO_DEVICE_SET_IRQS).
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqDisable(get_error()))
        } else {
            Ok(())
        }
    }
1451
    /// Unmask vfio device irq
    pub fn irq_unmask(&self, index: u32) -> Result<()> {
        // Unmask a single vector (start 0, count 1) of the given index.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqUnmask(get_error()))
        } else {
            Ok(())
        }
    }
1470
    /// Mask vfio device irq
    pub fn irq_mask(&self, index: u32) -> Result<()> {
        // Mask a single vector (start 0, count 1) of the given index.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqMask(get_error()))
        } else {
            Ok(())
        }
    }
1489
    /// Get and validate VFIO device information.
    fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
        let mut dev_info = vfio_device_info {
            argsz: mem::size_of::<vfio_device_info>() as u32,
            flags: 0,
            num_regions: 0,
            num_irqs: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as we are the owner of device_file and dev_info which are valid value,
        // and we verify the return value.
        let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO, &mut dev_info) };
        if ret < 0 {
            return Err(VfioError::VfioDeviceGetInfo(get_error()));
        }

        let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
            // A PCI device must expose at least the config region and the
            // standard interrupt indexes up to MSI-X.
            if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
                || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
            {
                // NOTE(review): the ioctl succeeded here, so `get_error()` reports
                // a stale errno; a dedicated error variant would be clearer.
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            VfioDeviceType::Pci
        } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
            VfioDeviceType::Platform
        } else {
            return Err(VfioError::UnknownDeviceType(dev_info.flags));
        };

        Ok((dev_info, dev_type))
    }
1524
    /// Query interrupt information
    /// return: Vector of interrupts information, each of which contains flags and index
    pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
        let mut irqs: Vec<VfioIrq> = Vec::new();

        for i in 0..self.num_irqs {
            let argsz = mem::size_of::<vfio_irq_info>() as u32;
            let mut irq_info = vfio_irq_info {
                argsz,
                flags: 0,
                index: i,
                count: 0,
            };
            // SAFETY:
            // Safe as we are the owner of dev and irq_info which are valid value,
            // and we verify the return value.
            let ret = unsafe {
                ioctl_with_mut_ref(self.device_file(), VFIO_DEVICE_GET_IRQ_INFO, &mut irq_info)
            };
            // Only single-interrupt indexes are supported by this helper.
            // NOTE(review): when `count != 1` the ioctl may have succeeded, so
            // `get_error()` can report a stale errno here.
            if ret < 0 || irq_info.count != 1 {
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            let irq = VfioIrq {
                flags: irq_info.flags,
                index: irq_info.index,
            };
            irqs.push(irq);
        }
        Ok(irqs)
    }
1556
1557    #[allow(clippy::cast_ptr_alignment)]
1558    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
1559        let mut regions: Vec<VfioRegion> = Vec::new();
1560        for i in 0..num_regions {
1561            let argsz = mem::size_of::<vfio_region_info>() as u32;
1562            let mut reg_info = vfio_region_info {
1563                argsz,
1564                flags: 0,
1565                index: i,
1566                cap_offset: 0,
1567                size: 0,
1568                offset: 0,
1569            };
1570            let ret =
1571                // SAFETY:
1572                // Safe as we are the owner of dev and reg_info which are valid value,
1573                // and we verify the return value.
1574                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO, &mut reg_info) };
1575            if ret < 0 {
1576                continue;
1577            }
1578
1579            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
1580            let mut cap_info: Option<(u32, u32)> = None;
1581            let mut msix_region_mmappable = false;
1582            if reg_info.argsz > argsz {
1583                let cap_len: usize = (reg_info.argsz - argsz) as usize;
1584                let mut region_with_cap =
1585                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
1586                region_with_cap[0].region_info.argsz = reg_info.argsz;
1587                region_with_cap[0].region_info.flags = 0;
1588                region_with_cap[0].region_info.index = i;
1589                region_with_cap[0].region_info.cap_offset = 0;
1590                region_with_cap[0].region_info.size = 0;
1591                region_with_cap[0].region_info.offset = 0;
1592                // SAFETY:
1593                // Safe as we are the owner of dev and region_info which are valid value,
1594                // and we verify the return value.
1595                let ret = unsafe {
1596                    ioctl_with_mut_ref(
1597                        dev,
1598                        VFIO_DEVICE_GET_REGION_INFO,
1599                        &mut (region_with_cap[0].region_info),
1600                    )
1601                };
1602                if ret < 0 {
1603                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
1604                }
1605
1606                // Some drivers (e.g. for NVIDIA vGPUs) do not fully populate the
1607                // `vfio_region_info` structure in response to the
1608                // `VFIO_DEVICE_GET_REGION_INFO` call if the passed size is not enough
1609                // to hold the entirety of the data.
1610                // This ensures we use complete data when we construct the `VfioRegion`
1611                // instance.
1612                reg_info = region_with_cap[0].region_info;
1613
1614                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
1615                    continue;
1616                }
1617
1618                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
1619                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
1620                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
1621                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
1622                let region_info_sz = reg_info.argsz;
1623
1624                // region_with_cap[0].cap_info may contain many structures, like
1625                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
1626                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
1627                // vfio_into_cap_header.
1628                // Go through all the cap structs.
1629                let info_ptr = region_with_cap.as_ptr() as *mut u8;
1630                let mut offset = region_with_cap[0].region_info.cap_offset;
1631                while offset != 0 {
1632                    if offset + cap_header_sz > region_info_sz {
1633                        break;
1634                    }
1635                    // SAFETY:
1636                    // Safe, as cap_header struct is in this function allocated region_with_cap
1637                    // vec.
1638                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
1639                    // SAFETY:
1640                    // Safe, as cap_header struct is in this function allocated region_with_cap
1641                    // vec.
1642                    let cap_header = unsafe { &*(cap_ptr as *const vfio_info_cap_header) };
1643                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
1644                        if offset + mmap_cap_sz > region_info_sz {
1645                            break;
1646                        }
1647                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
1648                        let sparse_mmap =
1649                            // SAFETY:
1650                            // Safe, this vfio_region_info_cap_sparse_mmap is in this function
1651                            // allocated region_with_cap vec.
1652                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_sparse_mmap) };
1653
1654                        let area_num = sparse_mmap.nr_areas;
1655                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
1656                            break;
1657                        }
1658                        let areas =
1659                            // SAFETY:
1660                            // Safe, these vfio_region_sparse_mmap_area are in this function allocated
1661                            // region_with_cap vec.
1662                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
1663                        for area in areas.iter() {
1664                            mmaps.push(*area);
1665                        }
1666
1667                        // Sparse regions means the driver can decide which parts of the BAR are
1668                        // safe to mmap. If that overlaps with the MSIX
1669                        // data, that's the decision of the driver.
1670                        // This is required for some devices (e.g. NVIDIA vGPUs).
1671                        msix_region_mmappable = true;
1672                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
1673                        if offset + type_cap_sz > region_info_sz {
1674                            break;
1675                        }
1676                        // cap_ptr is vfio_region_info_cap_type here
1677                        let cap_type_info =
1678                            // SAFETY:
1679                            // Safe, this vfio_region_info_cap_type is in this function allocated
1680                            // region_with_cap vec
1681                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_type) };
1682
1683                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
1684                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
1685                        mmaps.push(vfio_region_sparse_mmap_area {
1686                            offset: 0,
1687                            size: region_with_cap[0].region_info.size,
1688                        });
1689                        msix_region_mmappable = true;
1690                    }
1691
1692                    offset = cap_header.next;
1693                }
1694            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1695                mmaps.push(vfio_region_sparse_mmap_area {
1696                    offset: 0,
1697                    size: reg_info.size,
1698                });
1699            }
1700
1701            let region = VfioRegion {
1702                flags: reg_info.flags,
1703                size: reg_info.size,
1704                offset: reg_info.offset,
1705                mmaps,
1706                cap_info,
1707                msix_region_mmappable,
1708            };
1709            regions.push(region);
1710        }
1711
1712        Ok(regions)
1713    }
1714
1715    /// get a region's flag
1716    /// the return's value may conatin:
1717    ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1718    ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1719    ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1720    ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
1721    pub fn get_region_flags(&self, index: usize) -> u32 {
1722        match self.regions.get(index) {
1723            Some(v) => v.flags,
1724            None => {
1725                warn!("get_region_flags() with invalid index: {}", index);
1726                0
1727            }
1728        }
1729    }
1730
1731    /// get a region's offset
1732    /// return: Region offset from the start of vfio device descriptor
1733    pub fn get_region_offset(&self, index: usize) -> u64 {
1734        match self.regions.get(index) {
1735            Some(v) => v.offset,
1736            None => {
1737                warn!("get_region_offset with invalid index: {}", index);
1738                0
1739            }
1740        }
1741    }
1742
1743    /// get a region's size
1744    /// return: Region size from the start of vfio device descriptor
1745    pub fn get_region_size(&self, index: usize) -> u64 {
1746        match self.regions.get(index) {
1747            Some(v) => v.size,
1748            None => {
1749                warn!("get_region_size with invalid index: {}", index);
1750                0
1751            }
1752        }
1753    }
1754
1755    /// get a number of regions
1756    /// return: Number of regions of vfio device descriptor
1757    pub fn get_region_count(&self) -> usize {
1758        self.regions.len()
1759    }
1760
1761    /// get a region's mmap info vector
1762    pub fn get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area> {
1763        match self.regions.get(index) {
1764            Some(v) => v.mmaps.clone(),
1765            None => {
1766                warn!("get_region_mmap with invalid index: {}", index);
1767                Vec::new()
1768            }
1769        }
1770    }
1771
1772    /// get if the MSIX data with a region is safe to mmap, or if it should be removed
1773    /// before mmapping
1774    pub fn get_region_msix_mmappable(&self, index: usize) -> bool {
1775        match self.regions.get(index) {
1776            Some(v) => v.msix_region_mmappable,
1777            None => {
1778                warn!("get_region_msix_mmappable with invalid index: {}", index);
1779                false
1780            }
1781        }
1782    }
1783
1784    /// find the specified cap type in device regions
1785    /// Input:
1786    ///      type_:  cap type
1787    ///      sub_type: cap sub_type
1788    /// Output:
1789    ///     None: device doesn't have the specified cap type
1790    ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1791    ///                                     index and size
1792    pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1793        for (index, region) in self.regions.iter().enumerate() {
1794            if let Some(cap_info) = &region.cap_info {
1795                if cap_info.0 == type_ && cap_info.1 == sub_type {
1796                    return Some((index as u32, region.size));
1797                }
1798            }
1799        }
1800
1801        None
1802    }
1803
1804    /// Returns file offset corresponding to the given `VfioRegionAddr`.
1805    /// The offset can be used when reading/writing the VFIO device's FD directly.
1806    pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1807        let region = self
1808            .regions
1809            .get(addr.index)
1810            .ok_or(VfioError::InvalidIndex(addr.index))?;
1811        Ok(region.offset + addr.addr)
1812    }
1813
1814    /// Read region's data from VFIO device into buf
1815    /// index: region num
1816    /// buf: data destination and buf length is read size
1817    /// addr: offset in the region
1818    pub fn region_read(&self, index: usize, buf: &mut [u8], addr: u64) {
1819        let stub: &VfioRegion = self
1820            .regions
1821            .get(index)
1822            .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {index}"));
1823
1824        let size = buf.len() as u64;
1825        if size > stub.size || addr + size > stub.size {
1826            panic!(
1827                "tried to read VFIO region with invalid arguments: index={index}, addr=0x{addr:x}, size=0x{size:x}"
1828            );
1829        }
1830
1831        self.dev
1832            .read_exact_at(buf, stub.offset + addr)
1833            .unwrap_or_else(|e| {
1834                panic!("failed to read region: index={index}, addr=0x{addr:x}, error={e}")
1835            });
1836    }
1837
    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    ///
    /// Panics (via `region_read`) if the resulting range is out of bounds for
    /// the region or the underlying read fails.
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        // Stage the read into zeroed storage exactly `size_of::<T>()` bytes
        // long, then reinterpret it as `T` once the bytes are filled in.
        let mut val = mem::MaybeUninit::zeroed();
        let buf =
            // SAFETY:
            // Safe because we have zero-initialized `size_of::<T>()` bytes.
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        self.region_read(addr.index, buf, addr.addr + offset);
        // SAFETY:
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }
1850
1851    /// write the data from buf into a vfio device region
1852    /// index: region num
1853    /// buf: data src and buf length is write size
1854    /// addr: offset in the region
1855    pub fn region_write(&self, index: usize, buf: &[u8], addr: u64) {
1856        let stub: &VfioRegion = self
1857            .regions
1858            .get(index)
1859            .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {index}"));
1860
1861        let size = buf.len() as u64;
1862        if size > stub.size
1863            || addr + size > stub.size
1864            || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1865        {
1866            panic!(
1867                "tried to write VFIO region with invalid arguments: index={index}, addr=0x{addr:x}, size=0x{size:x}"
1868            );
1869        }
1870
1871        self.dev
1872            .write_all_at(buf, stub.offset + addr)
1873            .unwrap_or_else(|e| {
1874                panic!("failed to write region: index={index}, addr=0x{addr:x}, error={e}")
1875            });
1876    }
1877
1878    /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
1879    pub fn region_write_to_addr(&self, data: &[u8], addr: &VfioRegionAddr, offset: u64) {
1880        self.region_write(addr.index, data, addr.addr + offset);
1881    }
1882
1883    /// get vfio device's descriptors which are passed into minijail process
1884    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1885        vec![
1886            self.dev.as_raw_descriptor(),
1887            self.group_descriptor,
1888            self.container.lock().as_raw_descriptor(),
1889        ]
1890    }
1891
1892    /// Add (iova, user_addr) map into vfio container iommu table
1893    /// # Safety
1894    ///
1895    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
1896    pub unsafe fn vfio_dma_map(
1897        &self,
1898        iova: u64,
1899        size: u64,
1900        user_addr: u64,
1901        write_en: bool,
1902    ) -> Result<()> {
1903        self.container
1904            .lock()
1905            .vfio_dma_map(iova, size, user_addr, write_en)
1906    }
1907
1908    /// Remove (iova, user_addr) map from vfio container iommu table
1909    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
1910        self.container.lock().vfio_dma_unmap(iova, size)
1911    }
1912
1913    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
1914        self.container.lock().vfio_get_iommu_page_size_mask()
1915    }
1916
1917    pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1918        self.iova_alloc
1919            .lock()
1920            .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1921            .map_err(VfioError::Resources)
1922    }
1923
1924    pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1925        self.iova_alloc.lock().get(alloc).map(|res| res.0)
1926    }
1927
1928    pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1929        self.iova_alloc
1930            .lock()
1931            .release(alloc)
1932            .map_err(VfioError::Resources)
1933    }
1934
1935    pub fn get_max_addr(&self) -> u64 {
1936        self.iova_alloc.lock().get_max_addr()
1937    }
1938
    /// Gets the vfio device backing `File`.
    ///
    /// This is the same file that region reads/writes go through.
    pub fn device_file(&self) -> &File {
        &self.dev
    }
1943
1944    /// close vfio device
1945    pub fn close(&self) {
1946        self.container.lock().remove_group(self.group_id, true);
1947    }
1948}
1949
/// Accessor for the PCI configuration space region of a `VfioDevice`
/// (region `VFIO_PCI_CONFIG_REGION_INDEX`).
pub struct VfioPciConfig {
    // Shared handle to the underlying VFIO device whose config space is read/written.
    device: Arc<VfioDevice>,
}
1953
1954impl VfioPciConfig {
1955    pub fn new(device: Arc<VfioDevice>) -> Self {
1956        VfioPciConfig { device }
1957    }
1958
1959    pub fn read_config<T: IntoBytes + FromBytes>(&self, offset: u32) -> T {
1960        let mut config = T::new_zeroed();
1961        self.device.region_read(
1962            VFIO_PCI_CONFIG_REGION_INDEX as usize,
1963            config.as_mut_bytes(),
1964            offset.into(),
1965        );
1966        config
1967    }
1968
1969    pub fn write_config<T: Immutable + IntoBytes>(&self, config: T, offset: u32) {
1970        self.device.region_write(
1971            VFIO_PCI_CONFIG_REGION_INDEX as usize,
1972            config.as_bytes(),
1973            offset.into(),
1974        );
1975    }
1976
1977    /// Set the VFIO device this config refers to as the bus master.
1978    pub fn set_bus_master(&self) {
1979        /// Constant definitions from `linux/pci_regs.h`.
1980        const PCI_COMMAND: u32 = 0x4;
1981        /// Enable bus mastering
1982        const PCI_COMMAND_MASTER: u16 = 0x4;
1983
1984        let mut cmd: u16 = self.read_config(PCI_COMMAND);
1985
1986        if cmd & PCI_COMMAND_MASTER != 0 {
1987            return;
1988        }
1989
1990        cmd |= PCI_COMMAND_MASTER;
1991
1992        self.write_config(cmd, PCI_COMMAND);
1993    }
1994}
1995
impl AsRawDescriptor for VfioDevice {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // Expose the raw descriptor of the underlying VFIO device file.
        self.dev.as_raw_descriptor()
    }
}