devices/pci/
vfio_pci.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::cmp::max;
6use std::cmp::Reverse;
7use std::collections::BTreeMap;
8use std::collections::BTreeSet;
9use std::path::Path;
10use std::path::PathBuf;
11use std::str::FromStr;
12use std::sync::Arc;
13
14use acpi_tables::aml::Aml;
15use base::debug;
16use base::error;
17use base::pagesize;
18use base::warn;
19use base::AsRawDescriptor;
20use base::AsRawDescriptors;
21use base::Event;
22use base::EventToken;
23use base::MemoryMapping;
24use base::Protection;
25use base::RawDescriptor;
26use base::Tube;
27use base::WaitContext;
28use base::WorkerThread;
29use hypervisor::MemCacheType;
30use resources::AddressRange;
31use resources::Alloc;
32use resources::AllocOptions;
33use resources::MmioType;
34use resources::SystemAllocator;
35use sync::Mutex;
36use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
37use vfio_sys::*;
38use vm_control::api::VmMemoryClient;
39use vm_control::HotPlugDeviceInfo;
40use vm_control::HotPlugDeviceType;
41use vm_control::PciId;
42use vm_control::VmMemoryDestination;
43use vm_control::VmMemoryRegionId;
44use vm_control::VmMemorySource;
45use vm_control::VmRequest;
46use vm_control::VmResponse;
47
48use crate::pci::acpi::DeviceVcfgRegister;
49use crate::pci::acpi::DsmMethod;
50use crate::pci::acpi::PowerResourceMethod;
51use crate::pci::acpi::SHM_OFFSET;
52use crate::pci::msi::MsiConfig;
53use crate::pci::msi::MsiStatus;
54use crate::pci::msi::PCI_MSI_FLAGS;
55use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
56use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
57use crate::pci::msi::PCI_MSI_NEXT_POINTER;
58use crate::pci::msix::MsixConfig;
59use crate::pci::msix::MsixStatus;
60use crate::pci::msix::BITS_PER_PBA_ENTRY;
61use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
62use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
63use crate::pci::pci_device::BarRange;
64use crate::pci::pci_device::Error as PciDeviceError;
65use crate::pci::pci_device::PciDevice;
66use crate::pci::pci_device::PreferredIrq;
67use crate::pci::pm::PciPmCap;
68use crate::pci::pm::PmConfig;
69use crate::pci::pm::PM_CAP_LENGTH;
70use crate::pci::PciAddress;
71use crate::pci::PciBarConfiguration;
72use crate::pci::PciBarIndex;
73use crate::pci::PciBarPrefetchable;
74use crate::pci::PciBarRegionType;
75use crate::pci::PciCapabilityID;
76use crate::pci::PciClassCode;
77use crate::pci::PciInterruptPin;
78use crate::pci::PCI_VCFG_DSM;
79use crate::pci::PCI_VCFG_NOTY;
80use crate::pci::PCI_VCFG_PM;
81use crate::pci::PCI_VENDOR_ID_INTEL;
82use crate::vfio::VfioDevice;
83use crate::vfio::VfioError;
84use crate::vfio::VfioIrqType;
85use crate::vfio::VfioPciConfig;
86use crate::IrqLevelEvent;
87use crate::Suspendable;
88
89const PCI_VENDOR_ID: u32 = 0x0;
90const PCI_DEVICE_ID: u32 = 0x2;
91const PCI_COMMAND: u32 = 0x4;
92const PCI_COMMAND_MEMORY: u8 = 0x2;
93const PCI_BASE_CLASS_CODE: u32 = 0x0B;
94const PCI_INTERRUPT_NUM: u32 = 0x3C;
95const PCI_INTERRUPT_PIN: u32 = 0x3D;
96
97const PCI_CAPABILITY_LIST: u32 = 0x34;
98const PCI_CAP_ID_MSI: u8 = 0x05;
99const PCI_CAP_ID_MSIX: u8 = 0x11;
100const PCI_CAP_ID_PM: u8 = 0x01;
101
102// Size of the standard PCI config space
103const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
104// Size of the standard PCIe config space: 4KB
105const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;
106
107// Extended Capabilities
108const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
109const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
110const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
111const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
112
113struct VfioPmCap {
114    offset: u32,
115    capabilities: u32,
116    config: PmConfig,
117}
118
119impl VfioPmCap {
120    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
121        let mut capabilities: u32 = config.read_config(cap_start);
122        capabilities |= (PciPmCap::default_cap() as u32) << 16;
123        VfioPmCap {
124            offset: cap_start,
125            capabilities,
126            config: PmConfig::new(false),
127        }
128    }
129
130    pub fn should_trigger_pme(&mut self) -> bool {
131        self.config.should_trigger_pme()
132    }
133
134    fn is_pm_reg(&self, offset: u32) -> bool {
135        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
136    }
137
138    pub fn read(&self, offset: u32) -> u32 {
139        let offset = offset - self.offset;
140        if offset == 0 {
141            self.capabilities
142        } else {
143            let mut data = 0;
144            self.config.read(&mut data);
145            data
146        }
147    }
148
149    pub fn write(&mut self, offset: u64, data: &[u8]) {
150        let offset = offset - self.offset as u64;
151        if offset >= std::mem::size_of::<u32>() as u64 {
152            let offset = offset - std::mem::size_of::<u32>() as u64;
153            self.config.write(offset, data);
154        }
155    }
156}
157
158enum VfioMsiChange {
159    Disable,
160    Enable,
161    FunctionChanged,
162}
163
164struct VfioMsiCap {
165    config: MsiConfig,
166    offset: u32,
167}
168
169impl VfioMsiCap {
170    fn new(
171        config: &VfioPciConfig,
172        msi_cap_start: u32,
173        vm_socket_irq: Tube,
174        device_id: u32,
175        device_name: String,
176    ) -> Self {
177        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
178        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
179        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;
180
181        VfioMsiCap {
182            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
183            offset: msi_cap_start,
184        }
185    }
186
187    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
188        self.config.is_msi_reg(self.offset, index, len)
189    }
190
191    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
192        let offset = index as u32 - self.offset;
193        match self.config.write_msi_capability(offset, data) {
194            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
195            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
196            MsiStatus::NothingToDo => None,
197        }
198    }
199
200    fn get_msi_irqfd(&self) -> Option<&Event> {
201        self.config.get_irqfd()
202    }
203
204    fn destroy(&mut self) {
205        self.config.destroy()
206    }
207}
208
209// MSI-X registers in MSI-X capability
210const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
211const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
212const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
213const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
214const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
215const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
216const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
217const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
218
219struct VfioMsixCap {
220    config: MsixConfig,
221    offset: u32,
222    table_size: u16,
223    table_pci_bar: PciBarIndex,
224    table_offset: u64,
225    table_size_bytes: u64,
226    pba_pci_bar: PciBarIndex,
227    pba_offset: u64,
228    pba_size_bytes: u64,
229    msix_interrupt_evt: Vec<Event>,
230}
231
232impl VfioMsixCap {
233    fn new(
234        config: &VfioPciConfig,
235        msix_cap_start: u32,
236        vm_socket_irq: Tube,
237        pci_id: u32,
238        device_name: String,
239    ) -> Self {
240        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
241        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
242        let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
243        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
244        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
245        let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
246        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
247
248        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
249        if table_pci_bar == pba_pci_bar
250            && pba_offset > table_offset
251            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
252        {
253            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
254        }
255
256        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
257        let pba_size_bytes =
258            table_size.div_ceil(BITS_PER_PBA_ENTRY as u64) * MSIX_PBA_ENTRIES_MODULO;
259        let mut msix_interrupt_evt = Vec::new();
260        for _ in 0..table_size {
261            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
262        }
263        VfioMsixCap {
264            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
265            offset: msix_cap_start,
266            table_size: table_size as u16,
267            table_pci_bar,
268            table_offset,
269            table_size_bytes,
270            pba_pci_bar,
271            pba_offset,
272            pba_size_bytes,
273            msix_interrupt_evt,
274        }
275    }
276
277    // only msix control register is writable and need special handle in pci r/w
278    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
279        let control_start = self.offset + PCI_MSIX_FLAGS;
280        let control_end = control_start + 2;
281
282        offset < control_end && offset + size > control_start
283    }
284
285    fn read_msix_control(&self, data: &mut u32) {
286        *data = self.config.read_msix_capability(*data);
287    }
288
289    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
290        let old_enabled = self.config.enabled();
291        let old_masked = self.config.masked();
292
293        self.config
294            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
295
296        let new_enabled = self.config.enabled();
297        let new_masked = self.config.masked();
298
299        if !old_enabled && new_enabled {
300            Some(VfioMsiChange::Enable)
301        } else if old_enabled && !new_enabled {
302            Some(VfioMsiChange::Disable)
303        } else if new_enabled && old_masked != new_masked {
304            Some(VfioMsiChange::FunctionChanged)
305        } else {
306            None
307        }
308    }
309
310    fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
311        bar_index == self.table_pci_bar
312            && offset >= self.table_offset
313            && offset < self.table_offset + self.table_size_bytes
314    }
315
316    fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
317        if bar_index == self.table_pci_bar {
318            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
319        } else {
320            None
321        }
322    }
323
324    fn read_table(&self, offset: u64, data: &mut [u8]) {
325        let offset = offset - self.table_offset;
326        self.config.read_msix_table(offset, data);
327    }
328
329    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
330        let offset = offset - self.table_offset;
331        self.config.write_msix_table(offset, data)
332    }
333
334    fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
335        bar_index == self.pba_pci_bar
336            && offset >= self.pba_offset
337            && offset < self.pba_offset + self.pba_size_bytes
338    }
339
340    fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
341        if bar_index == self.pba_pci_bar {
342            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
343        } else {
344            None
345        }
346    }
347
348    fn read_pba(&self, offset: u64, data: &mut [u8]) {
349        let offset = offset - self.pba_offset;
350        self.config.read_pba_entries(offset, data);
351    }
352
353    fn write_pba(&mut self, offset: u64, data: &[u8]) {
354        let offset = offset - self.pba_offset;
355        self.config.write_pba_entries(offset, data);
356    }
357
358    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
359        let irqfd = self.config.get_irqfd(index);
360        if let Some(fd) = irqfd {
361            if self.msix_vector_masked(index) {
362                Some(&self.msix_interrupt_evt[index])
363            } else {
364                Some(fd)
365            }
366        } else {
367            None
368        }
369    }
370
371    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
372        let mut irqfds = Vec::new();
373
374        for i in 0..self.table_size {
375            irqfds.push(self.get_msix_irqfd(i as usize));
376        }
377
378        irqfds
379    }
380
381    fn table_size(&self) -> usize {
382        self.table_size.into()
383    }
384
385    fn clone_msix_evt(&self) -> Vec<Event> {
386        self.msix_interrupt_evt
387            .iter()
388            .map(|irq| irq.try_clone().unwrap())
389            .collect()
390    }
391
392    fn msix_vector_masked(&self, index: usize) -> bool {
393        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
394    }
395
396    fn trigger(&mut self, index: usize) {
397        self.config.trigger(index as u16);
398    }
399
400    fn destroy(&mut self) {
401        self.config.destroy()
402    }
403}
404
405impl AsRawDescriptors for VfioMsixCap {
406    fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
407        let mut rds = vec![self.config.as_raw_descriptor()];
408        rds.extend(
409            self.msix_interrupt_evt
410                .iter()
411                .map(|evt| evt.as_raw_descriptor()),
412        );
413        rds
414    }
415}
416
417struct VfioResourceAllocator {
418    // The region that is not allocated yet.
419    regions: BTreeSet<AddressRange>,
420}
421
422impl VfioResourceAllocator {
423    // Creates a new `VfioResourceAllocator` for managing VFIO resources.
424    // Can return `Err` if `base` + `size` overflows a u64.
425    //
426    // * `base` - The starting address of the range to manage.
427    // * `size` - The size of the address range in bytes.
428    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
429        if pool.is_empty() {
430            return Err(PciDeviceError::SizeZero);
431        }
432        let mut regions = BTreeSet::new();
433        regions.insert(pool);
434        Ok(VfioResourceAllocator { regions })
435    }
436
437    fn internal_allocate_from_slot(
438        &mut self,
439        slot: AddressRange,
440        range: AddressRange,
441    ) -> Result<u64, PciDeviceError> {
442        let slot_was_present = self.regions.remove(&slot);
443        assert!(slot_was_present);
444
445        let (before, after) = slot.non_overlapping_ranges(range);
446
447        if !before.is_empty() {
448            self.regions.insert(before);
449        }
450        if !after.is_empty() {
451            self.regions.insert(after);
452        }
453
454        Ok(range.start)
455    }
456
457    // Allocates a range of addresses from the managed region with a minimal alignment.
458    // Overlapping with a previous allocation is _not_ allowed.
459    // Returns allocated address.
460    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
461        if size == 0 {
462            return Err(PciDeviceError::SizeZero);
463        }
464        if !alignment.is_power_of_two() {
465            return Err(PciDeviceError::BadAlignment);
466        }
467
468        // finds first region matching alignment and size.
469        let region = self.regions.iter().find(|range| {
470            match range.start % alignment {
471                0 => range.start.checked_add(size - 1),
472                r => range.start.checked_add(size - 1 + alignment - r),
473            }
474            .is_some_and(|end| end <= range.end)
475        });
476
477        match region {
478            Some(&slot) => {
479                let start = match slot.start % alignment {
480                    0 => slot.start,
481                    r => slot.start + alignment - r,
482                };
483                let end = start + size - 1;
484                let range = AddressRange::from_start_and_end(start, end);
485
486                self.internal_allocate_from_slot(slot, range)
487            }
488            None => Err(PciDeviceError::OutOfSpace),
489        }
490    }
491
492    // Allocates a range of addresses from the managed region with a required location.
493    // Overlapping with a previous allocation is allowed.
494    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
495        if range.is_empty() {
496            return Err(PciDeviceError::SizeZero);
497        }
498
499        while let Some(&slot) = self
500            .regions
501            .iter()
502            .find(|avail_range| avail_range.overlaps(range))
503        {
504            let _address = self.internal_allocate_from_slot(slot, range)?;
505        }
506        Ok(())
507    }
508}
509
510struct VfioPciWorker {
511    address: PciAddress,
512    sysfs_path: PathBuf,
513    vm_socket: Tube,
514    name: String,
515    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
516    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
517}
518
519impl VfioPciWorker {
520    fn run(
521        &mut self,
522        req_irq_evt: Event,
523        wakeup_evt: Event,
524        acpi_notify_evt: Event,
525        kill_evt: Event,
526        msix_evt: Vec<Event>,
527        is_in_low_power: Arc<Mutex<bool>>,
528        gpe: Option<u32>,
529        notification_val: Arc<Mutex<Vec<u32>>>,
530    ) {
531        #[derive(EventToken, Debug)]
532        enum Token {
533            ReqIrq,
534            WakeUp,
535            AcpiNotifyEvent,
536            Kill,
537            MsixIrqi { index: usize },
538        }
539
540        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
541            (&req_irq_evt, Token::ReqIrq),
542            (&wakeup_evt, Token::WakeUp),
543            (&acpi_notify_evt, Token::AcpiNotifyEvent),
544            (&kill_evt, Token::Kill),
545        ]) {
546            Ok(pc) => pc,
547            Err(e) => {
548                error!(
549                    "{} failed creating vfio WaitContext: {}",
550                    self.name.clone(),
551                    e
552                );
553                return;
554            }
555        };
556
557        for (index, msix_int) in msix_evt.iter().enumerate() {
558            wait_ctx
559                .add(msix_int, Token::MsixIrqi { index })
560                .expect("Failed to create vfio WaitContext for msix interrupt event")
561        }
562
563        'wait: loop {
564            let events = match wait_ctx.wait() {
565                Ok(v) => v,
566                Err(e) => {
567                    error!("{} failed polling vfio events: {}", self.name.clone(), e);
568                    break;
569                }
570            };
571
572            for event in events.iter().filter(|e| e.is_readable) {
573                match event.token {
574                    Token::MsixIrqi { index } => {
575                        if let Some(msix_cap) = &self.msix_cap {
576                            msix_cap.lock().trigger(index);
577                        }
578                    }
579                    Token::ReqIrq => {
580                        let device = HotPlugDeviceInfo {
581                            device_type: HotPlugDeviceType::EndPoint,
582                            path: self.sysfs_path.clone(),
583                            hp_interrupt: false,
584                        };
585
586                        let request = VmRequest::HotPlugVfioCommand { device, add: false };
587                        if self.vm_socket.send(&request).is_ok() {
588                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
589                                error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
590                            } else {
591                                break 'wait;
592                            }
593                        }
594                    }
595                    Token::WakeUp => {
596                        let _ = wakeup_evt.wait();
597
598                        if *is_in_low_power.lock() {
599                            if let Some(pm_cap) = &self.pm_cap {
600                                if pm_cap.lock().should_trigger_pme() {
601                                    let request =
602                                        VmRequest::PciPme(self.address.pme_requester_id());
603                                    if self.vm_socket.send(&request).is_ok() {
604                                        if let Err(e) = self.vm_socket.recv::<VmResponse>() {
605                                            error!(
606                                                "{} failed to send PME: {}",
607                                                self.name.clone(),
608                                                e
609                                            );
610                                        }
611                                    }
612                                }
613                            }
614                        }
615                    }
616                    Token::AcpiNotifyEvent => {
617                        if let Some(gpe) = gpe {
618                            if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
619                                notification_val.lock().push(val as u32);
620                                let request = VmRequest::Gpe {
621                                    gpe,
622                                    clear_evt: None,
623                                };
624                                if self.vm_socket.send(&request).is_ok() {
625                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
626                                        error!("{} failed to send GPE: {}", self.name.clone(), e);
627                                    }
628                                }
629                            } else {
630                                error!("{} failed to read acpi_notify_evt", self.name.clone());
631                            }
632                        }
633                    }
634                    Token::Kill => break 'wait,
635                }
636            }
637        }
638    }
639}
640
641fn get_next_from_extcap_header(cap_header: u32) -> u32 {
642    (cap_header >> 20) & 0xffc
643}
644
645fn is_skipped_ext_cap(cap_id: u16) -> bool {
646    matches!(
647        cap_id,
648        // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
649        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
650    )
651}
652
653enum DeviceData {
654    IntelGfxData { opregion_index: u32 },
655}
656
657/// PCI Express Extended Capabilities information
658#[derive(Copy, Clone)]
659struct ExtCap {
660    /// cap offset in Configuration Space
661    offset: u32,
662    /// cap size
663    size: u32,
664    /// next offset, set next non-skipped offset for non-skipped ext cap
665    next: u16,
666    /// whether to be exposed to guest
667    is_skipped: bool,
668}
669
670/// Implements the Vfio Pci device, then a pci device is added into vm
671pub struct VfioPciDevice {
672    device: Arc<VfioDevice>,
673    config: VfioPciConfig,
674    hotplug: bool,
675    hotplug_bus_number: Option<u8>,
676    preferred_address: PciAddress,
677    pci_address: Option<PciAddress>,
678    interrupt_evt: Option<IrqLevelEvent>,
679    acpi_notification_evt: Option<Event>,
680    mmio_regions: Vec<PciBarConfiguration>,
681    io_regions: Vec<PciBarConfiguration>,
682    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
683    msi_cap: Option<VfioMsiCap>,
684    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
685    irq_type: Option<VfioIrqType>,
686    vm_memory_client: VmMemoryClient,
687    device_data: Option<DeviceData>,
688    pm_evt: Option<Event>,
689    is_in_low_power: Arc<Mutex<bool>>,
690    worker_thread: Option<WorkerThread<VfioPciWorker>>,
691    vm_socket_vm: Option<Tube>,
692    sysfs_path: PathBuf,
693    // PCI Express Extended Capabilities
694    ext_caps: Vec<ExtCap>,
695    vcfg_shm_mmap: Option<MemoryMapping>,
696    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
697    activated: bool,
698    acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
699    gpe: Option<u32>,
700    base_class_code: PciClassCode,
701}
702
703impl VfioPciDevice {
704    /// Constructs a new Vfio Pci device for the give Vfio device
705    pub fn new(
706        sysfs_path: &Path,
707        device: VfioDevice,
708        hotplug: bool,
709        hotplug_bus_number: Option<u8>,
710        guest_address: Option<PciAddress>,
711        vfio_device_socket_msi: Tube,
712        vfio_device_socket_msix: Tube,
713        vm_memory_client: VmMemoryClient,
714        vfio_device_socket_vm: Tube,
715    ) -> Result<Self, PciDeviceError> {
716        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
717            debug!("hotplug bus {}", bus_num);
718            PciAddress {
719                // Caller specify pcie bus number for hotplug device
720                bus: bus_num,
721                // devfn should be 0, otherwise pcie root port couldn't detect it
722                dev: 0,
723                func: 0,
724            }
725        } else if let Some(guest_address) = guest_address {
726            debug!("guest PCI address {}", guest_address);
727            guest_address
728        } else {
729            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
730                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
731            })?;
732            debug!("parsed device PCI address {}", addr);
733            addr
734        };
735
736        let dev = Arc::new(device);
737        let config = VfioPciConfig::new(Arc::clone(&dev));
738        let mut msi_socket = Some(vfio_device_socket_msi);
739        let mut msix_socket = Some(vfio_device_socket_msix);
740        let mut msi_cap: Option<VfioMsiCap> = None;
741        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
742        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;
743
744        let mut is_pcie = false;
745        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
746        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
747        let device_id: u16 = config.read_config(PCI_DEVICE_ID);
748        let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
749            .unwrap_or(PciClassCode::Other);
750
751        let pci_id = PciId::new(vendor_id, device_id);
752
753        while cap_next != 0 {
754            let cap_id: u8 = config.read_config(cap_next);
755            if cap_id == PCI_CAP_ID_PM {
756                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
757            } else if cap_id == PCI_CAP_ID_MSI {
758                if let Some(msi_socket) = msi_socket.take() {
759                    msi_cap = Some(VfioMsiCap::new(
760                        &config,
761                        cap_next,
762                        msi_socket,
763                        pci_id.into(),
764                        dev.device_name().to_string(),
765                    ));
766                }
767            } else if cap_id == PCI_CAP_ID_MSIX {
768                if let Some(msix_socket) = msix_socket.take() {
769                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
770                        &config,
771                        cap_next,
772                        msix_socket,
773                        pci_id.into(),
774                        dev.device_name().to_string(),
775                    ))));
776                }
777            } else if cap_id == PciCapabilityID::PciExpress as u8 {
778                is_pcie = true;
779            }
780            let offset = cap_next + PCI_MSI_NEXT_POINTER;
781            cap_next = config.read_config::<u8>(offset).into();
782        }
783
784        let mut ext_caps: Vec<ExtCap> = Vec::new();
785        if is_pcie {
786            let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
787            while ext_cap_next != 0 {
788                let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
789                if ext_cap_config == 0 {
790                    break;
791                }
792                ext_caps.push(ExtCap {
793                    offset: ext_cap_next,
794                    // Calculate the size later
795                    size: 0,
796                    // init as the real value
797                    next: get_next_from_extcap_header(ext_cap_config) as u16,
798                    is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
799                });
800                ext_cap_next = get_next_from_extcap_header(ext_cap_config);
801            }
802
803            // Manage extended caps
804            //
805            // Extended capabilities are chained with each pointing to the next, so
806            // we can drop anything other than the head of the chain simply by
807            // modifying the previous next pointer. For the head of the chain, we
808            // can modify the capability ID to something that cannot match a valid
809            // capability. ID PCI_EXT_CAP_ID_CAC is for this since it is no longer
810            // supported.
811            //
812            // reverse order by offset
813            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
814            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
815            let mut non_skipped_next: u16 = 0;
816            for ext_cap in ext_caps.iter_mut() {
817                if !ext_cap.is_skipped {
818                    ext_cap.next = non_skipped_next;
819                    non_skipped_next = ext_cap.offset as u16;
820                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
821                    ext_cap.next = non_skipped_next;
822                }
823                ext_cap.size = next_offset - ext_cap.offset;
824                next_offset = ext_cap.offset;
825            }
826            // order by offset
827            ext_caps.reverse();
828        }
829
830        let is_intel_gfx =
831            base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
832        let device_data = if is_intel_gfx {
833            Some(DeviceData::IntelGfxData {
834                opregion_index: u32::MAX,
835            })
836        } else {
837            None
838        };
839
840        Ok(VfioPciDevice {
841            device: dev,
842            config,
843            hotplug,
844            hotplug_bus_number,
845            preferred_address,
846            pci_address: None,
847            interrupt_evt: None,
848            acpi_notification_evt: None,
849            mmio_regions: Vec::new(),
850            io_regions: Vec::new(),
851            pm_cap,
852            msi_cap,
853            msix_cap,
854            irq_type: None,
855            vm_memory_client,
856            device_data,
857            pm_evt: None,
858            is_in_low_power: Arc::new(Mutex::new(false)),
859            worker_thread: None,
860            vm_socket_vm: Some(vfio_device_socket_vm),
861            sysfs_path: sysfs_path.to_path_buf(),
862            ext_caps,
863            vcfg_shm_mmap: None,
864            mapped_mmio_bars: BTreeMap::new(),
865            activated: false,
866            acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
867            gpe: None,
868            base_class_code,
869        })
870    }
871
872    /// Gets the pci address of the device, if one has already been allocated.
873    pub fn pci_address(&self) -> Option<PciAddress> {
874        self.pci_address
875    }
876
877    pub fn is_gfx(&self) -> bool {
878        self.base_class_code == PciClassCode::DisplayController
879    }
880
881    fn is_intel_gfx(&self) -> bool {
882        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
883    }
884
885    fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
886        if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
887            return self
888                .device
889                .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
890                .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
891        }
892        Err(PciDeviceError::AcpiNotifySetupFailed)
893    }
894
895    #[allow(dead_code)]
896    fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
897        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
898            return self
899                .device
900                .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
901                .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
902        }
903        Err(PciDeviceError::AcpiNotifyDeactivationFailed)
904    }
905
906    #[allow(dead_code)]
907    fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
908        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
909            return self
910                .device
911                .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
912                .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
913        }
914        Err(PciDeviceError::AcpiNotifyTestFailed)
915    }
916
917    fn enable_intx(&mut self) {
918        if let Some(ref interrupt_evt) = self.interrupt_evt {
919            if let Err(e) = self.device.irq_enable(
920                &[Some(interrupt_evt.get_trigger())],
921                VFIO_PCI_INTX_IRQ_INDEX,
922                0,
923            ) {
924                error!("{} Intx enable failed: {}", self.debug_label(), e);
925                return;
926            }
927            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
928                error!("{} Intx mask failed: {}", self.debug_label(), e);
929                self.disable_intx();
930                return;
931            }
932            if let Err(e) = self
933                .device
934                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
935            {
936                error!("{} resample enable failed: {}", self.debug_label(), e);
937                self.disable_intx();
938                return;
939            }
940            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
941                error!("{} Intx unmask failed: {}", self.debug_label(), e);
942                self.disable_intx();
943                return;
944            }
945            self.irq_type = Some(VfioIrqType::Intx);
946        }
947    }
948
949    fn disable_intx(&mut self) {
950        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
951            error!("{} Intx disable failed: {}", self.debug_label(), e);
952        }
953        self.irq_type = None;
954    }
955
956    fn disable_irqs(&mut self) {
957        match self.irq_type {
958            Some(VfioIrqType::Msi) => self.disable_msi(),
959            Some(VfioIrqType::Msix) => self.disable_msix(),
960            _ => (),
961        }
962
963        // Above disable_msi() or disable_msix() will enable intx again.
964        // so disable_intx here again.
965        if let Some(VfioIrqType::Intx) = self.irq_type {
966            self.disable_intx();
967        }
968    }
969
970    fn enable_msi(&mut self) {
971        self.disable_irqs();
972
973        let irqfd = match &self.msi_cap {
974            Some(cap) => {
975                if let Some(fd) = cap.get_msi_irqfd() {
976                    fd
977                } else {
978                    self.enable_intx();
979                    return;
980                }
981            }
982            None => {
983                self.enable_intx();
984                return;
985            }
986        };
987
988        if let Err(e) = self
989            .device
990            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
991        {
992            error!("{} failed to enable msi: {}", self.debug_label(), e);
993            self.enable_intx();
994            return;
995        }
996
997        self.irq_type = Some(VfioIrqType::Msi);
998    }
999
1000    fn disable_msi(&mut self) {
1001        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
1002            error!("{} failed to disable msi: {}", self.debug_label(), e);
1003            return;
1004        }
1005        self.irq_type = None;
1006
1007        self.enable_intx();
1008    }
1009
1010    fn enable_msix(&mut self) {
1011        if self.msix_cap.is_none() {
1012            return;
1013        }
1014
1015        self.disable_irqs();
1016        let cap = self.msix_cap.as_ref().unwrap().lock();
1017        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());
1018
1019        let mut failed = false;
1020        if !vector_in_use {
1021            // If there are no msix vectors currently in use, we explicitly assign a new eventfd
1022            // to vector 0. Then we enable it and immediately disable it, so that vfio will
1023            // activate physical device. If there are available msix vectors, just enable them
1024            // instead.
1025            let fd = Event::new().expect("failed to create event");
1026            let table_size = cap.table_size();
1027            let mut irqfds = vec![None; table_size];
1028            irqfds[0] = Some(&fd);
1029            for fd in irqfds.iter_mut().skip(1) {
1030                *fd = None;
1031            }
1032            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1033                error!("{} failed to enable msix: {}", self.debug_label(), e);
1034                failed = true;
1035            }
1036            irqfds[0] = None;
1037            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1038                error!("{} failed to enable msix: {}", self.debug_label(), e);
1039                failed = true;
1040            }
1041        } else {
1042            let result = self
1043                .device
1044                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
1045            if let Err(e) = result {
1046                error!("{} failed to enable msix: {}", self.debug_label(), e);
1047                failed = true;
1048            }
1049        }
1050
1051        std::mem::drop(cap);
1052        if failed {
1053            self.enable_intx();
1054            return;
1055        }
1056        self.irq_type = Some(VfioIrqType::Msix);
1057    }
1058
1059    fn disable_msix(&mut self) {
1060        if self.msix_cap.is_none() {
1061            return;
1062        }
1063        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
1064            error!("{} failed to disable msix: {}", self.debug_label(), e);
1065            return;
1066        }
1067        self.irq_type = None;
1068        self.enable_intx();
1069    }
1070
1071    fn msix_vectors_update(&self) -> Result<(), VfioError> {
1072        if let Some(cap) = &self.msix_cap {
1073            self.device
1074                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
1075        }
1076        Ok(())
1077    }
1078
1079    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
1080        if let Err(e) = self
1081            .device
1082            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
1083        {
1084            error!(
1085                "{} failed to update msix vector {}: {}",
1086                self.debug_label(),
1087                index,
1088                e
1089            );
1090        }
1091    }
1092
1093    fn adjust_bar_mmap(
1094        &self,
1095        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1096        remove_mmaps: &[AddressRange],
1097    ) -> Vec<vfio_region_sparse_mmap_area> {
1098        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
1099        let pgmask = (pagesize() as u64) - 1;
1100
1101        for mmap in bar_mmaps.iter() {
1102            let mmap_range = if let Some(mmap_range) =
1103                AddressRange::from_start_and_size(mmap.offset, mmap.size)
1104            {
1105                mmap_range
1106            } else {
1107                continue;
1108            };
1109            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
1110                Ok(a) => a,
1111                Err(e) => {
1112                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1113                    mmaps.clear();
1114                    return mmaps;
1115                }
1116            };
1117
1118            for &(mut remove_range) in remove_mmaps.iter() {
1119                remove_range = remove_range.intersect(mmap_range);
1120                if !remove_range.is_empty() {
1121                    // align offsets to page size
1122                    let begin = remove_range.start & !pgmask;
1123                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
1124                    let remove_range = AddressRange::from_start_and_end(begin, end);
1125                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
1126                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1127                    }
1128                }
1129            }
1130
1131            for mmap in to_mmap.regions {
1132                mmaps.push(vfio_region_sparse_mmap_area {
1133                    offset: mmap.start,
1134                    size: mmap.end - mmap.start + 1,
1135                });
1136            }
1137        }
1138
1139        mmaps
1140    }
1141
1142    fn remove_bar_mmap_msix(
1143        &self,
1144        bar_index: PciBarIndex,
1145        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1146    ) -> Vec<vfio_region_sparse_mmap_area> {
1147        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
1148        let mut msix_regions = Vec::new();
1149
1150        if let Some(t) = msix_cap.get_msix_table(bar_index) {
1151            msix_regions.push(t);
1152        }
1153        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
1154            msix_regions.push(p);
1155        }
1156
1157        if msix_regions.is_empty() {
1158            return bar_mmaps;
1159        }
1160
1161        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
1162    }
1163
1164    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
1165        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
1166        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1167            // the bar storing msix table and pba couldn't mmap.
1168            // these bars should be trapped, so that msix could be emulated.
1169            let mut mmaps = self.device.get_region_mmap(index);
1170
1171            if self.msix_cap.is_some() && !self.device.get_region_msix_mmappable(index) {
1172                mmaps = self.remove_bar_mmap_msix(index, mmaps);
1173            }
1174            if mmaps.is_empty() {
1175                return mmaps_ids;
1176            }
1177
1178            for mmap in mmaps.iter() {
1179                let mmap_offset = mmap.offset;
1180                let mmap_size = mmap.size;
1181                let guest_map_start = bar_addr + mmap_offset;
1182                let region_offset = self.device.get_region_offset(index);
1183                let offset = region_offset + mmap_offset;
1184                let descriptor = match self.device.device_file().try_clone() {
1185                    Ok(device_file) => device_file.into(),
1186                    Err(_) => break,
1187                };
1188                match self.vm_memory_client.register_memory(
1189                    VmMemorySource::Descriptor {
1190                        descriptor,
1191                        offset,
1192                        size: mmap_size,
1193                    },
1194                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
1195                    Protection::read_write(),
1196                    MemCacheType::CacheCoherent,
1197                ) {
1198                    Ok(id) => {
1199                        mmaps_ids.push(id);
1200                    }
1201                    Err(e) => {
1202                        error!("register_memory failed: {}", e);
1203                        break;
1204                    }
1205                }
1206            }
1207        }
1208
1209        mmaps_ids
1210    }
1211
1212    fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
1213        for mmap_id in mmap_ids {
1214            if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
1215                error!("unregister_memory failed: {}", e);
1216            }
1217        }
1218    }
1219
1220    fn disable_bars_mmap(&mut self) {
1221        for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
1222            self.remove_bar_mmap(mmap_ids);
1223        }
1224        self.mapped_mmio_bars.clear();
1225    }
1226
1227    fn commit_bars_mmap(&mut self) {
1228        // Unmap all bars before remapping bars, to prevent issues with overlap
1229        let mut needs_map = Vec::new();
1230        for mmio_info in self.mmio_regions.iter() {
1231            let bar_idx = mmio_info.bar_index();
1232            let addr = mmio_info.address();
1233
1234            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
1235                if cur_addr == addr {
1236                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
1237                    continue;
1238                } else {
1239                    self.remove_bar_mmap(&ids);
1240                }
1241            }
1242
1243            if addr != 0 {
1244                needs_map.push((bar_idx, addr));
1245            }
1246        }
1247
1248        for (bar_idx, addr) in needs_map.iter() {
1249            let ids = self.add_bar_mmap(*bar_idx, *addr);
1250            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
1251        }
1252    }
1253
1254    fn close(&mut self) {
1255        if let Some(msi) = self.msi_cap.as_mut() {
1256            msi.destroy();
1257        }
1258        if let Some(msix) = &self.msix_cap {
1259            msix.lock().destroy();
1260        }
1261        self.disable_bars_mmap();
1262        self.device.close();
1263    }
1264
1265    fn start_work_thread(&mut self) {
1266        let vm_socket = match self.vm_socket_vm.take() {
1267            Some(socket) => socket,
1268            None => return,
1269        };
1270
1271        let req_evt = match Event::new() {
1272            Ok(evt) => {
1273                if let Err(e) = self
1274                    .device
1275                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
1276                {
1277                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
1278                    return;
1279                }
1280                evt
1281            }
1282            Err(_) => return,
1283        };
1284
1285        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
1286            Ok(v) => v,
1287            Err(e) => {
1288                error!(
1289                    "{} failed creating PM Event pair: {}",
1290                    self.debug_label(),
1291                    e
1292                );
1293                return;
1294            }
1295        };
1296        self.pm_evt = Some(self_pm_evt);
1297
1298        let (self_acpi_notify_evt, acpi_notify_evt) =
1299            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
1300                Ok(v) => v,
1301                Err(e) => {
1302                    error!(
1303                        "{} failed creating ACPI Event pair: {}",
1304                        self.debug_label(),
1305                        e
1306                    );
1307                    return;
1308                }
1309            };
1310        self.acpi_notification_evt = Some(self_acpi_notify_evt);
1311
1312        if let Err(e) = self.enable_acpi_notification() {
1313            error!("{}: {}", self.debug_label(), e);
1314        }
1315
1316        let mut msix_evt = Vec::new();
1317        if let Some(msix_cap) = &self.msix_cap {
1318            msix_evt = msix_cap.lock().clone_msix_evt();
1319        }
1320
1321        let name = self.device.device_name().to_string();
1322        let address = self.pci_address.expect("Unassigned PCI Address.");
1323        let sysfs_path = self.sysfs_path.clone();
1324        let pm_cap = self.pm_cap.clone();
1325        let msix_cap = self.msix_cap.clone();
1326        let is_in_low_power = self.is_in_low_power.clone();
1327        let gpe_nr = self.gpe;
1328        let notification_val = self.acpi_notifier_val.clone();
1329        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
1330            let mut worker = VfioPciWorker {
1331                address,
1332                sysfs_path,
1333                vm_socket,
1334                name,
1335                pm_cap,
1336                msix_cap,
1337            };
1338            worker.run(
1339                req_evt,
1340                pm_evt,
1341                acpi_notify_evt,
1342                kill_evt,
1343                msix_evt,
1344                is_in_low_power,
1345                gpe_nr,
1346                notification_val,
1347            );
1348            worker
1349        }));
1350        self.activated = true;
1351    }
1352
1353    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
1354        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
1355        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();
1356
1357        while i <= VFIO_PCI_ROM_REGION_INDEX {
1358            let mut low: u32 = 0xffffffff;
1359            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
1360                0x30
1361            } else {
1362                0x10 + i * 4
1363            };
1364            self.config.write_config(low, offset);
1365            low = self.config.read_config(offset);
1366
1367            let low_flag = low & 0xf;
1368            let is_64bit = low_flag & 0x4 == 0x4;
1369            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
1370                let mut upper: u32 = 0xffffffff;
1371                if is_64bit {
1372                    self.config.write_config(upper, offset + 4);
1373                    upper = self.config.read_config(offset + 4);
1374                }
1375
1376                low &= 0xffff_fff0;
1377                let mut size: u64 = u64::from(upper);
1378                size <<= 32;
1379                size |= u64::from(low);
1380                size = !size + 1;
1381                let region_type = if is_64bit {
1382                    PciBarRegionType::Memory64BitRegion
1383                } else {
1384                    PciBarRegionType::Memory32BitRegion
1385                };
1386                let prefetch = if low_flag & 0x8 == 0x8 {
1387                    PciBarPrefetchable::Prefetchable
1388                } else {
1389                    PciBarPrefetchable::NotPrefetchable
1390                };
1391                mem_bars.push(PciBarConfiguration::new(
1392                    i as usize,
1393                    size,
1394                    region_type,
1395                    prefetch,
1396                ));
1397            } else if low_flag & 0x1 == 0x1 {
1398                let size = !(low & 0xffff_fffc) + 1;
1399                self.io_regions.push(PciBarConfiguration::new(
1400                    i as usize,
1401                    size.into(),
1402                    PciBarRegionType::IoRegion,
1403                    PciBarPrefetchable::NotPrefetchable,
1404                ));
1405            }
1406
1407            if is_64bit {
1408                i += 2;
1409            } else {
1410                i += 1;
1411            }
1412        }
1413        mem_bars
1414    }
1415
1416    fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1417        let offset: u32 = bar_info.reg_index() as u32 * 4;
1418        let mmio_region = *bar_info;
1419        self.mmio_regions.push(mmio_region.set_address(bar_addr));
1420
1421        let val: u32 = self.config.read_config(offset);
1422        let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1423        self.config.write_config(low, offset);
1424        if bar_info.is_64bit_memory() {
1425            let upper = (bar_addr >> 32) as u32;
1426            self.config.write_config(upper, offset + 4);
1427        }
1428    }
1429
1430    fn allocate_root_barmem(
1431        &mut self,
1432        mem_bars: &[PciBarConfiguration],
1433        resources: &mut SystemAllocator,
1434    ) -> Result<Vec<BarRange>, PciDeviceError> {
1435        let address = self.pci_address.unwrap();
1436        let mut ranges: Vec<BarRange> = Vec::new();
1437        for mem_bar in mem_bars {
1438            let bar_size = mem_bar.size();
1439            let mut bar_addr: u64 = 0;
1440            // Don't allocate mmio for hotplug device, OS will allocate it from
1441            // its parent's bridge window.
1442            if !self.hotplug {
1443                bar_addr = resources
1444                    .allocate_mmio(
1445                        bar_size,
1446                        Alloc::PciBar {
1447                            bus: address.bus,
1448                            dev: address.dev,
1449                            func: address.func,
1450                            bar: mem_bar.bar_index() as u8,
1451                        },
1452                        "vfio_bar".to_string(),
1453                        AllocOptions::new()
1454                            .prefetchable(mem_bar.is_prefetchable())
1455                            .max_address(if mem_bar.is_64bit_memory() {
1456                                u64::MAX
1457                            } else {
1458                                u32::MAX.into()
1459                            })
1460                            .align(bar_size),
1461                    )
1462                    .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1463                ranges.push(BarRange {
1464                    addr: bar_addr,
1465                    size: bar_size,
1466                    prefetchable: mem_bar.is_prefetchable(),
1467                });
1468            }
1469            self.configure_barmem(mem_bar, bar_addr);
1470        }
1471        Ok(ranges)
1472    }
1473
1474    fn allocate_nonroot_barmem(
1475        &mut self,
1476        mem_bars: &mut [PciBarConfiguration],
1477        resources: &mut SystemAllocator,
1478    ) -> Result<Vec<BarRange>, PciDeviceError> {
1479        const NON_PREFETCHABLE: usize = 0;
1480        const PREFETCHABLE: usize = 1;
1481        const ARRAY_SIZE: usize = 2;
1482        let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1483        let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1484            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1485                Ok(a) => a,
1486                Err(e) => {
1487                    error!(
1488                        "{} init nonroot VfioResourceAllocator failed: {}",
1489                        self.debug_label(),
1490                        e
1491                    );
1492                    return Err(e);
1493                }
1494            },
1495            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1496                Ok(a) => a,
1497                Err(e) => {
1498                    error!(
1499                        "{} init nonroot VfioResourceAllocator failed: {}",
1500                        self.debug_label(),
1501                        e
1502                    );
1503                    return Err(e);
1504                }
1505            },
1506        ];
1507        let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1508        // the window must be 1M-aligned as per the PCI spec
1509        let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1510        let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1511
1512        // Sort the BARs in descending order of size; this can reduce the total allocated size.
1513        mem_bars.sort_by_key(|a| Reverse(a.size()));
1514        for mem_bar in mem_bars {
1515            let prefetchable = mem_bar.is_prefetchable();
1516            let is_64bit = mem_bar.is_64bit_memory();
1517
1518            // If any prefetchable BAR is 32-bit, all prefetchable BARs must go in low MMIO,
1519            // since all of the prefetchable BARs share a single bridge window.
1520            if prefetchable && !is_64bit {
1521                memtype[PREFETCHABLE] = MmioType::Low;
1522            }
1523            let i = if prefetchable {
1524                PREFETCHABLE
1525            } else {
1526                NON_PREFETCHABLE
1527            };
1528            let bar_size = mem_bar.size();
1529            let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1530                Ok(s) => s,
1531                Err(e) => {
1532                    error!(
1533                        "{} nonroot allocate_with_align failed: {}",
1534                        self.debug_label(),
1535                        e
1536                    );
1537                    return Err(e);
1538                }
1539            };
1540            window_sz[i] = max(window_sz[i], start + bar_size);
1541            alignment[i] = max(alignment[i], bar_size);
1542            let mem_info = (*mem_bar).set_address(start);
1543            membars[i].push(mem_info);
1544        }
1545
1546        let address = self.pci_address.unwrap();
1547        let mut ranges: Vec<BarRange> = Vec::new();
1548        for (index, bars) in membars.iter().enumerate() {
1549            if bars.is_empty() {
1550                continue;
1551            }
1552
1553            let i = if index == 1 {
1554                PREFETCHABLE
1555            } else {
1556                NON_PREFETCHABLE
1557            };
1558            let mut window_addr: u64 = 0;
1559            // Don't allocate MMIO for a hotplug device; the OS will allocate it from
1560            // its parent bridge's window.
1561            if !self.hotplug {
1562                window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1563                let alloc = if i == NON_PREFETCHABLE {
1564                    Alloc::PciBridgeWindow {
1565                        bus: address.bus,
1566                        dev: address.dev,
1567                        func: address.func,
1568                    }
1569                } else {
1570                    Alloc::PciBridgePrefetchWindow {
1571                        bus: address.bus,
1572                        dev: address.dev,
1573                        func: address.func,
1574                    }
1575                };
1576                window_addr = resources
1577                    .mmio_allocator(memtype[i])
1578                    .allocate_with_align(
1579                        window_sz[i],
1580                        alloc,
1581                        "vfio_bar_window".to_string(),
1582                        alignment[i],
1583                    )
1584                    .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1585                for mem_info in bars {
1586                    let bar_addr = window_addr + mem_info.address();
1587                    ranges.push(BarRange {
1588                        addr: bar_addr,
1589                        size: mem_info.size(),
1590                        prefetchable: mem_info.is_prefetchable(),
1591                    });
1592                }
1593            }
1594
1595            for mem_info in bars {
1596                let bar_addr = window_addr + mem_info.address();
1597                self.configure_barmem(mem_info, bar_addr);
1598            }
1599        }
1600        Ok(ranges)
1601    }
1602
1603    /// Returns the maximum IOVA address supported by the VFIO PCI device.
1604    pub fn get_max_iova(&self) -> u64 {
1605        self.device.get_max_addr()
1606    }
1607
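    /// Returns the extended capability, if any, whose config space range contains `reg`.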
1608    fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1609        self.ext_caps
1610            .iter()
1611            .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1612            .cloned()
1613    }
1614
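    /// Returns true if `reg` falls within an extended capability that is hidden from the guest.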
1615    fn is_skipped_reg(&self, reg: u32) -> bool {
1616        // Fast path for the standard PCI config space.
1617        if reg < PCI_CONFIG_SPACE_SIZE {
1618            return false;
1619        }
1620
1621        self.get_ext_cap_by_reg(reg)
1622            .is_some_and(|cap| cap.is_skipped)
1623    }
1624}
1625
1626impl PciDevice for VfioPciDevice {
1627    fn debug_label(&self) -> String {
1628        format!("vfio {} device", self.device.device_name())
1629    }
1630
1631    fn preferred_address(&self) -> Option<PciAddress> {
1632        Some(self.preferred_address)
1633    }
1634
1635    fn allocate_address(
1636        &mut self,
1637        resources: &mut SystemAllocator,
1638    ) -> Result<PciAddress, PciDeviceError> {
1639        if self.pci_address.is_none() {
1640            let mut address = self.preferred_address;
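            // If the device is not on a hotplug bus, give up after the first attempt; on a
            // hotplug bus, try subsequent function numbers until one can be reserved.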
1641            while address.func < 8 {
1642                if resources.reserve_pci(address, self.debug_label()) {
1643                    self.pci_address = Some(address);
1644                    break;
1645                } else if self.hotplug_bus_number.is_none() {
1646                    break;
1647                } else {
1648                    address.func += 1;
1649                }
1650            }
1651            if let Some(msi_cap) = &mut self.msi_cap {
1652                msi_cap.config.set_pci_address(self.pci_address.unwrap());
1653            }
1654            if let Some(msix_cap) = &mut self.msix_cap {
1655                msix_cap
1656                    .lock()
1657                    .config
1658                    .set_pci_address(self.pci_address.unwrap());
1659            }
1660        }
1661        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1662    }
1663
1664    fn keep_rds(&self) -> Vec<RawDescriptor> {
1665        let mut rds = self.device.keep_rds();
1666        if let Some(ref interrupt_evt) = self.interrupt_evt {
1667            rds.extend(interrupt_evt.as_raw_descriptors());
1668        }
1669        rds.push(self.vm_memory_client.as_raw_descriptor());
1670        if let Some(vm_socket_vm) = &self.vm_socket_vm {
1671            rds.push(vm_socket_vm.as_raw_descriptor());
1672        }
1673        if let Some(msi_cap) = &self.msi_cap {
1674            rds.push(msi_cap.config.get_msi_socket());
1675        }
1676        if let Some(msix_cap) = &self.msix_cap {
1677            rds.extend(msix_cap.lock().as_raw_descriptors());
1678        }
1679        rds
1680    }
1681
1682    fn preferred_irq(&self) -> PreferredIrq {
1683        // Do not use a fixed IRQ for VFIO devices. The sysfs "irq" file reports a host-assigned IRQ
1684        // number that is not meaningful for the guest and can exceed the u8 range required by the
1685        // MP table. Let the VMM allocate a guest IRQ instead; VFIO handles host/guest
1686        // interrupt mapping regardless of the guest IRQ number chosen.
1687        PreferredIrq::Any
1688    }
1689
1690    fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1691        // Keep event/resample event references.
1692        self.interrupt_evt = Some(irq_evt);
1693
1694        // enable INTX
1695        self.enable_intx();
1696
1697        self.config
1698            .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1699        self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1700    }
1701
1702    fn allocate_io_bars(
1703        &mut self,
1704        resources: &mut SystemAllocator,
1705    ) -> Result<Vec<BarRange>, PciDeviceError> {
1706        let address = self
1707            .pci_address
1708            .expect("allocate_address must be called prior to allocate_device_bars");
1709
1710        let mut mem_bars = self.collect_bars();
1711
1712        let ranges = if address.bus == 0 {
1713            self.allocate_root_barmem(&mem_bars, resources)?
1714        } else {
1715            self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1716        };
1717
1718        // Quirk: enable IGD memory for guest VGA arbitration; otherwise the kernel VGA
1719        // arbiter driver doesn't claim this VGA device and Xorg fails to start.
1720        if self.is_intel_gfx() {
1721            let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1722            cmd |= PCI_COMMAND_MEMORY;
1723            self.config.write_config(cmd, PCI_COMMAND);
1724        }
1725        Ok(ranges)
1726    }
1727
1728    fn allocate_device_bars(
1729        &mut self,
1730        resources: &mut SystemAllocator,
1731    ) -> Result<Vec<BarRange>, PciDeviceError> {
1732        let mut ranges: Vec<BarRange> = Vec::new();
1733
1734        if !self.is_intel_gfx() {
1735            return Ok(ranges);
1736        }
1737
1738        // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for it,
1739        // then write that GPA into the PCI config register.
1740        if let Some((index, size)) = self.device.get_cap_type_info(
1741            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1742            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1743        ) {
1744            let address = self
1745                .pci_address
1746                .expect("allocate_address must be called prior to allocate_device_bars");
1747            let bar_addr = resources
1748                .allocate_mmio(
1749                    size,
1750                    Alloc::PciBar {
1751                        bus: address.bus,
1752                        dev: address.dev,
1753                        func: address.func,
1754                        bar: (index * 4) as u8,
1755                    },
1756                    "vfio_bar".to_string(),
1757                    AllocOptions::new().max_address(u32::MAX.into()),
1758                )
1759                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1760            ranges.push(BarRange {
1761                addr: bar_addr,
1762                size,
1763                prefetchable: false,
1764            });
1765            self.device_data = Some(DeviceData::IntelGfxData {
1766                opregion_index: index,
1767            });
1768
1769            self.mmio_regions.push(
1770                PciBarConfiguration::new(
1771                    index as usize,
1772                    size,
1773                    PciBarRegionType::Memory32BitRegion,
1774                    PciBarPrefetchable::NotPrefetchable,
1775                )
1776                .set_address(bar_addr),
1777            );
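            // Write the OpRegion's guest address into config offset 0xFC (the IGD ASLS
            // register).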
1778            self.config.write_config(bar_addr as u32, 0xFC);
1779        }
1780
1781        Ok(ranges)
1782    }
1783
1784    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1785        for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1786            if region.bar_index() == bar_num {
1787                let command: u8 = self.config.read_config(PCI_COMMAND);
1788                if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1789                    return None;
1790                } else {
1791                    return Some(*region);
1792                }
1793            }
1794        }
1795
1796        None
1797    }
1798
1799    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
1800        Ok(())
1801    }
1802
1803    fn read_config_register(&self, reg_idx: usize) -> u32 {
1804        let reg: u32 = (reg_idx * 4) as u32;
1805        let mut config: u32 = self.config.read_config(reg);
1806
1807        // See VfioPciDevice::new for details on how extended caps are managed.
1808        if reg >= PCI_CONFIG_SPACE_SIZE {
1809            let ext_cap = self.get_ext_cap_by_reg(reg);
1810            if let Some(ext_cap) = ext_cap {
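                // The next-capability offset lives in bits 31:20 of the extended capability
                // header; replace it with the virtualized next pointer.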
1811                if ext_cap.offset == reg {
1812                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
1813                }
1814
1815                if ext_cap.is_skipped {
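                    // A hidden capability normally reads as zero, but the first one (at offset
                    // 0x100) must stay visible to keep the capability chain intact, so present
                    // it as a placeholder capability instead.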
1816                    if reg == PCI_CONFIG_SPACE_SIZE {
1817                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
1818                    } else {
1819                        config = 0;
1820                    }
1821                }
1822            }
1823        }
1824
1825        // Ignore IO bar
1826        if (0x10..=0x24).contains(&reg) {
1827            let bar_idx = (reg as usize - 0x10) / 4;
1828            if let Some(bar) = self.get_bar_configuration(bar_idx) {
1829                if bar.is_io() {
1830                    config = 0;
1831                }
1832            }
1833        } else if let Some(msix_cap) = &self.msix_cap {
1834            let msix_cap = msix_cap.lock();
1835            if msix_cap.is_msix_control_reg(reg, 4) {
1836                msix_cap.read_msix_control(&mut config);
1837            }
1838        } else if let Some(pm_cap) = &self.pm_cap {
1839            let pm_cap = pm_cap.lock();
1840            if pm_cap.is_pm_reg(reg) {
1841                config = pm_cap.read(reg);
1842            }
1843        }
1844
1845        // Quirk for Intel graphics: report a stolen memory size of 0 in pci_cfg[0x51].
1846        if self.is_intel_gfx() && reg == 0x50 {
1847            config &= 0xffff00ff;
1848        }
1849
1850        config
1851    }
1852
1853    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1854        // Start the worker thread the first time the guest writes a config register.
1855        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
1856            self.start_work_thread();
1857        };
1858
1859        let start = (reg_idx * 4) as u64 + offset;
1860
1861        if let Some(pm_cap) = self.pm_cap.as_mut() {
1862            let mut pm_cap = pm_cap.lock();
1863            if pm_cap.is_pm_reg(start as u32) {
1864                pm_cap.write(start, data);
1865            }
1866        }
1867
1868        let mut msi_change: Option<VfioMsiChange> = None;
1869        if let Some(msi_cap) = self.msi_cap.as_mut() {
1870            if msi_cap.is_msi_reg(start, data.len()) {
1871                msi_change = msi_cap.write_msi_reg(start, data);
1872            }
1873        }
1874
1875        match msi_change {
1876            Some(VfioMsiChange::Enable) => self.enable_msi(),
1877            Some(VfioMsiChange::Disable) => self.disable_msi(),
1878            _ => (),
1879        }
1880
1881        msi_change = None;
1882        if let Some(msix_cap) = &self.msix_cap {
1883            let mut msix_cap = msix_cap.lock();
1884            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
1885                msi_change = msix_cap.write_msix_control(data);
1886            }
1887        }
1888
1889        match msi_change {
1890            Some(VfioMsiChange::Enable) => self.enable_msix(),
1891            Some(VfioMsiChange::Disable) => self.disable_msix(),
1892            Some(VfioMsiChange::FunctionChanged) => {
1893                if let Err(e) = self.msix_vectors_update() {
1894                    error!("update msix vectors failed: {}", e);
1895                }
1896            }
1897            _ => (),
1898        }
1899
1900        if !self.is_skipped_reg(start as u32) {
1901            self.device
1902                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
1903        }
1904
1905        // If the guest enables memory access, commit the BAR mappings.
1906        if start == PCI_COMMAND as u64
1907            && data.len() == 2
1908            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
1909        {
1910            self.commit_bars_mmap();
1911        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
1912            let bar_idx = (start as u32 - 0x10) / 4;
1913            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
1914            let val = u32::from_le_bytes(value);
1915            let mut modify = false;
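            // Keep the cached mmio_regions in sync with the BAR addresses programmed by the
            // guest.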
1916            for region in self.mmio_regions.iter_mut() {
1917                if region.bar_index() == bar_idx as usize {
1918                    let old_addr = region.address();
1919                    let new_addr = val & 0xFFFFFFF0;
1920                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
1921                        // Change 32bit bar address
1922                        *region = region.set_address(u64::from(new_addr));
1923                        modify = true;
1924                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
1925                        // Change 64bit bar low address
1926                        *region =
1927                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
1928                        modify = true;
1929                    }
1930                    break;
1931                } else if region.is_64bit_memory()
1932                    && ((bar_idx % 2) == 1)
1933                    && (region.bar_index() + 1 == bar_idx as usize)
1934                {
1935                    // Change 64bit bar high address
1936                    let old_addr = region.address();
1937                    if val != (old_addr >> 32) as u32 {
1938                        let mut new_addr = (u64::from(val)) << 32;
1939                        new_addr |= old_addr & 0xFFFFFFFF;
1940                        *region = region.set_address(new_addr);
1941                        modify = true;
1942                    }
1943                    break;
1944                }
1945            }
1946            if modify {
1947                // If a BAR was changed while memory access is enabled, mmap the
1948                // new BAR immediately.
1949                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
1950                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
1951                    self.commit_bars_mmap();
1952                }
1953            }
1954        }
1955    }
1956
1957    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
1958        if reg_idx == PCI_VCFG_NOTY {
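            // Pop the oldest queued ACPI notification value, or return 0 if none are pending.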
1959            let mut q = self.acpi_notifier_val.lock();
1960            let mut val = 0;
1961            if !q.is_empty() {
1962                val = q.remove(0);
1963            }
1964            drop(q);
1965            return val;
1966        }
1967
1968        warn!(
1969            "{} read unsupported vcfg register {}",
1970            self.debug_label(),
1971            reg_idx
1972        );
1973        0xFFFF_FFFF
1974    }
1975
1976    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
1977        match reg_idx {
1978            PCI_VCFG_PM => {
1979                match value {
1980                    0 => {
1981                        if let Some(pm_evt) =
1982                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
1983                        {
1984                            *self.is_in_low_power.lock() = true;
1985                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
1986                        } else {
1987                            let _ = self.device.pm_low_power_enter();
1988                        }
1989                    }
1990                    _ => {
1991                        *self.is_in_low_power.lock() = false;
1992                        let _ = self.device.pm_low_power_exit();
1993                    }
1994                };
1995            }
1996            PCI_VCFG_DSM => {
1997                if let Some(shm) = &self.vcfg_shm_mmap {
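                    // The guest passes _DSM arguments through the shared memory page and reads
                    // the result back from the same page.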
1998                    let mut args = [0u8; 4096];
1999                    if let Err(e) = shm.read_slice(&mut args, 0) {
2000                        error!("failed to read DSM Args: {}", e);
2001                        return;
2002                    }
2003                    let res = match self.device.acpi_dsm(&args) {
2004                        Ok(r) => r,
2005                        Err(e) => {
2006                            error!("failed to call DSM: {}", e);
2007                            return;
2008                        }
2009                    };
2010                    if let Err(e) = shm.write_slice(&res, 0) {
2011                        error!("failed to write DSM result: {}", e);
2012                        return;
2013                    }
2014                    if let Err(e) = shm.msync() {
2015                        error!("failed to msync: {}", e)
2016                    }
2017                }
2018            }
2019            _ => warn!(
2020                "{} write unsupported vcfg register {}",
2021                self.debug_label(),
2022                reg_idx
2023            ),
2024        };
2025    }
2026
2027    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
2028        if let Some(msix_cap) = &self.msix_cap {
2029            let msix_cap = msix_cap.lock();
2030            if msix_cap.is_msix_table(bar_index, offset) {
2031                msix_cap.read_table(offset, data);
2032                return;
2033            } else if msix_cap.is_msix_pba(bar_index, offset) {
2034                msix_cap.read_pba(offset, data);
2035                return;
2036            }
2037        }
2038        self.device.region_read(bar_index, data, offset);
2039    }
2040
2041    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
2042        // Ignore writes to the IGD OpRegion.
2043        if let Some(device_data) = &self.device_data {
2044            match *device_data {
2045                DeviceData::IntelGfxData { opregion_index } => {
2046                    if opregion_index == bar_index as u32 {
2047                        return;
2048                    }
2049                }
2050            }
2051        }
2052
2053        if let Some(msix_cap) = &self.msix_cap {
2054            let mut msix_cap = msix_cap.lock();
2055            if msix_cap.is_msix_table(bar_index, offset) {
2056                let behavior = msix_cap.write_table(offset, data);
2057                if let MsixStatus::EntryChanged(index) = behavior {
2058                    let irqfd = msix_cap.get_msix_irqfd(index);
2059                    self.msix_vector_update(index, irqfd);
2060                }
2061                return;
2062            } else if msix_cap.is_msix_pba(bar_index, offset) {
2063                msix_cap.write_pba(offset, data);
2064                return;
2065            }
2066        }
2067
2068        self.device.region_write(bar_index, data, offset);
2069    }
2070
2071    fn destroy_device(&mut self) {
2072        self.close();
2073    }
2074
2075    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
2076        let mut amls = Vec::new();
2077        let mut shm = None;
2078        if let Some(pci_address) = self.pci_address {
2079            let vcfg_offset = pci_address.to_config_address(0, 13);
2080            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
2081                vcfg_register.to_aml_bytes(&mut amls);
2082                shm = vcfg_register
2083                    .create_shm_mmap()
2084                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
2085                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
2086                // All vfio-pci devices should have a virtual _PRx method; otherwise the
2087                // host can't tell whether the device has entered the suspend state and
2088                // will always consider it active, so its parent PCIe switch can never
2089                // enter suspend either.
2090                PowerResourceMethod {}.to_aml_bytes(&mut amls);
2091                // TODO: WIP: Ideally, we should generate a _DSM only if the physical
2092                // device has one; however, such information is not provided by
2093                // Linux. As a temporary workaround, we check whether there is an
2094                // associated ACPI companion device node and skip generating the guest
2095                // _DSM if there is none.
2096                let acpi_path = self.sysfs_path.join("firmware_node/path");
2097                if acpi_path.exists() {
2098                    DsmMethod {}.to_aml_bytes(&mut amls);
2099                }
2100            }
2101        }
2102
2103        (amls, shm)
2104    }
2105
2106    fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
2107        if let Some(gpe_nr) = resources.allocate_gpe() {
2108            base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
2109            self.gpe = Some(gpe_nr);
2110        }
2111        self.gpe
2112    }
2113}
2114
2115impl Suspendable for VfioPciDevice {
2116    fn sleep(&mut self) -> anyhow::Result<()> {
2117        if let Some(worker_thread) = self.worker_thread.take() {
2118            let res = worker_thread.stop();
2119            self.pci_address = Some(res.address);
2120            self.sysfs_path = res.sysfs_path;
2121            self.pm_cap = res.pm_cap;
2122            self.msix_cap = res.msix_cap;
2123            self.vm_socket_vm = Some(res.vm_socket);
2124        }
2125        Ok(())
2126    }
2127
2128    fn wake(&mut self) -> anyhow::Result<()> {
2129        if self.activated {
2130            self.start_work_thread();
2131        }
2132        Ok(())
2133    }
2134}
2135
2136#[cfg(test)]
2137mod tests {
2138    use resources::AddressRange;
2139
2140    use super::VfioResourceAllocator;
2141
2142    #[test]
2143    fn no_overlap() {
2144        // regions [32, 95]
2145        let mut memory =
2146            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2147        memory
2148            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
2149            .unwrap();
2150        memory
2151            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
2152            .unwrap();
2153
2154        let mut iter = memory.regions.iter();
2155        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
2156    }
2157
2158    #[test]
2159    fn complete_overlap() {
2160        // regions [32, 95]
2161        let mut memory =
2162            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2163        // regions [32, 47], [64, 95]
2164        memory
2165            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2166            .unwrap();
2167        // regions [64, 95]
2168        memory
2169            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
2170            .unwrap();
2171
2172        let mut iter = memory.regions.iter();
2173        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2174    }
2175
2176    #[test]
2177    fn partial_overlap_one() {
2178        // regions [32, 95]
2179        let mut memory =
2180            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2181        // regions [32, 47], [64, 95]
2182        memory
2183            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2184            .unwrap();
2185        // regions [32, 39], [64, 95]
2186        memory
2187            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
2188            .unwrap();
2189
2190        let mut iter = memory.regions.iter();
2191        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2192        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2193    }
2194
2195    #[test]
2196    fn partial_overlap_two() {
2197        // regions [32, 95]
2198        let mut memory =
2199            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2200        // regions [32, 47], [64, 95]
2201        memory
2202            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2203            .unwrap();
2204        // regions [32, 39], [72, 95]
2205        memory
2206            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
2207            .unwrap();
2208
2209        let mut iter = memory.regions.iter();
2210        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2211        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
2212    }
2213
2214    #[test]
2215    fn partial_overlap_three() {
2216        // regions [32, 95]
2217        let mut memory =
2218            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2219        // regions [32, 39], [48, 95]
2220        memory
2221            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
2222            .unwrap();
2223        // regions [32, 39], [48, 63], [72, 95]
2224        memory
2225            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
2226            .unwrap();
2227        // regions [32, 35], [76, 95]
2228        memory
2229            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
2230            .unwrap();
2231
2232        let mut iter = memory.regions.iter();
2233        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
2234        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
2235    }
2236}